diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py index 4d2f61139a..931297c95e 100644 --- a/api/core/rag/extractor/excel_extractor.py +++ b/api/core/rag/extractor/excel_extractor.py @@ -2,7 +2,6 @@ from typing import Optional import pandas as pd -import xlrd from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -28,61 +27,19 @@ class ExcelExtractor(BaseExtractor): self._autodetect_encoding = autodetect_encoding def extract(self) -> list[Document]: - """ parse excel file""" - if self._file_path.endswith('.xls'): - return self._extract4xls() - elif self._file_path.endswith('.xlsx'): - return self._extract4xlsx() - - def _extract4xls(self) -> list[Document]: - wb = xlrd.open_workbook(filename=self._file_path) + """ Load from Excel file in xls or xlsx format using Pandas.""" documents = [] - # loop over all sheets - for sheet in wb.sheets(): - row_header = None - for row_index, row in enumerate(sheet.get_rows(), start=1): - if self.is_blank_row(row): - continue - if row_header is None: - row_header = row - continue - item_arr = [] - for index, cell in enumerate(row): - txt_value = str(cell.value) - item_arr.append(f'"{row_header[index].value}":"{txt_value}"') - item_str = ",".join(item_arr) - document = Document(page_content=item_str, metadata={'source': self._file_path}) - documents.append(document) - return documents - - def _extract4xlsx(self) -> list[Document]: - """Load from file path using Pandas.""" - data = [] # Read each worksheet of an Excel file using Pandas - xls = pd.ExcelFile(self._file_path) - for sheet_name in xls.sheet_names: - df = pd.read_excel(xls, sheet_name=sheet_name) + excel_file = pd.ExcelFile(self._file_path) + for sheet_name in excel_file.sheet_names: + df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name) # filter out rows with all NaN values df.dropna(how='all', inplace=True) # transform each row into a Document - for _, row in df.iterrows(): - item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)) - document = Document(page_content=item, metadata={'source': self._file_path}) - data.append(document) - return data + documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)), + metadata={'source': self._file_path}, + ) for _, row in df.iterrows()] - @staticmethod - def is_blank_row(row): - """ - - Determine whether the specified line is a blank line. - :param row: row object。 - :return: Returns True if the row is blank, False otherwise. - """ - # Iterates through the cells and returns False if a non-empty cell is found - for cell in row: - if cell.value is not None and cell.value != '': - return False - return True + return documents