improve: unify parsing of Excel files in both xls and xlsx formats using Pandas (#4965)

2024-11-16 11:42:29 +08:00 · 2024-06-20 16:14:49 +08:00 · 2024-06-20 16:14:49 +08:00 · 39c14ec7c1
commit 39c14ec7c1
parent 0d20df9a51
1 changed file with 8 additions and 51 deletions
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -2,7 +2,6 @@
 from typing import Optional

 import pandas as pd
-import xlrd

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -28,61 +27,19 @@ class ExcelExtractor(BaseExtractor):
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
-        """ parse excel file"""
-        if self._file_path.endswith('.xls'):
-            return self._extract4xls()
-        elif self._file_path.endswith('.xlsx'):
-            return self._extract4xlsx()
-
-    def _extract4xls(self) -> list[Document]:
-        wb = xlrd.open_workbook(filename=self._file_path)
+        """ Load from Excel file in xls or xlsx format using Pandas."""
        documents = []
-        # loop over all sheets
-        for sheet in wb.sheets():
-            row_header = None
-            for row_index, row in enumerate(sheet.get_rows(), start=1):                
-                if self.is_blank_row(row):
-                    continue
-                if row_header is None:
-                    row_header = row
-                    continue
-                item_arr = []
-                for index, cell in enumerate(row):
-                    txt_value = str(cell.value)
-                    item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
-                item_str = ",".join(item_arr)
-                document = Document(page_content=item_str, metadata={'source': self._file_path})
-                documents.append(document)
-        return documents
-
-    def _extract4xlsx(self) -> list[Document]:
-        """Load from file path using Pandas."""
-        data = []
        # Read each worksheet of an Excel file using Pandas
-        xls = pd.ExcelFile(self._file_path)
-        for sheet_name in xls.sheet_names:
-            df = pd.read_excel(xls, sheet_name=sheet_name)
+        excel_file = pd.ExcelFile(self._file_path)
+        for sheet_name in excel_file.sheet_names:
+            df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)

            # filter out rows with all NaN values
            df.dropna(how='all', inplace=True)

            # transform each row into a Document
-            for _, row in df.iterrows():
-                item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
-                document = Document(page_content=item, metadata={'source': self._file_path})
-                data.append(document)
-        return data
+            documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
+                                   metadata={'source': self._file_path},
+                                   ) for _, row in df.iterrows()]

-    @staticmethod
-    def is_blank_row(row):
-        """
-
-        Determine whether the specified line is a blank line.
-        :param row: row object。
-        :return: Returns True if the row is blank, False otherwise.
-        """
-        # Iterates through the cells and returns False if a non-empty cell is found
-        for cell in row:
-            if cell.value is not None and cell.value != '':
-                return False
-        return True
+        return documents