Fix some RAG bugs (#2570)

Co-authored-by: jyong <jyong@dify.ai>
Jyong 2024-02-27 11:39:05 +08:00 committed by GitHub
parent 562ca45e07
commit 5b953c1ef2
10 changed files with 33 additions and 59 deletions

View File

@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
notion_workspace_id=workspace_id,
notion_obj_id=page_id,
notion_page_type=page_type,
-notion_access_token=data_source_binding.access_token
+notion_access_token=data_source_binding.access_token,
+tenant_id=current_user.current_tenant_id
)
text_docs = extractor.extract()
@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
notion_info={
"notion_workspace_id": workspace_id,
"notion_obj_id": page['page_id'],
"notion_page_type": page['type']
"notion_page_type": page['type'],
"tenant_id": current_user.current_tenant_id
},
document_model=args['doc_form']
)
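
Both hunks above thread the caller's workspace tenant down to the Notion extraction path, presumably so the extractor can resolve the tenant's stored Notion access token without reaching for current_user, which only exists inside a request. A minimal sketch of the updated call shape, not part of the commit; the IDs, token and import path below are placeholders/assumptions, not values from this PR:

    # Import path assumed from the repo layout; the extractor itself is
    # patched further down in this commit.
    from core.rag.extractor.notion_extractor import NotionExtractor

    # Placeholder IDs/token for illustration; in the controller they come
    # from the request args and the workspace's DataSourceBinding row.
    extractor = NotionExtractor(
        notion_workspace_id="notion-workspace-id",
        notion_obj_id="notion-page-id",
        notion_page_type="page",
        notion_access_token="secret-token",
        tenant_id="tenant-id",  # the argument this commit adds
    )
    text_docs = extractor.extract()  # -> list[Document]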

View File

@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
notion_info={
"notion_workspace_id": workspace_id,
"notion_obj_id": page['page_id'],
"notion_page_type": page['type']
"notion_page_type": page['type'],
"tenant_id": current_user.current_tenant_id
},
document_model=args['doc_form']
)

View File

@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
notion_info={
"notion_workspace_id": data_source_info['notion_workspace_id'],
"notion_obj_id": data_source_info['notion_page_id'],
"notion_page_type": data_source_info['type']
"notion_page_type": data_source_info['type'],
"tenant_id": current_user.current_tenant_id
},
document_model=document.doc_form
)

View File

@ -366,7 +366,8 @@ class IndexingRunner:
"notion_workspace_id": data_source_info['notion_workspace_id'],
"notion_obj_id": data_source_info['notion_page_id'],
"notion_page_type": data_source_info['type'],
"document": dataset_document
"document": dataset_document,
"tenant_id": dataset_document.tenant_id
},
document_model=dataset_document.doc_form
)
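
Note the tenant source differs here: IndexingRunner runs from background indexing tasks where there is no logged-in user, so the tenant is read off the stored document row rather than current_user. A tiny illustrative helper (hypothetical, not in this PR) that captures the distinction; the real code simply inlines the two expressions at each call site:

    def resolve_tenant_id(user=None, dataset_document=None):
        """Hypothetical helper: where each call site in this commit gets its tenant."""
        if dataset_document is not None:
            # Background indexing (IndexingRunner, sync tasks): no request
            # context, so the tenant comes from the document row.
            return dataset_document.tenant_id
        # Console controllers: the authenticated user's active workspace.
        return user.current_tenant_id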

View File

@ -39,7 +39,8 @@ class RetrievalService:
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
-'top_k': top_k
+'top_k': top_k,
+'all_documents': all_documents
})
threads.append(keyword_thread)
keyword_thread.start()
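
The keyword-search thread reports its hits by appending to a shared list, so leaving all_documents out of the thread kwargs meant the keyword results never reached the caller. A stripped-down sketch of that pattern (dummy search, no Flask app context or dataset lookup), not the actual RetrievalService worker:

    import threading

    def keyword_search(query: str, top_k: int, all_documents: list):
        # Stand-in for the real worker: results are appended to the shared
        # list supplied by the caller, which is how they reach the main thread.
        hits = [f"keyword hit {i} for {query!r}" for i in range(top_k)]
        all_documents.extend(hits)

    all_documents = []
    keyword_thread = threading.Thread(target=keyword_search, kwargs={
        'query': "example query",
        'top_k': 2,
        'all_documents': all_documents,  # the kwarg this commit adds
    })
    keyword_thread.start()
    keyword_thread.join()
    print(all_documents)  # now actually contains the keyword hits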

View File

@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
notion_obj_id: str
notion_page_type: str
document: Document = None
+tenant_id: str
class Config:
arbitrary_types_allowed = True
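
Because tenant_id is declared without a default, it becomes a required field of the NotionInfo model, which is why every notion_info dict built in the controllers above now carries it. A quick sketch of the validation behaviour (import path assumed from the repo layout; IDs are placeholders):

    from core.rag.extractor.entity.extract_setting import NotionInfo
    from pydantic import ValidationError

    try:
        NotionInfo(
            notion_workspace_id="notion-workspace-id",
            notion_obj_id="notion-page-id",
            notion_page_type="page",
            # tenant_id omitted: the model now rejects this
        )
    except ValidationError as err:
        print(err)

    info = NotionInfo(
        notion_workspace_id="notion-workspace-id",
        notion_obj_id="notion-page-id",
        notion_page_type="page",
        tenant_id="tenant-id",
    )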

View File

@ -132,7 +132,8 @@ class ExtractProcessor:
notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
notion_obj_id=extract_setting.notion_info.notion_obj_id,
notion_page_type=extract_setting.notion_info.notion_page_type,
-document_model=extract_setting.notion_info.document
+document_model=extract_setting.notion_info.document,
+tenant_id=extract_setting.notion_info.tenant_id,
)
return extractor.extract()
else:

View File

@ -1,13 +1,14 @@
"""Abstract interface for document loader implementations."""
-from typing import Optional
+from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
-from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class HtmlExtractor(BaseExtractor):
-    """Load html files.
+    """
+    Load html files.
Args:
@ -16,56 +17,18 @@ class HtmlExtractor(BaseExtractor):
    def __init__(
        self,
-        file_path: str,
-        encoding: Optional[str] = None,
-        autodetect_encoding: bool = False,
-        source_column: Optional[str] = None,
-        csv_args: Optional[dict] = None,
+        file_path: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
-        self._encoding = encoding
-        self._autodetect_encoding = autodetect_encoding
-        self.source_column = source_column
-        self.csv_args = csv_args or {}

    def extract(self) -> list[Document]:
-        """Load data into document objects."""
-        try:
-            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
-                docs = self._read_from_file(csvfile)
-        except UnicodeDecodeError as e:
-            if self._autodetect_encoding:
-                detected_encodings = detect_file_encodings(self._file_path)
-                for encoding in detected_encodings:
-                    try:
-                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
-                            docs = self._read_from_file(csvfile)
-                            break
-                    except UnicodeDecodeError:
-                        continue
-            else:
-                raise RuntimeError(f"Error loading {self._file_path}") from e
+        return [Document(page_content=self._load_as_text())]

-        return docs
+    def _load_as_text(self) -> str:
+        with open(self._file_path, "rb") as fp:
+            soup = BeautifulSoup(fp, 'html.parser')
+            text = soup.get_text()
+            text = text.strip() if text else ''

-    def _read_from_file(self, csvfile) -> list[Document]:
-        docs = []
-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
-        for i, row in enumerate(csv_reader):
-            content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
-            try:
-                source = (
-                    row[self.source_column]
-                    if self.source_column is not None
-                    else ''
-                )
-            except KeyError:
-                raise ValueError(
-                    f"Source column '{self.source_column}' not found in CSV file."
-                )
-            metadata = {"source": source, "row": i}
-            doc = Document(page_content=content, metadata=metadata)
-            docs.append(doc)
-        return docs
+        return text
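
The previous HtmlExtractor body was leftover CSV-loader code (DictReader, source_column, csv_args), so HTML files were never actually parsed as HTML; the replacement simply strips tags with BeautifulSoup and returns a single Document. A quick usage sketch against a throwaway file, not part of the commit (import path assumed from the repo layout):

    import tempfile

    from core.rag.extractor.html_extractor import HtmlExtractor

    html = "<html><body><h1>Title</h1><p>Hello <b>world</b>.</p></body></html>"
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False) as f:
        f.write(html)
        path = f.name

    docs = HtmlExtractor(file_path=path).extract()
    # One Document whose page_content is the visible text with tags stripped
    # (BeautifulSoup's get_text() with no separator, then strip()).
    print(docs[0].page_content)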

View File

@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
notion_workspace_id: str,
notion_obj_id: str,
notion_page_type: str,
+tenant_id: str,
document_model: Optional[DocumentModel] = None,
-notion_access_token: Optional[str] = None
+notion_access_token: Optional[str] = None,
):
self._notion_access_token = None
self._document_model = document_model

View File

@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
notion_workspace_id=workspace_id,
notion_obj_id=page_id,
notion_page_type=page_type,
-notion_access_token=data_source_binding.access_token
+notion_access_token=data_source_binding.access_token,
+tenant_id=document.tenant_id
)
last_edited_time = loader.get_notion_last_edited_time()