mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
Feat/improve document delete logic (#1325)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
parent
c0fe706597
commit
289c93d081
|
@ -11,6 +11,7 @@ from flask import current_app, Flask
|
||||||
from flask_login import current_user
|
from flask_login import current_user
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
|
from sqlalchemy.orm.exc import ObjectDeletedError
|
||||||
|
|
||||||
from core.data_loader.file_extractor import FileExtractor
|
from core.data_loader.file_extractor import FileExtractor
|
||||||
from core.data_loader.loader.notion import NotionLoader
|
from core.data_loader.loader.notion import NotionLoader
|
||||||
|
@ -79,6 +80,8 @@ class IndexingRunner:
|
||||||
dataset_document.error = str(e.description)
|
dataset_document.error = str(e.description)
|
||||||
dataset_document.stopped_at = datetime.datetime.utcnow()
|
dataset_document.stopped_at = datetime.datetime.utcnow()
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
except ObjectDeletedError:
|
||||||
|
logging.warning('Document deleted, document id: {}'.format(dataset_document.id))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception("consume document failed")
|
logging.exception("consume document failed")
|
||||||
dataset_document.indexing_status = 'error'
|
dataset_document.indexing_status = 'error'
|
||||||
|
@ -276,7 +279,8 @@ class IndexingRunner:
|
||||||
)
|
)
|
||||||
if len(preview_texts) > 0:
|
if len(preview_texts) > 0:
|
||||||
# qa model document
|
# qa model document
|
||||||
response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
|
response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0],
|
||||||
|
doc_language)
|
||||||
document_qa_list = self.format_split_text(response)
|
document_qa_list = self.format_split_text(response)
|
||||||
return {
|
return {
|
||||||
"total_segments": total_segments * 20,
|
"total_segments": total_segments * 20,
|
||||||
|
@ -372,7 +376,8 @@ class IndexingRunner:
|
||||||
)
|
)
|
||||||
if len(preview_texts) > 0:
|
if len(preview_texts) > 0:
|
||||||
# qa model document
|
# qa model document
|
||||||
response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
|
response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0],
|
||||||
|
doc_language)
|
||||||
document_qa_list = self.format_split_text(response)
|
document_qa_list = self.format_split_text(response)
|
||||||
return {
|
return {
|
||||||
"total_segments": total_segments * 20,
|
"total_segments": total_segments * 20,
|
||||||
|
@ -582,7 +587,6 @@ class IndexingRunner:
|
||||||
|
|
||||||
all_qa_documents.extend(format_documents)
|
all_qa_documents.extend(format_documents)
|
||||||
|
|
||||||
|
|
||||||
def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter,
|
def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter,
|
||||||
processing_rule: DatasetProcessRule) -> List[Document]:
|
processing_rule: DatasetProcessRule) -> List[Document]:
|
||||||
"""
|
"""
|
||||||
|
@ -734,6 +738,9 @@ class IndexingRunner:
|
||||||
count = DatasetDocument.query.filter_by(id=document_id, is_paused=True).count()
|
count = DatasetDocument.query.filter_by(id=document_id, is_paused=True).count()
|
||||||
if count > 0:
|
if count > 0:
|
||||||
raise DocumentIsPausedException()
|
raise DocumentIsPausedException()
|
||||||
|
document = DatasetDocument.query.filter_by(id=document_id).first()
|
||||||
|
if not document:
|
||||||
|
raise DocumentIsDeletedPausedException()
|
||||||
|
|
||||||
update_params = {
|
update_params = {
|
||||||
DatasetDocument.indexing_status: after_indexing_status
|
DatasetDocument.indexing_status: after_indexing_status
|
||||||
|
@ -781,3 +788,7 @@ class IndexingRunner:
|
||||||
|
|
||||||
class DocumentIsPausedException(Exception):
|
class DocumentIsPausedException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentIsDeletedPausedException(Exception):
|
||||||
|
pass
|
||||||
|
|
|
@ -385,9 +385,6 @@ class DocumentService:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_document(document):
|
def delete_document(document):
|
||||||
if document.indexing_status in ["parsing", "cleaning", "splitting", "indexing"]:
|
|
||||||
raise DocumentIndexingError()
|
|
||||||
|
|
||||||
# trigger document_was_deleted signal
|
# trigger document_was_deleted signal
|
||||||
document_was_deleted.send(document.id, dataset_id=document.dataset_id)
|
document_was_deleted.send(document.id, dataset_id=document.dataset_id)
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,8 @@ def clean_document_task(document_id: str, dataset_id: str):
|
||||||
kw_index = IndexBuilder.get_index(dataset, 'economy')
|
kw_index = IndexBuilder.get_index(dataset, 'economy')
|
||||||
|
|
||||||
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
|
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
|
||||||
|
# check whether any segments exist
|
||||||
|
if segments:
|
||||||
index_node_ids = [segment.index_node_id for segment in segments]
|
index_node_ids = [segment.index_node_id for segment in segments]
|
||||||
|
|
||||||
# delete from vector index
|
# delete from vector index
|
||||||
|
|
|
@ -30,9 +30,7 @@ def document_indexing_task(dataset_id: str, document_ids: list):
|
||||||
Document.dataset_id == dataset_id
|
Document.dataset_id == dataset_id
|
||||||
).first()
|
).first()
|
||||||
|
|
||||||
if not document:
|
if document:
|
||||||
raise NotFound('Document not found')
|
|
||||||
|
|
||||||
document.indexing_status = 'parsing'
|
document.indexing_status = 'parsing'
|
||||||
document.processing_started_at = datetime.datetime.utcnow()
|
document.processing_started_at = datetime.datetime.utcnow()
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user