mirror of
https://github.com/langgenius/dify.git
synced 2024-11-15 19:22:36 +08:00
feat(workflow): Support JSON type in document extractor node (#9899)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Co-authored-by: -LAN- <laipz8200@outlook.com>
This commit is contained in:
parent
dd3ac7a2c9
commit
216442ddc1
|
@ -1,5 +1,6 @@
|
|||
import csv
|
||||
import io
|
||||
import json
|
||||
|
||||
import docx
|
||||
import pandas as pd
|
||||
|
@ -77,34 +78,31 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
|
|||
|
||||
def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
||||
"""Extract text from a file based on its MIME type."""
|
||||
if mime_type.startswith("text/plain") or mime_type in {"text/html", "text/htm", "text/markdown", "text/xml"}:
|
||||
return _extract_text_from_plain_text(file_content)
|
||||
elif mime_type == "application/pdf":
|
||||
return _extract_text_from_pdf(file_content)
|
||||
elif mime_type in {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/msword",
|
||||
}:
|
||||
return _extract_text_from_doc(file_content)
|
||||
elif mime_type == "text/csv":
|
||||
return _extract_text_from_csv(file_content)
|
||||
elif mime_type in {
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.ms-excel",
|
||||
}:
|
||||
return _extract_text_from_excel(file_content)
|
||||
elif mime_type == "application/vnd.ms-powerpoint":
|
||||
return _extract_text_from_ppt(file_content)
|
||||
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||
return _extract_text_from_pptx(file_content)
|
||||
elif mime_type == "application/epub+zip":
|
||||
return _extract_text_from_epub(file_content)
|
||||
elif mime_type == "message/rfc822":
|
||||
return _extract_text_from_eml(file_content)
|
||||
elif mime_type == "application/vnd.ms-outlook":
|
||||
return _extract_text_from_msg(file_content)
|
||||
else:
|
||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||
match mime_type:
|
||||
case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml":
|
||||
return _extract_text_from_plain_text(file_content)
|
||||
case "application/pdf":
|
||||
return _extract_text_from_pdf(file_content)
|
||||
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
|
||||
return _extract_text_from_doc(file_content)
|
||||
case "text/csv":
|
||||
return _extract_text_from_csv(file_content)
|
||||
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
|
||||
return _extract_text_from_excel(file_content)
|
||||
case "application/vnd.ms-powerpoint":
|
||||
return _extract_text_from_ppt(file_content)
|
||||
case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||
return _extract_text_from_pptx(file_content)
|
||||
case "application/epub+zip":
|
||||
return _extract_text_from_epub(file_content)
|
||||
case "message/rfc822":
|
||||
return _extract_text_from_eml(file_content)
|
||||
case "application/vnd.ms-outlook":
|
||||
return _extract_text_from_msg(file_content)
|
||||
case "application/json":
|
||||
return _extract_text_from_json(file_content)
|
||||
case _:
|
||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||
|
||||
|
||||
def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
|
||||
|
@ -112,6 +110,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||
match file_extension:
|
||||
case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml":
|
||||
return _extract_text_from_plain_text(file_content)
|
||||
case ".json":
|
||||
return _extract_text_from_json(file_content)
|
||||
case ".pdf":
|
||||
return _extract_text_from_pdf(file_content)
|
||||
case ".doc" | ".docx":
|
||||
|
@ -141,6 +141,14 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
|
|||
raise TextExtractionError("Failed to decode plain text file") from e
|
||||
|
||||
|
||||
def _extract_text_from_json(file_content: bytes) -> str:
|
||||
try:
|
||||
json_data = json.loads(file_content.decode("utf-8"))
|
||||
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||
except (UnicodeDecodeError, json.JSONDecodeError) as e:
|
||||
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
||||
|
||||
|
||||
def _extract_text_from_pdf(file_content: bytes) -> str:
|
||||
try:
|
||||
pdf_file = io.BytesIO(file_content)
|
||||
|
@ -183,13 +191,13 @@ def _download_file_content(file: File) -> bytes:
|
|||
|
||||
|
||||
def _extract_text_from_file(file: File):
|
||||
if file.mime_type is None:
|
||||
raise UnsupportedFileTypeError("Unable to determine file type: MIME type is missing")
|
||||
file_content = _download_file_content(file)
|
||||
if file.transfer_method == FileTransferMethod.REMOTE_URL:
|
||||
if file.extension:
|
||||
extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
|
||||
elif file.mime_type:
|
||||
extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type)
|
||||
else:
|
||||
extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
|
||||
raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
|
||||
return extracted_text
|
||||
|
||||
|
||||
|
|
|
@ -125,7 +125,7 @@ def test_run_extract_text(
|
|||
result = document_extractor_node._run()
|
||||
|
||||
assert isinstance(result, NodeRunResult)
|
||||
assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED
|
||||
assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED, result.error
|
||||
assert result.outputs is not None
|
||||
assert result.outputs["text"] == expected_text
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user