From 722964667f8bc33fa1af888c36de514433d40eee Mon Sep 17 00:00:00 2001
From: yihong <zouzou0208@gmail.com>
Date: Thu, 14 Nov 2024 17:29:49 +0800
Subject: [PATCH] fix: non utf8 code decode close #10691 (#10698)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
---
 api/core/workflow/nodes/document_extractor/node.py    |  8 ++++----
 .../workflow/nodes/test_document_extractor_node.py    | 11 +++++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index c90017d5e1..c3cacdab7f 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -143,14 +143,14 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 
 def _extract_text_from_plain_text(file_content: bytes) -> str:
     try:
-        return file_content.decode("utf-8")
+        return file_content.decode("utf-8", "ignore")
     except UnicodeDecodeError as e:
         raise TextExtractionError("Failed to decode plain text file") from e
 
 
 def _extract_text_from_json(file_content: bytes) -> str:
     try:
-        json_data = json.loads(file_content.decode("utf-8"))
+        json_data = json.loads(file_content.decode("utf-8", "ignore"))
         return json.dumps(json_data, indent=2, ensure_ascii=False)
     except (UnicodeDecodeError, json.JSONDecodeError) as e:
         raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
@@ -159,7 +159,7 @@ def _extract_text_from_json(file_content: bytes) -> str:
 def _extract_text_from_yaml(file_content: bytes) -> str:
     """Extract the content from yaml file"""
     try:
-        yaml_data = yaml.safe_load_all(file_content.decode("utf-8"))
+        yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
         return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
     except (UnicodeDecodeError, yaml.YAMLError) as e:
         raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
@@ -217,7 +217,7 @@ def _extract_text_from_file(file: File):
 
 def _extract_text_from_csv(file_content: bytes) -> str:
     try:
-        csv_file = io.StringIO(file_content.decode("utf-8"))
+        csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
         csv_reader = csv.reader(csv_file)
         rows = list(csv_reader)
 
diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
index 4f1f8f05c8..6c4caec25f 100644
--- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
@@ -140,6 +140,17 @@ def test_extract_text_from_plain_text():
     assert text == "Hello, world!"
 
 
+def test_extract_text_from_plain_text_non_utf8():
+    import tempfile
+
+    non_utf8_content = b"Hello world\xa9."  # a lone 0xA9 byte (© in Latin-1) is not valid UTF-8
+    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
+        temp_file.write(non_utf8_content)
+        temp_file.seek(0)
+        text = _extract_text_from_plain_text(temp_file.read())
+    assert text == "Hello world."  # the undecodable byte is dropped by errors="ignore"
+
+
 @patch("pypdfium2.PdfDocument")
 def test_extract_text_from_pdf(mock_pdf_document):
     mock_page = Mock()