From d33a269548c87ea2b413e9ffe080e98250337441 Mon Sep 17 00:00:00 2001 From: yezhwi Date: Thu, 31 Aug 2023 14:45:31 +0800 Subject: [PATCH] refactor(file extractor): file extractor (#1059) --- api/controllers/console/datasets/file.py | 4 ++-- api/core/data_loader/file_extractor.py | 13 +++++++------ .../datasets/create/file-uploader/index.tsx | 2 +- web/app/components/datasets/create/index.tsx | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/api/controllers/console/datasets/file.py b/api/controllers/console/datasets/file.py index aef25b24f8..adef2c74cb 100644 --- a/api/controllers/console/datasets/file.py +++ b/api/controllers/console/datasets/file.py @@ -83,7 +83,7 @@ class FileApi(Resource): raise FileTooLargeError(message) extension = file.filename.split('.')[-1] - if extension not in ALLOWED_EXTENSIONS: + if extension.lower() not in ALLOWED_EXTENSIONS: raise UnsupportedFileTypeError() # user uuid as file name @@ -136,7 +136,7 @@ class FilePreviewApi(Resource): # extract text from file extension = upload_file.extension - if extension not in ALLOWED_EXTENSIONS: + if extension.lower() not in ALLOWED_EXTENSIONS: raise UnsupportedFileTypeError() text = FileExtractor.load(upload_file, return_text=True) diff --git a/api/core/data_loader/file_extractor.py b/api/core/data_loader/file_extractor.py index b7f80b988a..4b811f51c6 100644 --- a/api/core/data_loader/file_extractor.py +++ b/api/core/data_loader/file_extractor.py @@ -47,17 +47,18 @@ class FileExtractor: upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]: input_file = Path(file_path) delimiter = '\n' - if input_file.suffix == '.xlsx': + file_extension = input_file.suffix.lower() + if file_extension == '.xlsx': loader = ExcelLoader(file_path) - elif input_file.suffix == '.pdf': + elif file_extension == '.pdf': loader = PdfLoader(file_path, upload_file=upload_file) - elif input_file.suffix in ['.md', '.markdown']: + elif file_extension in ['.md', '.markdown']: loader = MarkdownLoader(file_path, autodetect_encoding=True) - elif input_file.suffix in ['.htm', '.html']: + elif file_extension in ['.htm', '.html']: loader = HTMLLoader(file_path) - elif input_file.suffix == '.docx': + elif file_extension == '.docx': loader = Docx2txtLoader(file_path) - elif input_file.suffix == '.csv': + elif file_extension == '.csv': loader = CSVLoader(file_path, autodetect_encoding=True) else: # txt diff --git a/web/app/components/datasets/create/file-uploader/index.tsx b/web/app/components/datasets/create/file-uploader/index.tsx index 387e9e0c6c..824f1d4f3f 100644 --- a/web/app/components/datasets/create/file-uploader/index.tsx +++ b/web/app/components/datasets/create/file-uploader/index.tsx @@ -78,7 +78,7 @@ const FileUploader = ({ const isValid = useCallback((file: File) => { const { size } = file const ext = `.${getFileType(file)}` - const isValidType = ACCEPTS.includes(ext) + const isValidType = ACCEPTS.includes(ext.toLowerCase()) if (!isValidType) notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') }) diff --git a/web/app/components/datasets/create/index.tsx b/web/app/components/datasets/create/index.tsx index 2771087ddd..965477d039 100644 --- a/web/app/components/datasets/create/index.tsx +++ b/web/app/components/datasets/create/index.tsx @@ -151,4 +151,4 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { ) } -export default DatasetUpdateForm +export default DatasetUpdateForm \ No newline at end of file