refactor(file extractor): file extractor (#1059)

This commit is contained in:
yezhwi 2023-08-31 14:45:31 +08:00 committed by GitHub
parent d3f8ea2df0
commit d33a269548
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 11 additions and 10 deletions

View File

@ -83,7 +83,7 @@ class FileApi(Resource):
raise FileTooLargeError(message)
extension = file.filename.split('.')[-1]
if extension not in ALLOWED_EXTENSIONS:
if extension.lower() not in ALLOWED_EXTENSIONS:
raise UnsupportedFileTypeError()
# user uuid as file name
@ -136,7 +136,7 @@ class FilePreviewApi(Resource):
# extract text from file
extension = upload_file.extension
if extension not in ALLOWED_EXTENSIONS:
if extension.lower() not in ALLOWED_EXTENSIONS:
raise UnsupportedFileTypeError()
text = FileExtractor.load(upload_file, return_text=True)

View File

@ -47,17 +47,18 @@ class FileExtractor:
upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]:
input_file = Path(file_path)
delimiter = '\n'
if input_file.suffix == '.xlsx':
file_extension = input_file.suffix.lower()
if file_extension == '.xlsx':
loader = ExcelLoader(file_path)
elif input_file.suffix == '.pdf':
elif file_extension == '.pdf':
loader = PdfLoader(file_path, upload_file=upload_file)
elif input_file.suffix in ['.md', '.markdown']:
elif file_extension in ['.md', '.markdown']:
loader = MarkdownLoader(file_path, autodetect_encoding=True)
elif input_file.suffix in ['.htm', '.html']:
elif file_extension in ['.htm', '.html']:
loader = HTMLLoader(file_path)
elif input_file.suffix == '.docx':
elif file_extension == '.docx':
loader = Docx2txtLoader(file_path)
elif input_file.suffix == '.csv':
elif file_extension == '.csv':
loader = CSVLoader(file_path, autodetect_encoding=True)
else:
# txt

View File

@ -78,7 +78,7 @@ const FileUploader = ({
const isValid = useCallback((file: File) => {
const { size } = file
const ext = `.${getFileType(file)}`
const isValidType = ACCEPTS.includes(ext)
const isValidType = ACCEPTS.includes(ext.toLowerCase())
if (!isValidType)
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })

View File

@ -151,4 +151,4 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
)
}
export default DatasetUpdateForm
export default DatasetUpdateForm