From ef5f476cd6bb906b8c2df54084ff3780b08aa1ca Mon Sep 17 00:00:00 2001 From: -LAN- Date: Tue, 22 Oct 2024 15:38:08 +0800 Subject: [PATCH] fix(api): enhance file factory URL handling (#9631) --- api/factories/file_factory.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index eac5090c2b..fa88e2b4fe 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -2,6 +2,7 @@ import mimetypes from collections.abc import Mapping, Sequence from typing import Any +import httpx from sqlalchemy import select from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS @@ -154,7 +155,7 @@ def _build_from_local_file( file = File( id=mapping.get("id"), filename=row.name, - extension=row.extension, + extension="." + row.extension, mime_type=row.mime_type, tenant_id=tenant_id, type=file_type, @@ -177,25 +178,29 @@ def _build_from_remote_url( url = mapping.get("url") if not url: raise ValueError("Invalid file url") - resp = ssrf_proxy.head(url, follow_redirects=True) - resp.raise_for_status() - # Try to extract filename from response headers or URL - content_disposition = resp.headers.get("Content-Disposition") - if content_disposition: - filename = content_disposition.split("filename=")[-1].strip('"') + resp = ssrf_proxy.head(url, follow_redirects=True) + if resp.status_code == httpx.codes.OK: + # Try to extract filename from response headers or URL + content_disposition = resp.headers.get("Content-Disposition") + if content_disposition: + filename = content_disposition.split("filename=")[-1].strip('"') + else: + filename = url.split("/")[-1].split("?")[0] + # Create the File object + file_size = int(resp.headers.get("Content-Length", -1)) + mime_type = str(resp.headers.get("Content-Type", "")) else: - filename = url.split("/")[-1].split("?")[0] + filename = "" + file_size = -1 + mime_type = "" + # If filename is empty, set a default one if not filename: filename = "unknown_file" - # Determine file extension extension = "." + filename.split(".")[-1] if "." in filename else ".bin" - # Create the File object - file_size = int(resp.headers.get("Content-Length", -1)) - mime_type = str(resp.headers.get("Content-Type", "")) if not mime_type: mime_type, _ = mimetypes.guess_type(url) file = File(