fix(api): enhance file factory URL handling (#9631)

This commit is contained in:
-LAN- 2024-10-22 15:38:08 +08:00 committed by GitHub
parent 98bf7710e4
commit ef5f476cd6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2,6 +2,7 @@ import mimetypes
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
from typing import Any from typing import Any
import httpx
from sqlalchemy import select from sqlalchemy import select
from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
@@ -154,7 +155,7 @@ def _build_from_local_file(
file = File( file = File(
id=mapping.get("id"), id=mapping.get("id"),
filename=row.name, filename=row.name,
extension=row.extension, extension="." + row.extension,
mime_type=row.mime_type, mime_type=row.mime_type,
tenant_id=tenant_id, tenant_id=tenant_id,
type=file_type, type=file_type,
@@ -177,25 +178,29 @@ def _build_from_remote_url(
url = mapping.get("url") url = mapping.get("url")
if not url: if not url:
raise ValueError("Invalid file url") raise ValueError("Invalid file url")
resp = ssrf_proxy.head(url, follow_redirects=True)
resp.raise_for_status()
# Try to extract filename from response headers or URL resp = ssrf_proxy.head(url, follow_redirects=True)
content_disposition = resp.headers.get("Content-Disposition") if resp.status_code == httpx.codes.OK:
if content_disposition: # Try to extract filename from response headers or URL
filename = content_disposition.split("filename=")[-1].strip('"') content_disposition = resp.headers.get("Content-Disposition")
if content_disposition:
filename = content_disposition.split("filename=")[-1].strip('"')
else:
filename = url.split("/")[-1].split("?")[0]
# Create the File object
file_size = int(resp.headers.get("Content-Length", -1))
mime_type = str(resp.headers.get("Content-Type", ""))
else: else:
filename = url.split("/")[-1].split("?")[0] filename = ""
file_size = -1
mime_type = ""
# If filename is empty, set a default one # If filename is empty, set a default one
if not filename: if not filename:
filename = "unknown_file" filename = "unknown_file"
# Determine file extension # Determine file extension
extension = "." + filename.split(".")[-1] if "." in filename else ".bin" extension = "." + filename.split(".")[-1] if "." in filename else ".bin"
# Create the File object
file_size = int(resp.headers.get("Content-Length", -1))
mime_type = str(resp.headers.get("Content-Type", ""))
if not mime_type: if not mime_type:
mime_type, _ = mimetypes.guess_type(url) mime_type, _ = mimetypes.guess_type(url)
file = File( file = File(