security/SSRF vulns (#6682)

Yeuoly 2024-07-25 20:50:26 +08:00 committed by GitHub
parent c5ac004f15
commit 79cb23e8ac
3 changed files with 13 additions and 28 deletions

api/core/helper/ssrf_proxy.py

@@ -17,12 +17,15 @@ proxies = {
     'https://': SSRF_PROXY_HTTPS_URL
 } if SSRF_PROXY_HTTP_URL and SSRF_PROXY_HTTPS_URL else None
 BACKOFF_FACTOR = 0.5
 STATUS_FORCELIST = [429, 500, 502, 503, 504]
 def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
+    if "allow_redirects" in kwargs:
+        allow_redirects = kwargs.pop("allow_redirects")
+        if "follow_redirects" not in kwargs:
+            kwargs["follow_redirects"] = allow_redirects
     retries = 0
     while retries <= max_retries:
         try:
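The shim added to make_request above matters because callers elsewhere still pass the requests-style allow_redirects, while this helper hands the request to httpx, whose API only understands follow_redirects. A minimal sketch of that shape, assuming make_request wraps httpx.request and that the get/head helpers used later in this diff are thin delegating wrappers (the retry policy and proxy wiring are simplified here, not the module's exact code):

import httpx

STATUS_FORCELIST = [429, 500, 502, 503, 504]

def make_request(method, url, max_retries=3, **kwargs):
    # Map the requests-style kwarg onto the httpx one so either spelling works.
    if "allow_redirects" in kwargs:
        allow_redirects = kwargs.pop("allow_redirects")
        if "follow_redirects" not in kwargs:
            kwargs["follow_redirects"] = allow_redirects
    for _ in range(max_retries + 1):
        response = httpx.request(method=method, url=url, **kwargs)
        if response.status_code not in STATUS_FORCELIST:
            return response
    raise Exception(f"Reached maximum retries ({max_retries}) for URL {url}")

def get(url, **kwargs):
    return make_request("GET", url, **kwargs)

def head(url, **kwargs):
    return make_request("HEAD", url, **kwargs)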

api/core/rag/extractor/extract_processor.py

@@ -4,9 +4,8 @@ from pathlib import Path
 from typing import Union
 from urllib.parse import unquote
-import requests
 from configs import dify_config
+from core.helper import ssrf_proxy
 from core.rag.extractor.csv_extractor import CSVExtractor
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
@@ -51,7 +50,7 @@ class ExtractProcessor:
     @classmethod
     def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
-        response = requests.get(url, headers={
+        response = ssrf_proxy.get(url, headers={
             "User-Agent": USER_AGENT
         })
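Because ssrf_proxy.get keeps the familiar requests-style call shape, the change in load_from_url above is a drop-in swap; the difference is that the fetch now goes through the SSRF egress proxy configured by SSRF_PROXY_HTTP_URL / SSRF_PROXY_HTTPS_URL instead of hitting the URL directly. A hedged usage sketch (the URL and User-Agent value are placeholders, and the import only resolves inside the api package):

from core.helper import ssrf_proxy

USER_AGENT = "Dify/placeholder-agent"  # placeholder, not the project's real value

# Same call shape as requests.get(...), but user-supplied URLs can no longer be
# used to reach internal services directly, because the request is routed
# through the configured egress proxy.
response = ssrf_proxy.get("https://example.com/article.html", headers={"User-Agent": USER_AGENT})
if response.status_code == 200:
    text = response.text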

api/core/tools/utils/web_reader_tool.py

@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
     if response.status_code == 200:
         # check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
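The one-line patch of scraper.perform_request above works because, as the change implies, cloudscraper's scraper object is a requests.Session subclass whose request() method delegates the actual HTTP call to a perform_request(method, url, **kwargs) hook; rebinding that hook to ssrf_proxy.make_request pushes the 403/Cloudflare fallback through the same proxied, redirect-normalised transport as the direct fetches. A small sketch of the idea (the target URL and header value are placeholders):

import cloudscraper
from core.helper import ssrf_proxy

scraper = cloudscraper.create_scraper()
# scraper.get() ends up in CloudScraper.request(), which calls
# self.perform_request(method, url, **kwargs); pointing that attribute at
# make_request applies the proxy settings and the allow_redirects /
# follow_redirects shim to the Cloudflare-bypass path as well.
scraper.perform_request = ssrf_proxy.make_request
response = scraper.get("https://example.com/", headers={"User-Agent": "placeholder"}, follow_redirects=True)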
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
     res = FULL_TEMPLATE.format(
         title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
-def get_url_from_newspaper3k(url: str) -> str:
-    a = Article(url)
-    a.download()
-    a.parse()
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-    return res
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)