security/SSRF vulns (#6682)

Yeuoly 2024-07-25 20:50:26 +08:00 committed by GitHub
parent c5ac004f15
commit 79cb23e8ac
3 changed files with 13 additions and 28 deletions

api/core/helper/ssrf_proxy.py

@@ -17,12 +17,15 @@ proxies = {
     'https://': SSRF_PROXY_HTTPS_URL
 } if SSRF_PROXY_HTTP_URL and SSRF_PROXY_HTTPS_URL else None
 BACKOFF_FACTOR = 0.5
 STATUS_FORCELIST = [429, 500, 502, 503, 504]
 def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
+    if "allow_redirects" in kwargs:
+        allow_redirects = kwargs.pop("allow_redirects")
+        if "follow_redirects" not in kwargs:
+            kwargs["follow_redirects"] = allow_redirects
     retries = 0
     while retries <= max_retries:
         try:
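The shim added to make_request above matters because callers elsewhere still pass the requests-style allow_redirects, while this helper hands the request to httpx, whose API only understands follow_redirects. A minimal sketch of that shape, assuming make_request wraps httpx.request and that the get/head helpers used later in this diff are thin delegating wrappers (the retry policy and proxy wiring are simplified here, not the module's exact code):

import httpx

STATUS_FORCELIST = [429, 500, 502, 503, 504]

def make_request(method, url, max_retries=3, **kwargs):
    # Map the requests-style kwarg onto the httpx one so either spelling works.
    if "allow_redirects" in kwargs:
        allow_redirects = kwargs.pop("allow_redirects")
        if "follow_redirects" not in kwargs:
            kwargs["follow_redirects"] = allow_redirects
    for _ in range(max_retries + 1):
        response = httpx.request(method=method, url=url, **kwargs)
        if response.status_code not in STATUS_FORCELIST:
            return response
    raise Exception(f"Reached maximum retries ({max_retries}) for URL {url}")

def get(url, **kwargs):
    return make_request("GET", url, **kwargs)

def head(url, **kwargs):
    return make_request("HEAD", url, **kwargs)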

api/core/rag/extractor/extract_processor.py

@@ -4,9 +4,8 @@ from pathlib import Path
 from typing import Union
 from urllib.parse import unquote
-import requests
 from configs import dify_config
+from core.helper import ssrf_proxy
 from core.rag.extractor.csv_extractor import CSVExtractor
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
@@ -51,7 +50,7 @@ class ExtractProcessor:
     @classmethod
     def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
-        response = requests.get(url, headers={
+        response = ssrf_proxy.get(url, headers={
             "User-Agent": USER_AGENT
         })
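Because ssrf_proxy.get keeps the familiar requests-style call shape, the change in load_from_url above is a drop-in swap; the difference is that the fetch now goes through the SSRF egress proxy configured by SSRF_PROXY_HTTP_URL / SSRF_PROXY_HTTPS_URL instead of hitting the URL directly. A hedged usage sketch (the URL and User-Agent value are placeholders, and the import only resolves inside the api package):

from core.helper import ssrf_proxy

USER_AGENT = "Dify/placeholder-agent"  # placeholder, not the project's real value

# Same call shape as requests.get(...), but user-supplied URLs can no longer be
# used to reach internal services directly, because the request is routed
# through the configured egress proxy.
response = ssrf_proxy.get("https://example.com/article.html", headers={"User-Agent": USER_AGENT})
if response.status_code == 200:
    text = response.text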

api/core/tools/utils/web_reader_tool.py

@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
     if response.status_code == 200:
         # check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
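The one-line patch of scraper.perform_request above works because, as the change implies, cloudscraper's scraper object is a requests.Session subclass whose request() method delegates the actual HTTP call to a perform_request(method, url, **kwargs) hook; rebinding that hook to ssrf_proxy.make_request pushes the 403/Cloudflare fallback through the same proxied, redirect-normalised transport as the direct fetches. A small sketch of the idea (the target URL and header value are placeholders):

import cloudscraper
from core.helper import ssrf_proxy

scraper = cloudscraper.create_scraper()
# scraper.get() ends up in CloudScraper.request(), which calls
# self.perform_request(method, url, **kwargs); pointing that attribute at
# make_request applies the proxy settings and the allow_redirects /
# follow_redirects shim to the Cloudflare-bypass path as well.
scraper.perform_request = ssrf_proxy.make_request
response = scraper.get("https://example.com/", headers={"User-Agent": "placeholder"}, follow_redirects=True)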
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
     res = FULL_TEMPLATE.format(
         title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
-def get_url_from_newspaper3k(url: str) -> str:
-    a = Article(url)
-    a.download()
-    a.parse()
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-    return res
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)