security/SSRF vulns (#6682)

Yeuoly authored on 2024-07-25 20:50:26 +08:00 (committed by GitHub)
parent c5ac004f15
commit 79cb23e8ac
3 changed files with 13 additions and 28 deletions


@@ -17,12 +17,15 @@ proxies = {
     'https://': SSRF_PROXY_HTTPS_URL
 } if SSRF_PROXY_HTTP_URL and SSRF_PROXY_HTTPS_URL else None
 BACKOFF_FACTOR = 0.5
 STATUS_FORCELIST = [429, 500, 502, 503, 504]
 def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
+    if "allow_redirects" in kwargs:
+        allow_redirects = kwargs.pop("allow_redirects")
+        if "follow_redirects" not in kwargs:
+            kwargs["follow_redirects"] = allow_redirects
     retries = 0
     while retries <= max_retries:
         try:
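The added lines are a compatibility shim: call sites written against requests pass allow_redirects, while this helper forwards everything to httpx, which only understands follow_redirects. Below is a minimal, self-contained sketch of how such a wrapper fits together; the configuration constants are placeholders and the retry loop is simplified, so treat it as an illustration rather than the module's exact code.

import time

import httpx

# Placeholder stand-ins for the module-level settings the diff relies on;
# in the real module they come from environment configuration.
SSRF_PROXY_HTTP_URL = "http://localhost:3128"
SSRF_PROXY_HTTPS_URL = "http://localhost:3128"
SSRF_DEFAULT_MAX_RETRIES = 3
BACKOFF_FACTOR = 0.5
STATUS_FORCELIST = [429, 500, 502, 503, 504]

proxies = {
    'http://': SSRF_PROXY_HTTP_URL,
    'https://': SSRF_PROXY_HTTPS_URL,
} if SSRF_PROXY_HTTP_URL and SSRF_PROXY_HTTPS_URL else None


def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
    # Translate the requests-style keyword into the httpx-style one.
    if "allow_redirects" in kwargs:
        allow_redirects = kwargs.pop("allow_redirects")
        if "follow_redirects" not in kwargs:
            kwargs["follow_redirects"] = allow_redirects

    retries = 0
    while retries <= max_retries:
        # The proxies= keyword matches the httpx releases current when this
        # commit landed; newer httpx versions configure proxies on a Client.
        response = httpx.request(method=method, url=url, proxies=proxies, **kwargs)
        if response.status_code not in STATUS_FORCELIST:
            return response
        # Exponential backoff on throttling and transient server errors.
        time.sleep(BACKOFF_FACTOR * (2 ** retries))
        retries += 1

    raise Exception(f"Reached maximum retries ({max_retries}) for URL {url}")

Because every outbound request funnels through this one function, the forward proxy (and whatever address filtering it enforces) becomes the single egress point for user-supplied URLs.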


@@ -4,9 +4,8 @@ from pathlib import Path
 from typing import Union
 from urllib.parse import unquote
-import requests
 from configs import dify_config
+from core.helper import ssrf_proxy
 from core.rag.extractor.csv_extractor import CSVExtractor
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
@ -51,7 +50,7 @@ class ExtractProcessor:
@classmethod @classmethod
def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]: def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
response = requests.get(url, headers={ response = ssrf_proxy.get(url, headers={
"User-Agent": USER_AGENT "User-Agent": USER_AGENT
}) })
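With the import swapped, the extractor no longer fetches user-supplied URLs with requests directly; the call goes through the same ssrf_proxy helper, so the SSRF filtering applies here too. A rough usage sketch of the drop-in nature of the change, with a hypothetical fetch_remote_document wrapper and a placeholder USER_AGENT standing in for the module's constant:

from core.helper import ssrf_proxy

# Placeholder; the real constant is defined elsewhere in the module.
USER_AGENT = "Mozilla/5.0 (compatible; Dify)"


def fetch_remote_document(url: str) -> str:
    # ssrf_proxy.get keeps the familiar requests/httpx-style keyword interface,
    # so only the callable changes at the call site.
    response = ssrf_proxy.get(url, headers={"User-Agent": USER_AGENT})
    response.raise_for_status()
    return response.text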


@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
     if response.status_code == 200:
         # check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
     res = FULL_TEMPLATE.format(
         title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
-def get_url_from_newspaper3k(url: str) -> str:
-    a = Article(url)
-    a.download()
-    a.parse()
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-    return res
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)
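Dropping the get_url_from_newspaper3k fallback removes the last unproxied fetch in this path: newspaper3k's Article.download() issues its own HTTP request to the user-supplied URL, outside the ssrf_proxy helper, which is why the empty-text case now simply returns ''.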