mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 19:59:50 +08:00
3241e4015b
Co-authored-by: jyong <718720800@qq.com>
36 lines
803 B
Python
36 lines
803 B
Python
import logging
|
|
from typing import List
|
|
|
|
from bs4 import BeautifulSoup
|
|
from langchain.document_loaders.base import BaseLoader
|
|
from langchain.schema import Document
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HTMLLoader(BaseLoader):
    """Document loader that reads an HTML file and keeps only its text.

    The file is parsed with BeautifulSoup's ``html.parser`` backend and the
    text returned by ``get_text()`` is wrapped in a single ``Document``.

    Args:
        file_path: Path to the HTML file to load.
    """

    def __init__(self, file_path: str):
        """Store the path of the file that ``load`` will read."""
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Return a one-element list with the file's text as a ``Document``."""
        content = self._load_as_text()
        return [Document(page_content=content)]

    def _load_as_text(self) -> str:
        """Parse the file and return its text with outer whitespace removed.

        Returns:
            The stripped text of the parsed document, or an empty string when
            the parser yields no text.
        """
        # The file is opened in binary mode and the raw handle is passed to
        # BeautifulSoup, which parses it while the file is still open.
        with open(self._file_path, "rb") as html_file:
            parsed = BeautifulSoup(html_file, 'html.parser')
            extracted = parsed.get_text()

        if not extracted:
            return ''
        return extracted.strip()
|