mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
Merge branch 'main' of https://github.com/mendableai/firecrawl
This commit is contained in:
commit
56179924e6
|
@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||
const $ = cheerio.load(html);
|
||||
const links: string[] = [];
|
||||
|
||||
// Parse the base URL to get the origin
|
||||
const urlObject = new URL(baseUrl);
|
||||
const origin = urlObject.origin;
|
||||
|
||||
$('a').each((_, element) => {
|
||||
const href = $(element).attr('href');
|
||||
if (href) {
|
||||
if (href.startsWith('http://') || href.startsWith('https://')) {
|
||||
// Absolute URL, add as is
|
||||
links.push(href);
|
||||
} else if (href.startsWith('/')) {
|
||||
// Relative URL starting with '/', append to origin
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
||||
// Relative URL not starting with '/', append to base URL
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (href.startsWith('mailto:')) {
|
||||
// mailto: links, add as is
|
||||
links.push(href);
|
||||
try {
|
||||
if (href.startsWith('http://') || href.startsWith('https://')) {
|
||||
// Absolute URL, add as is
|
||||
links.push(href);
|
||||
} else if (href.startsWith('/')) {
|
||||
// Relative URL starting with '/', append to base URL
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
||||
// Relative URL not starting with '/', append to base URL
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (href.startsWith('mailto:')) {
|
||||
// mailto: links, add as is
|
||||
links.push(href);
|
||||
}
|
||||
// Fragment-only links (#) are ignored
|
||||
} catch (error) {
|
||||
// Log the error and continue
|
||||
console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
|
||||
}
|
||||
// Fragment-only links (#) are ignored
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
|
|||
import logging
|
||||
import os
|
||||
|
||||
from .firecrawl import FirecrawlApp
|
||||
from .firecrawl import FirecrawlApp # noqa
|
||||
|
||||
__version__ = "1.4.0"
|
||||
|
||||
|
@ -19,24 +19,46 @@ __version__ = "1.4.0"
|
|||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
|
||||
def _basic_config() -> None:
|
||||
"""Set up basic configuration for logging with a specific format and date format."""
|
||||
def _configure_logger() -> None:
|
||||
"""
|
||||
Configure the firecrawl logger for console output.
|
||||
|
||||
The function attaches a handler for console output with a specific format and date
|
||||
format to the firecrawl logger.
|
||||
"""
|
||||
try:
|
||||
logging.basicConfig(
|
||||
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
||||
# Create the formatter
|
||||
formatter = logging.Formatter(
|
||||
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
|
||||
# Create the console handler and set the formatter
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setFormatter(formatter)
|
||||
|
||||
# Add the console handler to the firecrawl logger
|
||||
logger.addHandler(console_handler)
|
||||
except Exception as e:
|
||||
logger.error("Failed to configure logging: %s", e)
|
||||
|
||||
|
||||
def setup_logging() -> None:
|
||||
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
||||
env = os.environ.get(
|
||||
"FIRECRAWL_LOGGING_LEVEL", "INFO"
|
||||
).upper() # Default to 'INFO' level
|
||||
_basic_config()
|
||||
# Check if the firecrawl logger already has a handler
|
||||
if logger.hasHandlers():
|
||||
return # To prevent duplicate logging
|
||||
|
||||
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
|
||||
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
|
||||
# Attach a no-op handler to prevent warnings about no handlers
|
||||
logger.addHandler(logging.NullHandler())
|
||||
return
|
||||
|
||||
# Attach the console handler to the firecrawl logger
|
||||
_configure_logger()
|
||||
|
||||
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
|
||||
if env == "DEBUG":
|
||||
logger.setLevel(logging.DEBUG)
|
||||
elif env == "INFO":
|
||||
|
|
Loading…
Reference in New Issue
Block a user