This commit is contained in:
Nicolas 2024-11-05 13:08:02 -05:00
commit 56179924e6
2 changed files with 49 additions and 26 deletions

View File

@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] {
const $ = cheerio.load(html);
const links: string[] = [];
// Parse the base URL to get the origin
const urlObject = new URL(baseUrl);
const origin = urlObject.origin;
$('a').each((_, element) => {
const href = $(element).attr('href');
if (href) {
if (href.startsWith('http://') || href.startsWith('https://')) {
// Absolute URL, add as is
links.push(href);
} else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin
links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL
links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) {
// mailto: links, add as is
links.push(href);
try {
if (href.startsWith('http://') || href.startsWith('https://')) {
// Absolute URL, add as is
links.push(href);
} else if (href.startsWith('/')) {
// Relative URL starting with '/', append to base URL
links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL
links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) {
// mailto: links, add as is
links.push(href);
}
// Fragment-only links (#) are ignored
} catch (error) {
// Log the error and continue
console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
}
// Fragment-only links (#) are ignored
}
});

View File

@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
import logging
import os
from .firecrawl import FirecrawlApp
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.4.0"
@ -19,24 +19,46 @@ __version__ = "1.4.0"
logger: logging.Logger = logging.getLogger("firecrawl")
def _basic_config() -> None:
"""Set up basic configuration for logging with a specific format and date format."""
def _configure_logger() -> None:
"""
Configure the firecrawl logger for console output.
The function attaches a handler for console output with a specific format and date
format to the firecrawl logger.
"""
try:
logging.basicConfig(
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
# Create the formatter
formatter = logging.Formatter(
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Create the console handler and set the formatter
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
# Add the console handler to the firecrawl logger
logger.addHandler(console_handler)
except Exception as e:
logger.error("Failed to configure logging: %s", e)
def setup_logging() -> None:
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
env = os.environ.get(
"FIRECRAWL_LOGGING_LEVEL", "INFO"
).upper() # Default to 'INFO' level
_basic_config()
# Check if the firecrawl logger already has a handler
if logger.hasHandlers():
return # To prevent duplicate logging
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
# Attach a no-op handler to prevent warnings about no handlers
logger.addHandler(logging.NullHandler())
return
# Attach the console handler to the firecrawl logger
_configure_logger()
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
if env == "DEBUG":
logger.setLevel(logging.DEBUG)
elif env == "INFO":