/**
 * Collects the distinct link targets of every `<a href>` element in an HTML
 * document, in first-seen order.
 *
 * - Absolute `http://` / `https://` hrefs and `mailto:` hrefs are returned
 *   verbatim.
 * - Every other href (root-relative, document-relative, `../` paths,
 *   protocol-relative `//host/...`, other schemes such as `tel:`) is resolved
 *   against `baseUrl` with the standard URL resolution algorithm, so
 *   `sibling.html` on `https://site/docs/page.html` becomes
 *   `https://site/docs/sibling.html` rather than being glued onto the page URL.
 * - Empty hrefs, fragment-only hrefs (`#...`) and hrefs that cannot be parsed
 *   are skipped.
 *
 * @param html - Markup to scan for anchors.
 * @param baseUrl - Absolute URL of the page the markup was fetched from.
 * @returns The de-duplicated list of link targets.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);

  // Parsed once, up front: an invalid baseUrl fails fast instead of being
  // silently swallowed per link further down.
  const base = new URL(baseUrl);

  // A Set keeps insertion order and drops duplicates as we go.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const href = $(element).attr('href')?.trim();

    // Nothing to follow: missing/blank href or a same-page fragment.
    if (!href || href.startsWith('#')) {
      return;
    }

    // Already fully qualified; keep exactly what the page author wrote.
    if (
      href.startsWith('http://') ||
      href.startsWith('https://') ||
      href.startsWith('mailto:')
    ) {
      links.add(href);
      return;
    }

    try {
      // Handles "/root", "page", "../up", "?q=1", "//cdn.host/x", "tel:...".
      links.add(new URL(href, base).href);
    } catch {
      // One malformed href must not abort extraction for the whole page.
    }
  });

  return [...links];
}
rawHtml : undefined, metadata: { @@ -344,6 +380,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, + linksOnPage, }; } @@ -354,6 +391,7 @@ export async function scrapSingleUrl( content: "", markdown: "", html: "", + linksOnPage: [], metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode,