Caleb: now extracting and returning a list of all links on the page for a customer

Caleb Peffer 2024-07-16 18:38:03 -07:00
parent db0545014f
commit d39d3be649
2 changed files with 44 additions and 4 deletions


@@ -89,7 +89,8 @@ export class Document {
  warning?: string;
  index?: number;
  linksOnPage?: string[]; // Add this new field as a separate property

  constructor(data: Partial<Document>) {
    if (!data.content) {
      throw new Error("Missing required fields");
@@ -102,6 +103,7 @@ export class Document {
    this.markdown = data.markdown || "";
    this.childrenLinks = data.childrenLinks || undefined;
    this.provider = data.provider || undefined;
    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
  }
}
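
For reference, a minimal sketch of how the new field is consumed (not part of this commit; the sample values are illustrative):

  const doc = new Document({
    content: "scraped page text",
    linksOnPage: ["https://example.com/docs"],
  });
  console.log(doc.linksOnPage); // ["https://example.com/docs"]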


@@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
  return scrapersInOrder as (typeof baseScrapers)[number][];
}

function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  const links: string[] = [];

  // Parse the base URL to get the origin
  const urlObject = new URL(baseUrl);
  const origin = urlObject.origin;

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (href) {
      if (href.startsWith('http://') || href.startsWith('https://')) {
        // Absolute URL, add as is
        links.push(href);
      } else if (href.startsWith('/')) {
        // Relative URL starting with '/', append to origin
        links.push(`${origin}${href}`);
      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
        // Relative URL not starting with '/', append to base URL
        links.push(`${baseUrl}/${href}`);
      } else if (href.startsWith('mailto:')) {
        // mailto: links, add as is
        links.push(href);
      }
      // Fragment-only links (#) are ignored
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}
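
A quick sketch of the resolution rules above, assuming the page being scraped is https://example.com/blog (the sample HTML and URLs are illustrative, not part of this commit):

  const sampleHtml = `
    <a href="https://other.com/page">absolute</a>
    <a href="/pricing">root-relative</a>
    <a href="faq">relative</a>
    <a href="#top">fragment</a>
    <a href="mailto:hi@example.com">email</a>
  `;
  extractLinks(sampleHtml, "https://example.com/blog");
  // => ["https://other.com/page", "https://example.com/pricing",
  //     "https://example.com/blog/faq", "mailto:hi@example.com"]

Note that purely relative hrefs are joined onto the full page URL with a slash, so a baseUrl that already ends in "/" would produce a doubled slash.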
export async function scrapSingleUrl(
  urlToScrap: string,
  pageOptions: PageOptions = {
@@ -234,7 +266,6 @@ export async function scrapSingleUrl(
      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }
    //* TODO: add an optional to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    return {
@@ -309,6 +340,10 @@ export async function scrapSingleUrl(
  const soup = cheerio.load(rawHtml);
  const metadata = extractMetadata(soup, urlToScrap);

  let linksOnPage: string[] | undefined;
  linksOnPage = extractLinks(rawHtml, urlToScrap);

  let document: Document;
  if (screenshot && screenshot.length > 0) {
    document = {
@@ -317,9 +352,10 @@ export async function scrapSingleUrl(
      html: pageOptions.includeHtml ? html : undefined,
      rawHtml:
        pageOptions.includeRawHtml ||
        extractorOptions.mode === "llm-extraction-from-raw-html"
          ? rawHtml
          : undefined,
      linksOnPage,
      metadata: {
        ...metadata,
        screenshot: screenshot,
@@ -335,7 +371,7 @@ export async function scrapSingleUrl(
      html: pageOptions.includeHtml ? html : undefined,
      rawHtml:
        pageOptions.includeRawHtml ||
        extractorOptions.mode === "llm-extraction-from-raw-html"
          ? rawHtml
          : undefined,
      metadata: {
@@ -344,6 +380,7 @@ export async function scrapSingleUrl(
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
      linksOnPage,
    };
  }
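
Either branch now attaches the same linksOnPage array to the document it returns, so a successful scrape yields an object shaped roughly like this (a sketch; the field values are illustrative):

  {
    content: "...",
    markdown: "...",
    linksOnPage: ["https://example.com/", "https://example.com/docs"],
    metadata: { sourceURL: "https://example.com", pageStatusCode: 200 },
  }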
@@ -354,6 +391,7 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
      linksOnPage: [],
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,