From d39d3be64938082b6fb19e367b1d852f7844c442 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 18:38:03 -0700
Subject: [PATCH] Caleb: now extracting and returning a list of all links on
 the page for a customer

---
 apps/api/src/lib/entities.ts                  |  4 +-
 apps/api/src/scraper/WebScraper/single_url.ts | 44 +++++++++++++++++--
 2 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 089d373c..f60e197f 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -89,7 +89,8 @@ export class Document {
   warning?: string;
 
   index?: number;
-
+  linksOnPage?: string[]; // Add this new field as a separate property
+  
   constructor(data: Partial<Document>) {
     if (!data.content) {
       throw new Error("Missing required fields");
@@ -102,6 +103,7 @@ export class Document {
     this.markdown = data.markdown || "";
     this.childrenLinks = data.childrenLinks || undefined;
     this.provider = data.provider || undefined;
+    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
   }
 }
 
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index d24e5c2e..0aef2577 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
+/**
+ * Collects the unique link targets of every `<a href>` found in `html`.
+ *
+ * Absolute http(s) and `mailto:` hrefs are kept verbatim. Every other href is
+ * resolved against `baseUrl` by the URL parser, so `../x`, `//host/x` and
+ * page-relative paths resolve as a browser would. Empty, fragment-only and
+ * unparsable hrefs, and other schemes (`javascript:`, `tel:`), are skipped.
+ *
+ * @param html - Markup to scan for anchors.
+ * @param baseUrl - URL of the page; relative hrefs are resolved against it.
+ * @returns De-duplicated links, in order of first appearance.
+ */
+function extractLinks(html: string, baseUrl: string): string[] {
+  const $ = cheerio.load(html);
+  const links = new Set<string>();
+  $('a').each((_, element) => {
+    const href = ($(element).attr('href') || '').trim();
+    if (!href || href.startsWith('#')) return;
+    if (/^(https?:\/\/|mailto:)/i.test(href)) {
+      links.add(href);
+      return;
+    }
+    try {
+      const resolved = new URL(href, baseUrl);
+      if (/^https?:$/.test(resolved.protocol)) links.add(resolved.href);
+    } catch {
+      // Unparsable href (or base URL): drop the link, keep scraping.
+    }
+  });
+  return [...links];
+}
+
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -234,7 +266,6 @@ export async function scrapSingleUrl(
       scraperResponse.text = customScrapedContent.html;
       screenshot = customScrapedContent.screenshot;
     }
-
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
     return {
@@ -309,6 +340,10 @@ export async function scrapSingleUrl(
     const soup = cheerio.load(rawHtml);
     const metadata = extractMetadata(soup, urlToScrap);
 
+    let linksOnPage: string[] | undefined;
+
+    linksOnPage = extractLinks(rawHtml, urlToScrap);
+
     let document: Document;
     if (screenshot && screenshot.length > 0) {
       document = {
@@ -317,9 +352,10 @@ export async function scrapSingleUrl(
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
             ? rawHtml
             : undefined,
+        linksOnPage,
         metadata: {
           ...metadata,
           screenshot: screenshot,
@@ -335,7 +371,7 @@ export async function scrapSingleUrl(
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
             ? rawHtml
             : undefined,
         metadata: {
@@ -344,6 +380,7 @@ export async function scrapSingleUrl(
           pageStatusCode: pageStatusCode,
           pageError: pageError,
         },
+        linksOnPage,
       };
     }
 
@@ -354,6 +391,7 @@ export async function scrapSingleUrl(
       content: "",
       markdown: "",
       html: "",
+      linksOnPage: [],
       metadata: {
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,