feat(js-sdk): add crawlUrlAndWatch

This commit is contained in:
Gergő Móricz 2024-08-29 20:01:16 +02:00
parent d4001e4528
commit 53018a683f
5 changed files with 206 additions and 5 deletions

View File

@ -29,5 +29,21 @@ if (job.data) {
console.log(job.data[0].markdown);
}
// Map a website:
// The SDK method is `mapUrl` (there is no `map` method on FirecrawlApp).
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// Crawl a website with WebSockets:
// crawlUrlAndWatch starts the crawl and resolves to a watcher that emits
// "document", "error" and "done" events as the job progresses.
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
// Fired for every document the crawl produces; the document is in `detail`.
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);
});
// Fired when the crawl or the socket fails; `detail.error` holds the message.
watch.addEventListener("error", err => {
console.error("ERR", err.detail.error);
});
// Fired once the crawl has finished; `detail.status` holds the final status.
watch.addEventListener("done", state => {
console.log("DONE", state.detail.status);
});

View File

@ -32,8 +32,24 @@ const main = async () => {
console.log(checkStatus.data[0].markdown);
}
// Map a website:
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// Crawl a website with WebSockets:
// crawlUrlAndWatch starts the crawl and resolves to a watcher that emits
// "document", "error" and "done" events as the job progresses.
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
// Fired for every document the crawl produces; the document is in `detail`.
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);
});
// Fired when the crawl or the socket fails; `detail.error` holds the message.
watch.addEventListener("error", err => {
console.error("ERR", err.detail.error);
});
// Fired once the crawl has finished; `detail.status` holds the final status.
watch.addEventListener("done", state => {
console.log("DONE", state.detail.status);
});
}
main()

View File

@ -1,16 +1,18 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.36",
"version": "1.0.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@mendable/firecrawl-js",
"version": "0.0.36",
"version": "1.0.3",
"license": "MIT",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
@ -2137,6 +2139,20 @@
"integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
"dev": true
},
"node_modules/isows": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz",
"integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/wagmi-dev"
}
],
"peerDependencies": {
"ws": "*"
}
},
"node_modules/istanbul-lib-coverage": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz",
@ -3733,6 +3749,11 @@
"node": ">=14.17"
}
},
"node_modules/typescript-event-target": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz",
"integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg=="
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
@ -3855,6 +3876,27 @@
"node": "^12.13.0 || ^14.15.0 || >=16.0.0"
}
},
"node_modules/ws": {
"version": "8.18.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
"peer": true,
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.0.3",
"version": "1.0.4",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/cjs/index.js",
"types": "types/index.d.ts",
@ -30,6 +30,8 @@
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"

View File

@ -1,6 +1,8 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target";
/**
* Configuration interface for FirecrawlApp.
@ -315,8 +317,8 @@ export interface SearchResponseV0 {
* Provides methods for scraping, searching, crawling, and mapping web content.
*/
export default class FirecrawlApp<T extends "v0" | "v1"> {
private apiKey: string;
private apiUrl: string;
public apiKey: string;
public apiUrl: string;
public version: T;
/**
@ -561,6 +563,21 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
} as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse);
}
/**
 * Starts a crawl job and returns a watcher that reports its progress
 * over a WebSocket.
 *
 * @param url - The URL to crawl.
 * @param params - Additional crawl parameters, forwarded to `crawlUrl`.
 * @param idempotencyKey - Optional idempotency key, forwarded to `crawlUrl`.
 * @returns A `CrawlWatcher` bound to the id of the newly started crawl.
 * @throws Error if this client targets v0, or if the crawl response
 *         does not carry a job id.
 */
async crawlUrlAndWatch(
  url: string,
  params?: this['version'] extends 'v0' ? CrawlParamsV0 : CrawlParams,
  idempotencyKey?: string,
) {
  if (this.version === 'v0') {
    throw new Error("crawlUrlAndWatch is only available on v1");
  }

  // NOTE(review): `false, 0` are presumably crawlUrl's "wait until done" flag
  // and poll interval, so the call returns as soon as the job is created —
  // confirm against crawlUrl's signature.
  const crawl = await this.crawlUrl(url, params, false, 0, idempotencyKey);

  // v0 was rejected above, so only the v1 response shape can reach this point.
  const id = (crawl as CrawlResponse).id;
  if (!id) {
    // Without an id the watcher would connect to ".../v1/crawl/undefined".
    throw new Error("Crawl job failed to start: no job id was returned");
  }

  return new CrawlWatcher(id, this as FirecrawlApp<"v1">);
}
async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
if (this.version == 'v0') {
throw new Error("Map is not supported in v0");
@ -696,3 +713,111 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
}
}
}
/**
 * Event map for `CrawlWatcher`: each key is an event name and each value is
 * the `CustomEvent` type delivered to listeners of that event.
 */
interface CrawlWatcherEvents {
  /** A single crawled document; the document itself is the event `detail`. */
  document: CustomEvent<FirecrawlDocument>;

  /** The crawl finished; carries the watcher's status and collected documents. */
  done: CustomEvent<{
    status: CrawlStatusResponse["status"];
    data: FirecrawlDocument[];
  }>;

  /** The crawl or the socket failed; `error` is a human-readable message. */
  error: CustomEvent<{
    status: CrawlStatusResponse["status"];
    data: FirecrawlDocument[];
    error: string;
  }>;
}
/**
 * Follows a single v1 crawl job over a WebSocket and re-emits its progress
 * as typed events.
 *
 * Events:
 * - `document`: one per crawled document (including documents replayed by a
 *   "catchup" message).
 * - `done`: the server reported completion; carries the final status and all
 *   documents collected so far.
 * - `error`: the server reported an error, the socket errored, or a frame
 *   could not be parsed.
 */
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
  private ws: WebSocket;
  /** Every document received so far, in arrival order. */
  public data: FirecrawlDocument[];
  /** Last known status of the crawl; starts as "scraping". */
  public status: CrawlStatusResponse["status"];

  /**
   * Opens the WebSocket for the given crawl and wires up the handlers.
   *
   * @param id - Id of the crawl job to watch.
   * @param app - v1 client whose `apiUrl` and `apiKey` are used to connect.
   */
  constructor(id: string, app: FirecrawlApp<"v1">) {
    super();

    // The API base URL is HTTP(S); map it onto the matching ws:// / wss://
    // scheme so the socket works with implementations that reject http URLs.
    const wsBaseUrl = app.apiUrl.replace(/^http/, "ws");
    this.ws = new WebSocket(`${wsBaseUrl}/v1/crawl/${id}`, app.apiKey);
    this.status = "scraping";
    this.data = [];

    type ErrorMessage = {
      type: "error",
      error: string,
    }
    type CatchupMessage = {
      type: "catchup",
      data: CrawlStatusResponse,
    }
    type DocumentMessage = {
      type: "document",
      data: FirecrawlDocument,
    }
    type DoneMessage = { type: "done" }
    type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;

    // Dispatches one "document" event for the given document.
    const emitDocument = (doc: FirecrawlDocument) => {
      this.dispatchTypedEvent("document", new CustomEvent("document", {
        detail: doc,
      }));
    };

    // Marks the crawl as failed and notifies "error" listeners.
    const fail = (error: string) => {
      this.status = "failed";
      this.dispatchTypedEvent("error", new CustomEvent("error", {
        detail: {
          status: this.status,
          data: this.data,
          error,
        },
      }));
    };

    // Parses a raw frame; returns undefined instead of throwing when the
    // payload is not a JSON object.
    const parseMessage = (raw: string): Message | undefined => {
      try {
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed !== "object" || parsed === null) {
          return undefined;
        }
        return parsed as Message;
      } catch {
        return undefined;
      }
    };

    const messageHandler = (msg: Message) => {
      switch (msg.type) {
        case "done":
          this.status = "completed";
          this.dispatchTypedEvent("done", new CustomEvent("done", {
            detail: {
              status: this.status,
              data: this.data,
            },
          }));
          break;
        case "error":
          fail(msg.error);
          break;
        case "catchup": {
          // Replay only the documents carried by this message, so documents
          // that were already announced are not dispatched a second time.
          const docs = msg.data.data ?? [];
          this.status = msg.data.status;
          this.data.push(...docs);
          for (const doc of docs) {
            emitDocument(doc);
          }
          break;
        }
        case "document":
          // Record the document so the "done"/"error" payloads include it.
          this.data.push(msg.data);
          emitDocument(msg.data);
          break;
        default:
          // Unknown message types are ignored.
          break;
      }
    };

    this.ws.onmessage = (ev: MessageEvent) => {
      // Only text frames are understood; anything else ends the session.
      if (typeof ev.data !== "string") {
        this.ws.close();
        return;
      }
      const msg = parseMessage(ev.data);
      if (msg === undefined) {
        fail("Malformed WebSocket message");
        this.ws.close();
        return;
      }
      messageHandler(msg);
    };

    this.ws.onclose = (ev: CloseEvent) => {
      // The close reason may carry a final JSON message; it is empty on a
      // plain close and may be arbitrary text, so never let parsing throw.
      if (!ev.reason) {
        return;
      }
      const msg = parseMessage(ev.reason);
      if (msg !== undefined) {
        messageHandler(msg);
      }
    };

    this.ws.onerror = (_: Event) => {
      fail("WebSocket error");
    };
  }

  /** Closes the underlying WebSocket. */
  close() {
    this.ws.close();
  }
}