Nicolas 2024-08-09 14:07:46 -04:00
parent 920b7f2f44
commit e28c415cf4
4 changed files with 180 additions and 4 deletions


@@ -55,8 +55,9 @@ kill_timeout = '30s'
   soft_limit = 20
 [[vm]]
-  size = 'performance-1x'
+  size = 'performance-2x'
   processes = ['app','worker']
+  memory = 8192


@@ -35,7 +35,7 @@ export async function startWebScraperPipeline({
       if (partialDocs.length > 50) {
         partialDocs = partialDocs.slice(-50);
       }
-      job.updateProgress({ ...progress, partialDocs: partialDocs });
+      // job.updateProgress({ ...progress, partialDocs: partialDocs });
     }
   },
   onSuccess: (result, mode) => {
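
For context, this onProgress callback caps partialDocs at the 50 most recent documents before reporting progress, and the commit disables the report itself (plausibly because large progress payloads are costly to persist on every update; that rationale is an assumption, not stated in the diff). A minimal sketch of the call being turned off, assuming BullMQ's Job.updateProgress and the shapes implied by the surrounding lines:

import { Job } from "bullmq";

// Progress shape assumed from the diff; the real type lives in the repo.
interface CrawlProgress {
  partialDocs?: unknown[];
  [key: string]: unknown;
}

// Trim partialDocs to the 50 most recent entries, then write the progress
// object onto the job. This updateProgress call is what the commit comments out.
async function reportProgress(job: Job, progress: CrawlProgress): Promise<void> {
  let partialDocs = progress.partialDocs ?? [];
  if (partialDocs.length > 50) {
    partialDocs = partialDocs.slice(-50);
  }
  await job.updateProgress({ ...progress, partialDocs });
}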

apps/api/src/run-req.ts (new file, 175 lines)

@@ -0,0 +1,175 @@
import axios from "axios";
import { promises as fs } from "fs";
import { v4 as uuidV4 } from "uuid";

interface Result {
  start_url: string;
  job_id?: string;
  idempotency_key?: string;
  result_data_jsonb?: any;
}

async function sendCrawl(result: Result): Promise<string | undefined> {
  const idempotencyKey = uuidV4();
  const url = result.start_url;
  try {
    const response = await axios.post(
      "https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
      {
        url: url,
        crawlerOptions: {
          limit: 75,
        },
        pageOptions: {
          includeHtml: true,
          replaceAllPathsWithAbsolutePaths: true,
          waitFor: 1000,
        },
      },
      {
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer `,
        },
      }
    );
    result.idempotency_key = idempotencyKey;
    return response.data.jobId;
  } catch (error) {
    console.error("Error sending crawl:", error);
    return undefined;
  }
}

async function getContent(result: Result): Promise<boolean> {
  let attempts = 0;
  // Poll the status endpoint for up to 120 attempts before giving up
  while (attempts < 120) {
    try {
      const response = await axios.get(
        `https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
        {
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer `,
          },
        }
      );
      if (response.data.status === "completed") {
        // Job actually completed
        result.result_data_jsonb = response.data.data;
        return true;
      }
    } catch (error) {
      console.error("Error getting content:", error);
    }
    // Sleep for a random 5-20 seconds between polls
    const randomSleep = Math.floor(Math.random() * 15000) + 5000;
    await new Promise((resolve) => setTimeout(resolve, randomSleep));
    attempts++;
  }
  // Set result as null if timed out
  result.result_data_jsonb = null;
  return false;
}

async function processResults(results: Result[]): Promise<void> {
  let processedCount = 0;
  let starterCount = 0;
  // The queue and the processed-URL set are only consumed by the
  // commented-out incremental logic below.
  const queue: Result[] = [];
  const processedUrls = new Set<string>();

  // Initialize the queue with the first 100 results
  for (let i = 0; i < Math.min(100, results.length); i++) {
    queue.push(results[i]);
    processedUrls.add(results[i].start_url);
  }

  // Function to process a single result
  const processSingleResult = async (result: Result) => {
    const jobId = await sendCrawl(result);
    if (jobId) {
      starterCount++;
      console.log(`Job requested count: ${starterCount}`);
      result.job_id = jobId;
      processedCount++;
      // Save the job id along with the start_url for every result so far
      try {
        const resultWithJobId = results.map((r) => ({
          start_url: r.start_url,
          job_id: r.job_id,
        }));
        await fs.writeFile(
          "results_with_job_id_4000_6000.json",
          JSON.stringify(resultWithJobId, null, 4)
        );
      } catch (error) {
        console.error("Error writing to results_with_job_id_4000_6000.json:", error);
      }
      // Add a new result to the queue if there are more results to process
      // if (processedCount < results.length) {
      //   for (let i = queue.length; i < results.length; i++) {
      //     if (!processedUrls.has(results[i].start_url)) {
      //       const nextResult = results[i];
      //       console.log("Next result:", nextResult.start_url);
      //       queue.push(nextResult);
      //       processedUrls.add(nextResult.start_url);
      //       console.log(`Queue length: ${queue.length}`);
      //       processSingleResult(nextResult);
      //       break;
      //     }
      //   }
      // }
    }
  };

  // Start processing the initial queue concurrently
  // for (let i = 0; i < queue.length; i++) {
  //   processSingleResult(queue[i]);
  //   if ((i + 1) % 500 === 0) {
  //     console.log(`Processed ${i + 1} results, waiting for 1 minute before adding the next batch...`);
  //     await new Promise(resolve => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  //   }
  // }
  // await Promise.all(queue.map(result => processSingleResult(result)));

  // Fire off the crawls in batches of 100, waiting 1 minute between batches
  for (let i = 0; i < results.length; i += 100) {
    const batch = results.slice(i, i + 100);
    Promise.all(batch.map((result) => processSingleResult(result)))
      .then(() => {
        console.log(`Processed ${i + 100} results.`);
      })
      .catch((error) => {
        console.error(`Error processing batch starting at index ${i}:`, error);
      });
    await new Promise((resolve) => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  }
}

async function getStartUrls(): Promise<Result[]> {
  try {
    const data = await fs.readFile("starturls.json", "utf-8");
    return JSON.parse(data);
  } catch (error) {
    console.error("Error reading starturls.json:", error);
    return [];
  }
}

async function main() {
  // Take entries 4000-6000 from the input file
  const results: Result[] = (await getStartUrls()).slice(3999, 6000);
  // console.log(results.map((r) => r.start_url).slice(0, 3));

  processResults(results)
    .then(() => {
      console.log("All results processed.");
    })
    .catch((error) => {
      console.error("Error processing results:", error);
    });
}

main();
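
The script expects a starturls.json in the working directory: a JSON array of objects, each with at least a start_url field (the Result interface above). A hypothetical snippet to generate a minimal input for a dry run; note that main() slices indices 3999-5999, so a real input needs 4000+ entries:

import { promises as fs } from "fs";

// Hypothetical helper: writes starturls.json in the shape run-req.ts reads.
async function writeSampleStartUrls(): Promise<void> {
  const sample = [
    { start_url: "https://example.com" },
    { start_url: "https://example.org" },
  ];
  await fs.writeFile("starturls.json", JSON.stringify(sample, null, 2));
}

writeSampleStartUrls().catch(console.error);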


@@ -51,9 +51,9 @@ const processJobInternal = async (token: string, job: Job) => {
   try {
     const result = await processJob(job, token);
-    const jobState = await job.getState();
-    if (jobState !== "completed" && jobState !== "failed") {
+    try{
       await job.moveToCompleted(result.docs, token, false);
+    }catch(e){
     }
   } catch (error) {
     console.log("Job failed, error:", error);