WIP: Cookie fixes

This commit is contained in:
Harsh Gupta 2024-08-15 21:01:49 +05:30
parent bb3f6b3199
commit 12850d79c7
2 changed files with 83 additions and 2 deletions

View File

@ -9,6 +9,11 @@ import { AsyncContext, CloudHTTPv2, FirebaseStorageBucketControl, Logger, Output
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import { CookieParam as PuppeteerCookieParam } from 'puppeteer';
type CookieParam = Omit<PuppeteerCookieParam, 'expires'> & {
expires?: number | string;
};
const pNormalizeUrl = import("@esm2cjs/normalize-url");
// import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown';
@ -879,7 +884,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,
cookies: opts.setCookies,
cookies: this.validateCookies(opts.setCookies),
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
removeSelector: opts.removeSelector,
targetSelector: opts.targetSelector,
@ -892,6 +897,56 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return crawlOpts;
}
/**
 * Validates caller-supplied cookies and normalises them for Puppeteer.
 *
 * Each cookie is checked independently; a cookie that fails any check is
 * dropped and a warning is logged, so one malformed cookie does not discard
 * the rest.
 *
 * `expires` normalisation:
 *  - a number is passed through unchanged (NOTE(review): Puppeteer documents
 *    `expires` as Unix time in seconds — confirm callers supply seconds);
 *  - any other truthy value (e.g. a date string) is parsed with `new Date()`
 *    and converted from milliseconds to seconds;
 *  - a falsy value results in `expires: undefined`.
 *
 * @param cookies Cookies to validate; may be `undefined`.
 * @returns `undefined` when no cookie list was given, otherwise a new array
 *          holding only the cookies that passed validation, each with a
 *          numeric (or undefined) `expires`.
 */
validateCookies(cookies?: CookieParam[]): PuppeteerCookieParam[] | undefined {
    if (!cookies) return undefined;

    const validCookies: PuppeteerCookieParam[] = [];

    for (const cookie of cookies) {
        if (!cookie.name || !cookie.value) {
            this.logger.warn(`Invalid cookie: missing name or value`, { cookie });
            continue;
        }

        let expires: number | undefined;
        if (cookie.expires) {
            // Numbers are kept as-is; running them through `new Date(n) / 1000`
            // would reinterpret them as milliseconds and shrink them 1000x.
            expires = typeof cookie.expires === 'number'
                ? cookie.expires
                : Number(new Date(cookie.expires)) / 1000;
            if (isNaN(expires)) {
                this.logger.warn(`Invalid cookie: invalid expires date`, { cookie });
                continue;
            }
        }

        if (cookie.domain && typeof cookie.domain !== 'string') {
            this.logger.warn(`Invalid cookie: domain must be a string`, { cookie });
            continue;
        }

        if (cookie.path && typeof cookie.path !== 'string') {
            this.logger.warn(`Invalid cookie: path must be a string`, { cookie });
            continue;
        }

        if (cookie.secure !== undefined && typeof cookie.secure !== 'boolean') {
            this.logger.warn(`Invalid cookie: secure must be a boolean`, { cookie });
            continue;
        }

        if (cookie.httpOnly !== undefined && typeof cookie.httpOnly !== 'boolean') {
            this.logger.warn(`Invalid cookie: httpOnly must be a boolean`, { cookie });
            continue;
        }

        if (cookie.sameSite && !['Strict', 'Lax', 'None'].includes(cookie.sameSite)) {
            this.logger.warn(`Invalid cookie: sameSite must be 'Strict', 'Lax', or 'None'`, { cookie });
            continue;
        }

        // Collect the normalised copy. The previous implementation returned
        // this object from an Array.prototype.filter callback, where it only
        // acted as a truthy flag, so the un-normalised input was what callers
        // actually received. Spreading `cookie` keeps any additional fields
        // that were passed through before.
        validCookies.push({ ...cookie, expires });
    }

    return validCookies;
}
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
const it = this.scrap(url, { ...opts, minIntervalMs: 500 });

View File

@ -191,7 +191,7 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
static override from<T extends CrawlerOptions>(this: Constructor<T>, input: any, ...args: any[]): T {
const instance = super.from(input, ...args) as T;
const req = args[0] as Request | undefined;
if (req) {
console.log('Request headers:', req.headers);
@ -203,74 +203,92 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
const customMode = getHeader('X-Respond-With') || getHeader('X-Return-Format');
if (customMode) {
instance.respondWith = customMode;
console.log('Set respondWith:', instance.respondWith);
}
const withGeneratedAlt = getHeader('X-With-Generated-Alt');
if (withGeneratedAlt !== undefined) {
instance.withGeneratedAlt = withGeneratedAlt.toLowerCase() === 'true';
console.log('Set withGeneratedAlt:', instance.withGeneratedAlt);
}
const withLinksSummary = getHeader('x-with-links-summary');
if (withLinksSummary !== undefined) {
instance.withLinksSummary = Boolean(withLinksSummary);
console.log('Set withLinksSummary:', instance.withLinksSummary);
}
const withImagesSummary = getHeader('x-with-images-summary');
if (withImagesSummary !== undefined) {
instance.withImagesSummary = Boolean(withImagesSummary);
console.log('Set withImagesSummary:', instance.withImagesSummary);
}
const noCache = getHeader('x-no-cache');
if (noCache !== undefined) {
instance.noCache = Boolean(noCache);
console.log('Set noCache:', instance.noCache);
}
if (instance.noCache && instance.cacheTolerance === undefined) {
instance.cacheTolerance = 0;
console.log('Set cacheTolerance to 0 due to noCache');
}
let cacheTolerance = parseInt(getHeader('x-cache-tolerance') || '');
if (!isNaN(cacheTolerance)) {
instance.cacheTolerance = cacheTolerance;
console.log('Set cacheTolerance:', instance.cacheTolerance);
}
let timeoutSeconds = parseInt(getHeader('x-timeout') || '');
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
console.log('Set timeout:', instance.timeout);
} else if (getHeader('x-timeout')) {
instance.timeout = null;
console.log('Set timeout to null');
}
const removeSelector = getHeader('x-remove-selector')?.split(', ');
instance.removeSelector ??= removeSelector;
console.log('Set removeSelector:', instance.removeSelector);
const targetSelector = getHeader('x-target-selector')?.split(', ');
instance.targetSelector ??= targetSelector;
console.log('Set targetSelector:', instance.targetSelector);
const waitForSelector = getHeader('x-wait-for-selector')?.split(', ');
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
console.log('Set waitForSelector:', instance.waitForSelector);
instance.targetSelector = filterSelector(instance.targetSelector);
console.log('Filtered targetSelector:', instance.targetSelector);
const overrideUserAgent = getHeader('x-user-agent');
instance.userAgent ??= overrideUserAgent;
console.log('Set userAgent:', instance.userAgent);
const keepImgDataUrl = getHeader('x-keep-img-data-url');
if (keepImgDataUrl !== undefined) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
console.log('Set keepImgDataUrl:', instance.keepImgDataUrl);
}
const withIframe = getHeader('x-with-iframe');
if (withIframe !== undefined) {
instance.withIframe = Boolean(withIframe);
console.log('Set withIframe:', instance.withIframe);
}
if (instance.withIframe) {
instance.timeout ??= null;
console.log('Set timeout to null due to withIframe');
}
const cookies: CookieParam[] = [];
const setCookieHeaders = getHeader('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
console.log('SetCookieHeaders:', setCookieHeaders);
if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) {
cookies.push({
@ -282,12 +300,20 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
});
}
console.log('Set cookies:', cookies); // Cool
if (cookies.length > 0) {
instance.setCookies = cookies;
console.log('Set setCookies:', instance.setCookies);
}
const proxyUrl = getHeader('x-proxy-url');
instance.proxyUrl ??= proxyUrl;
console.log('Set proxyUrl:', instance.proxyUrl);
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
console.log('Adjusted cacheTolerance:', instance.cacheTolerance);
}
}