import { err, ok } from 'neverthrow';
import { z } from 'zod';
import type { Env } from '../index';
import { parseArticle } from './parsers';
import { tryCatchAsync } from './tryCatchAsync';
import { userAgents } from './utils';

/**
 * Schema for validating responses from the Cloudflare Browser Rendering API
 *
 * - `status` is coerced to a boolean
 * - `errors` is an optional list of `{ code, message }` entries
 * - `result` is the rendered page content as a string
 */
export const articleSchema = z.object({
  status: z.coerce.boolean(),
  errors: z.array(z.object({ code: z.number(), message: z.string() })).optional(),
  result: z.string(),
});

/**
 * Fetches an article using Cloudflare's Browser Rendering API
 *
 * This method simulates a real browser to handle modern websites with complex
 * JavaScript, cookie consent walls, paywalls, and other obstacles that might
 * prevent content scraping with a regular HTTP client.
 *
 * Failures are reported through the returned Result rather than thrown:
 * - `FETCH_ERROR`: the request itself failed, or the response body could not
 *   be read / decoded as JSON
 * - `VALIDATION_ERROR`: the JSON payload did not match `articleSchema`
 * - `PARSE_ERROR`: `parseArticle` rejected the rendered HTML
 *
 * NOTE(review): the HTTP status code and the `status` / `errors` fields of the
 * payload are not inspected here; an error payload without a string `result`
 * surfaces as `VALIDATION_ERROR`. Confirm this is the intended behavior.
 *
 * @param env Application environment with Cloudflare credentials
 * @param url URL of the article to fetch
 * @returns Result containing either the parsed article content or an error object
 */
export async function getArticleWithBrowser(env: Env, url: string) {
  const response = await tryCatchAsync(
    fetch(`https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/browser-rendering/content`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
      },
      body: JSON.stringify({
        url,
        userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],
        setExtraHTTPHeaders: {
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          Connection: 'keep-alive',
          DNT: '1',
          'Accept-Language': 'en-US,en;q=0.5',
          'Sec-Fetch-Dest': 'document',
          'Sec-Fetch-Mode': 'navigate',
          'Sec-Fetch-Site': 'none',
          'Sec-Fetch-User': '?1',
          'Upgrade-Insecure-Requests': '1',
        },
        cookies: [],
        gotoOptions: {
          waitUntil: 'networkidle0',
          timeout: 30000,
          referer: 'https://www.google.com/',
        },
        viewport: {
          width: 390,
          height: 844,
          deviceScaleFactor: 3,
          isMobile: true,
          hasTouch: true,
          isLandscape: false,
        },
        rejectResourceTypes: ['image', 'media', 'font', 'websocket'],
        bestAttempt: true,
        // all of these are very brittle, like all script tag usage
        // this mostly works for now but good to revisit every once in a while
        addScriptTag: [
          // Ensures consistent date formatting by overriding Intl.DateTimeFormat
          // to always use 'en-US' locale regardless of browser settings
          // This prevents inconsistent date parsing across different environments
          {
            content:
              "(() => { Object.defineProperty(Intl, 'DateTimeFormat', { \n writable: true, \n value: new Proxy(Intl.DateTimeFormat, { \n construct: (target, args) => new target('en-US', Object.assign({}, args[1])) \n })\n }); })();",
          },
          // Automatically accepts cookie consent popups by finding buttons or links whose
          // text contains 'accept' and 'cookie'/'consent', then programmatically clicking
          // the first match
          // This bypasses cookie walls that would otherwise block content access
          {
            content:
              "(() => { const cookieButtons = Array.from(document.querySelectorAll(\'button, a\')).filter(el => el.textContent.toLowerCase().includes(\'accept\') && (el.textContent.toLowerCase().includes(\'cookie\') || el.textContent.toLowerCase().includes(\'consent\'))); if(cookieButtons.length > 0) { cookieButtons[0].click(); } })();",
          },
          // Circumvents paywalls by:
          // 1. Removing elements with paywall/subscribe identifiers in id/class
          // 2. Removing modal overlays and fixed position barriers
          // 3. Restoring normal page scroll behavior
          // This targets common paywall implementations across various sites
          {
            content:
              "(() => { const paywallElements = Array.from(document.querySelectorAll(\'div, section\')).filter(el => el.id.toLowerCase().includes(\'paywall\') || el.className.toLowerCase().includes(\'paywall\') || el.id.toLowerCase().includes(\'subscribe\') || el.className.toLowerCase().includes(\'subscribe\')); paywallElements.forEach(el => el.remove()); document.querySelectorAll(\'.modal, .modal-backdrop, body > div[style*=\"position: fixed\"]\').forEach(el => el.remove()); document.body.style.overflow = \'auto\'; })();",
          },
          // Cleans up the DOM by removing non-content elements that interfere with article parsing:
          // - Scripts, styles, iframes that might contain tracking or ads
          // - Ad containers and advertisement blocks
          // - Social media widgets and sharing buttons
          // - Comments sections, navbars, headers, footers (except those within articles)
          // - Various UI elements not relevant to the core article content
          {
            content:
              '(() => { document.querySelectorAll(\'script, style, iframe, .ad, .ads, .advertisement, [class*="social"], [id*="social"], .share, .comments, aside, nav, header:not(article header), footer:not(article footer), [role="complementary"], [role="banner"], [role="navigation"], form, .related, .recommended, .newsletter, .subscription\').forEach(el => el.remove()); })();',
          },
          // Simplifies the DOM by stripping all HTML attributes except essential ones:
          // - href: preserves links
          // - src: maintains images and embedded content
          // - alt: keeps accessibility text for images
          // - title: retains tooltip text
          // This reduces noise and potential tracking parameters in the parsed content
          {
            content:
              "(() => { const keepAttributes = [\'href\', \'src\', \'alt\', \'title\']; document.querySelectorAll(\'*\').forEach(el => { [...el.attributes].forEach(attr => { if (!keepAttributes.includes(attr.name.toLowerCase())) { el.removeAttribute(attr.name); }}); }); })();",
          },
          // Repeatedly removes empty elements to clean up the DOM structure
          // Continues running passes until a pass removes nothing
          // This eliminates spacing artifacts and layout containers that serve no content purpose
          {
            content:
              "(() => { function removeEmpty() { let removed = 0; document.querySelectorAll(\'div, span, p, section, article\').forEach(el => { if (!el.hasChildNodes() || el.textContent.trim() === \'\') { el.remove(); removed++; } }); return removed; } let pass; do { pass = removeEmpty(); } while(pass > 0); })();",
          },
          // Removes simple meta tags that provide minimal information value
          // Meta tags with at most one attribute are typically not useful for content analysis
          // This helps reduce noise in the document head
          {
            content:
              "(() => { document.querySelectorAll(\'meta\').forEach(meta => { if (meta.attributes.length <= 1) { meta.remove(); } }); })();",
          },
        ],
        waitForSelector: {
          selector: 'article, .article, .content, .post, #article, main',
          timeout: 5000,
        },
      }),
    })
  );
  if (response.isErr()) {
    return err({ type: 'FETCH_ERROR', error: response.error });
  }

  // Decoding the body can reject (non-JSON payload, connection dropped mid-body).
  // Route that through the Result as well so this function never throws to its caller.
  const payload = await tryCatchAsync(response.value.json());
  if (payload.isErr()) {
    return err({ type: 'FETCH_ERROR', error: payload.error });
  }

  const parsedPageContent = articleSchema.safeParse(payload.value);
  if (parsedPageContent.success === false) {
    return err({ type: 'VALIDATION_ERROR', error: parsedPageContent.error });
  }

  const articleResult = parseArticle({ html: parsedPageContent.data.result });
  if (articleResult.isErr()) {
    return err({ type: 'PARSE_ERROR', error: articleResult.error });
  }

  return ok(articleResult.value);
}

/**
 * Fetches an article using a simple HTTP request
 *
 * This is a lighter-weight alternative to browser rendering that works for
 * simpler websites that don't rely heavily on client-side JavaScript for content.
 *
 * Failures are reported through the returned Result rather than thrown:
 * - `FETCH_ERROR`: the request failed, or the response body could not be read
 * - `PARSE_ERROR`: `parseArticle` rejected the downloaded HTML
 *
 * NOTE(review): the HTTP status code is not checked, so the body of a non-2xx
 * response is handed to the parser as-is. Confirm this is the intended behavior.
 *
 * @param url URL of the article to fetch
 * @returns Result containing either the parsed article content or an error object
 */
export async function getArticleWithFetch(url: string) {
  const response = await tryCatchAsync(
    fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        Referer: 'https://www.google.com/',
      },
    })
  );
  if (response.isErr()) {
    return err({ type: 'FETCH_ERROR', error: response.error });
  }

  // Reading the body can reject after the headers arrived (e.g. the connection
  // drops mid-stream); report it as a fetch failure instead of throwing.
  const html = await tryCatchAsync(response.value.text());
  if (html.isErr()) {
    return err({ type: 'FETCH_ERROR', error: html.error });
  }

  const articleResult = parseArticle({ html: html.value });
  if (articleResult.isErr()) {
    return err({ type: 'PARSE_ERROR', error: articleResult.error });
  }

  return ok(articleResult.value);
}