Spaces:
Running
Running
import { err, ok } from 'neverthrow'; | |
import { z } from 'zod'; | |
import type { Env } from '../index'; | |
import { parseArticle } from './parsers'; | |
import { tryCatchAsync } from './tryCatchAsync'; | |
import { userAgents } from './utils'; | |
/** | |
* Schema for validating responses from the Cloudflare Browser Rendering API | |
*/ | |
export const articleSchema = z.object({ | |
status: z.coerce.boolean(), | |
errors: z.array(z.object({ code: z.number(), message: z.string() })).optional(), | |
result: z.string(), | |
}); | |
/** | |
* Fetches an article using Cloudflare's Browser Rendering API | |
* | |
* This method simulates a real browser to handle modern websites with complex | |
* JavaScript, cookie consent walls, paywalls, and other obstacles that might | |
* prevent content scraping with a regular HTTP client. | |
* | |
* @param env Application environment with Cloudflare credentials | |
* @param url URL of the article to fetch | |
* @returns Result containing either the parsed article content or an error object | |
*/ | |
export async function getArticleWithBrowser(env: Env, url: string) { | |
const response = await tryCatchAsync( | |
fetch(`https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/browser-rendering/content`, { | |
method: 'POST', | |
headers: { | |
'Content-Type': 'application/json', | |
Authorization: `Bearer ${env.CLOUDFLARE_API_TOKEN}`, | |
}, | |
body: JSON.stringify({ | |
url, | |
userAgent: userAgents[Math.floor(Math.random() * userAgents.length)], | |
setExtraHTTPHeaders: { | |
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', | |
'Accept-Encoding': 'gzip, deflate, br', | |
Connection: 'keep-alive', | |
DNT: '1', | |
'Accept-Language': 'en-US,en;q=0.5', | |
'Sec-Fetch-Dest': 'document', | |
'Sec-Fetch-Mode': 'navigate', | |
'Sec-Fetch-Site': 'none', | |
'Sec-Fetch-User': '?1', | |
'Upgrade-Insecure-Requests': '1', | |
}, | |
cookies: [], | |
gotoOptions: { | |
waitUntil: 'networkidle0', | |
timeout: 30000, | |
referer: 'https://www.google.com/', | |
}, | |
viewport: { | |
width: 390, | |
height: 844, | |
deviceScaleFactor: 3, | |
isMobile: true, | |
hasTouch: true, | |
isLandscape: false, | |
}, | |
rejectResourceTypes: ['image', 'media', 'font', 'websocket'], | |
bestAttempt: true, | |
// all of these are very brittle, like all script tag usage | |
// this mostly works for now but good to revisit every once in a while | |
addScriptTag: [ | |
// Ensures consistent date formatting by overriding Intl.DateTimeFormat | |
// to always use 'en-US' locale regardless of browser settings | |
// This prevents inconsistent date parsing across different environments | |
{ | |
content: | |
"(() => { Object.defineProperty(Intl, 'DateTimeFormat', { \n writable: true, \n value: new Proxy(Intl.DateTimeFormat, { \n construct: (target, args) => new target('en-US', Object.assign({}, args[1])) \n })\n }); })();", | |
}, | |
// Automatically accepts cookie consent popups by finding buttons that contain | |
// 'accept' and 'cookie'/'consent' text, then programmatically clicking the first match | |
// This bypasses cookie walls that would otherwise block content access | |
{ | |
content: | |
"(() => { const cookieButtons = Array.from(document.querySelectorAll(\'button, a\')).filter(el => el.textContent.toLowerCase().includes(\'accept\') && (el.textContent.toLowerCase().includes(\'cookie\') || el.textContent.toLowerCase().includes(\'consent\'))); if(cookieButtons.length > 0) { cookieButtons[0].click(); } })();", | |
}, | |
// Circumvents paywalls by: | |
// 1. Removing elements with paywall/subscribe identifiers in id/class | |
// 2. Removing modal overlays and fixed position barriers | |
// 3. Restoring normal page scroll behavior | |
// This targets common paywall implementations across various sites | |
{ | |
content: | |
"(() => { const paywallElements = Array.from(document.querySelectorAll(\'div, section\')).filter(el => el.id.toLowerCase().includes(\'paywall\') || el.className.toLowerCase().includes(\'paywall\') || el.id.toLowerCase().includes(\'subscribe\') || el.className.toLowerCase().includes(\'subscribe\')); paywallElements.forEach(el => el.remove()); document.querySelectorAll(\'.modal, .modal-backdrop, body > div[style*=\"position: fixed\"]\').forEach(el => el.remove()); document.body.style.overflow = \'auto\'; })();", | |
}, | |
// Cleans up the DOM by removing non-content elements that interfere with article parsing: | |
// - Scripts, styles, iframes that might contain tracking or ads | |
// - Ad containers and advertisement blocks | |
// - Social media widgets and sharing buttons | |
// - Comments sections, navbars, headers, footers (except those within articles) | |
// - Various UI elements not relevant to the core article content | |
{ | |
content: | |
'(() => { document.querySelectorAll(\'script, style, iframe, .ad, .ads, .advertisement, [class*="social"], [id*="social"], .share, .comments, aside, nav, header:not(article header), footer:not(article footer), [role="complementary"], [role="banner"], [role="navigation"], form, .related, .recommended, .newsletter, .subscription\').forEach(el => el.remove()); })();', | |
}, | |
// Simplifies the DOM by stripping all HTML attributes except essential ones: | |
// - href: preserves links | |
// - src: maintains images and embedded content | |
// - alt: keeps accessibility text for images | |
// - title: retains tooltip text | |
// This reduces noise and potential tracking parameters in the parsed content | |
{ | |
content: | |
"(() => { const keepAttributes = [\'href\', \'src\', \'alt\', \'title\']; document.querySelectorAll(\'*\').forEach(el => { [...el.attributes].forEach(attr => { if (!keepAttributes.includes(attr.name.toLowerCase())) { el.removeAttribute(attr.name); }}); }); })();", | |
}, | |
// Recursively removes empty elements to clean up the DOM structure | |
// Continues removing elements until no more empty ones are found | |
// This eliminates spacing artifacts and layout containers that serve no content purpose | |
{ | |
content: | |
"(() => { function removeEmpty() { let removed = 0; document.querySelectorAll(\'div, span, p, section, article\').forEach(el => { if (!el.hasChildNodes() || el.textContent.trim() === \'\') { el.remove(); removed++; } }); return removed; } let pass; do { pass = removeEmpty(); } while(pass > 0); })();", | |
}, | |
// Removes simple meta tags that provide minimal information value | |
// Meta tags with only one attribute are typically not useful for content analysis | |
// This helps reduce noise in the document head | |
{ | |
content: | |
"(() => { document.querySelectorAll(\'meta\').forEach(meta => { if (meta.attributes.length <= 1) { meta.remove(); } }); })();", | |
}, | |
], | |
waitForSelector: { | |
selector: 'article, .article, .content, .post, #article, main', | |
timeout: 5000, | |
}, | |
}), | |
}) | |
); | |
if (response.isErr()) { | |
return err({ type: 'FETCH_ERROR', error: response.error }); | |
} | |
const parsedPageContent = articleSchema.safeParse(await response.value.json()); | |
if (parsedPageContent.success === false) { | |
return err({ type: 'VALIDATION_ERROR', error: parsedPageContent.error }); | |
} | |
const articleResult = parseArticle({ html: parsedPageContent.data.result }); | |
if (articleResult.isErr()) { | |
return err({ type: 'PARSE_ERROR', error: articleResult.error }); | |
} | |
return ok(articleResult.value); | |
} | |
/** | |
* Fetches an article using a simple HTTP request | |
* | |
* This is a lighter-weight alternative to browser rendering that works for | |
* simpler websites that don't rely heavily on client-side JavaScript for content. | |
* | |
* @param url URL of the article to fetch | |
* @returns Result containing either the parsed article content or an error object | |
*/ | |
export async function getArticleWithFetch(url: string) { | |
const response = await tryCatchAsync( | |
fetch(url, { | |
method: 'GET', | |
headers: { | |
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], | |
Referer: 'https://www.google.com/', | |
}, | |
}) | |
); | |
if (response.isErr()) { | |
return err({ type: 'FETCH_ERROR', error: response.error }); | |
} | |
const articleResult = parseArticle({ html: await response.value.text() }); | |
if (articleResult.isErr()) { | |
return err({ type: 'PARSE_ERROR', error: articleResult.error }); | |
} | |
return ok(articleResult.value); | |
} | |