// yunlonggong's picture
// Initial project upload
// 1b44660
import { err, ok } from 'neverthrow';
import { z } from 'zod';
import type { Env } from '../index';
import { parseArticle } from './parsers';
import { tryCatchAsync } from './tryCatchAsync';
import { userAgents } from './utils';
/**
* Schema for validating responses from the Cloudflare Browser Rendering API
*/
export const articleSchema = z.object({
status: z.coerce.boolean(),
errors: z.array(z.object({ code: z.number(), message: z.string() })).optional(),
result: z.string(),
});
/**
* Fetches an article using Cloudflare's Browser Rendering API
*
* This method simulates a real browser to handle modern websites with complex
* JavaScript, cookie consent walls, paywalls, and other obstacles that might
* prevent content scraping with a regular HTTP client.
*
* @param env Application environment with Cloudflare credentials
* @param url URL of the article to fetch
* @returns Result containing either the parsed article content or an error object
*/
export async function getArticleWithBrowser(env: Env, url: string) {
const response = await tryCatchAsync(
fetch(`https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/browser-rendering/content`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
},
body: JSON.stringify({
url,
userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],
setExtraHTTPHeaders: {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
Connection: 'keep-alive',
DNT: '1',
'Accept-Language': 'en-US,en;q=0.5',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
},
cookies: [],
gotoOptions: {
waitUntil: 'networkidle0',
timeout: 30000,
referer: 'https://www.google.com/',
},
viewport: {
width: 390,
height: 844,
deviceScaleFactor: 3,
isMobile: true,
hasTouch: true,
isLandscape: false,
},
rejectResourceTypes: ['image', 'media', 'font', 'websocket'],
bestAttempt: true,
// all of these are very brittle, like all script tag usage
// this mostly works for now but good to revisit every once in a while
addScriptTag: [
// Ensures consistent date formatting by overriding Intl.DateTimeFormat
// to always use 'en-US' locale regardless of browser settings
// This prevents inconsistent date parsing across different environments
{
content:
"(() => { Object.defineProperty(Intl, 'DateTimeFormat', { \n writable: true, \n value: new Proxy(Intl.DateTimeFormat, { \n construct: (target, args) => new target('en-US', Object.assign({}, args[1])) \n })\n }); })();",
},
// Automatically accepts cookie consent popups by finding buttons that contain
// 'accept' and 'cookie'/'consent' text, then programmatically clicking the first match
// This bypasses cookie walls that would otherwise block content access
{
content:
"(() => { const cookieButtons = Array.from(document.querySelectorAll(\'button, a\')).filter(el => el.textContent.toLowerCase().includes(\'accept\') && (el.textContent.toLowerCase().includes(\'cookie\') || el.textContent.toLowerCase().includes(\'consent\'))); if(cookieButtons.length > 0) { cookieButtons[0].click(); } })();",
},
// Circumvents paywalls by:
// 1. Removing elements with paywall/subscribe identifiers in id/class
// 2. Removing modal overlays and fixed position barriers
// 3. Restoring normal page scroll behavior
// This targets common paywall implementations across various sites
{
content:
"(() => { const paywallElements = Array.from(document.querySelectorAll(\'div, section\')).filter(el => el.id.toLowerCase().includes(\'paywall\') || el.className.toLowerCase().includes(\'paywall\') || el.id.toLowerCase().includes(\'subscribe\') || el.className.toLowerCase().includes(\'subscribe\')); paywallElements.forEach(el => el.remove()); document.querySelectorAll(\'.modal, .modal-backdrop, body > div[style*=\"position: fixed\"]\').forEach(el => el.remove()); document.body.style.overflow = \'auto\'; })();",
},
// Cleans up the DOM by removing non-content elements that interfere with article parsing:
// - Scripts, styles, iframes that might contain tracking or ads
// - Ad containers and advertisement blocks
// - Social media widgets and sharing buttons
// - Comments sections, navbars, headers, footers (except those within articles)
// - Various UI elements not relevant to the core article content
{
content:
'(() => { document.querySelectorAll(\'script, style, iframe, .ad, .ads, .advertisement, [class*="social"], [id*="social"], .share, .comments, aside, nav, header:not(article header), footer:not(article footer), [role="complementary"], [role="banner"], [role="navigation"], form, .related, .recommended, .newsletter, .subscription\').forEach(el => el.remove()); })();',
},
// Simplifies the DOM by stripping all HTML attributes except essential ones:
// - href: preserves links
// - src: maintains images and embedded content
// - alt: keeps accessibility text for images
// - title: retains tooltip text
// This reduces noise and potential tracking parameters in the parsed content
{
content:
"(() => { const keepAttributes = [\'href\', \'src\', \'alt\', \'title\']; document.querySelectorAll(\'*\').forEach(el => { [...el.attributes].forEach(attr => { if (!keepAttributes.includes(attr.name.toLowerCase())) { el.removeAttribute(attr.name); }}); }); })();",
},
// Recursively removes empty elements to clean up the DOM structure
// Continues removing elements until no more empty ones are found
// This eliminates spacing artifacts and layout containers that serve no content purpose
{
content:
"(() => { function removeEmpty() { let removed = 0; document.querySelectorAll(\'div, span, p, section, article\').forEach(el => { if (!el.hasChildNodes() || el.textContent.trim() === \'\') { el.remove(); removed++; } }); return removed; } let pass; do { pass = removeEmpty(); } while(pass > 0); })();",
},
// Removes simple meta tags that provide minimal information value
// Meta tags with only one attribute are typically not useful for content analysis
// This helps reduce noise in the document head
{
content:
"(() => { document.querySelectorAll(\'meta\').forEach(meta => { if (meta.attributes.length <= 1) { meta.remove(); } }); })();",
},
],
waitForSelector: {
selector: 'article, .article, .content, .post, #article, main',
timeout: 5000,
},
}),
})
);
if (response.isErr()) {
return err({ type: 'FETCH_ERROR', error: response.error });
}
const parsedPageContent = articleSchema.safeParse(await response.value.json());
if (parsedPageContent.success === false) {
return err({ type: 'VALIDATION_ERROR', error: parsedPageContent.error });
}
const articleResult = parseArticle({ html: parsedPageContent.data.result });
if (articleResult.isErr()) {
return err({ type: 'PARSE_ERROR', error: articleResult.error });
}
return ok(articleResult.value);
}
/**
* Fetches an article using a simple HTTP request
*
* This is a lighter-weight alternative to browser rendering that works for
* simpler websites that don't rely heavily on client-side JavaScript for content.
*
* @param url URL of the article to fetch
* @returns Result containing either the parsed article content or an error object
*/
export async function getArticleWithFetch(url: string) {
const response = await tryCatchAsync(
fetch(url, {
method: 'GET',
headers: {
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
Referer: 'https://www.google.com/',
},
})
);
if (response.isErr()) {
return err({ type: 'FETCH_ERROR', error: response.error });
}
const articleResult = parseArticle({ html: await response.value.text() });
if (articleResult.isErr()) {
return err({ type: 'PARSE_ERROR', error: articleResult.error });
}
return ok(articleResult.value);
}