import { Readability } from '@mozilla/readability';
import { XMLParser } from 'fast-xml-parser';
import { parseHTML } from 'linkedom';
import { Result, err, ok } from 'neverthrow';
import { z } from 'zod';

/** Schema of one normalized feed item as returned by `parseRSSFeed`. */
const rssFeedSchema = z.object({
  title: z.string().min(1),
  link: z.string(),
  pubDate: z.date().nullable(),
});

/** Coerces an arbitrary thrown value into an `Error` instance. */
function toError(e: unknown): Error {
  return e instanceof Error ? e : new Error(String(e));
}

/**
 * Normalizes whitespace in a piece of text.
 *
 * @param text The raw text to clean
 * @returns The text with runs of spaces/tabs collapsed, whitespace around
 *          newlines removed, at most two consecutive newlines, and trimmed edges
 */
function cleanString(text: string) {
  return text
    .replace(/[ \t]+/g, ' ') // collapse spaces/tabs
    .replace(/\n\s+/g, '\n') // clean spaces after newlines
    .replace(/\s+\n/g, '\n') // clean spaces before newlines
    .replace(/\n{3,}/g, '\n\n') // keep max 2 consecutive newlines
    .trim(); // clean edges
}

/**
 * Removes common tracking query parameters from a URL.
 *
 * @param url An absolute URL string
 * @returns The serialized URL without the tracking parameters
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor
 */
function cleanUrl(url: string) {
  const u = new URL(url);

  const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid'];
  for (const param of paramsToRemove) {
    u.searchParams.delete(param);
  }

  return u.toString();
}

/**
 * Extracts title, link, id and publication date from one raw feed item.
 *
 * Missing title/link/id values fall back to the placeholder 'UNKNOWN'.
 * May throw (for example when the link is not a parseable URL); the caller
 * is responsible for converting that into an error result.
 */
// biome-ignore lint/suspicious/noExplicitAny: feed items are untyped XML parser output
function normalizeFeedItem(item: any) {
  let title: string;
  let link: string;
  let id: string;
  let pubDateString: string | null = null;

  if (typeof item.title === 'string') {
    title = item.title;
  } else if (typeof item.title === 'object' && item.title['#text']) {
    // element with attributes: the text content lives under '#text'
    title = item.title['#text'];
  } else {
    title = 'UNKNOWN';
  }

  if (typeof item.link === 'string') {
    link = item.link;
  } else if (typeof item.link === 'object' && item.link['@_href']) {
    // link given as an href attribute rather than element text
    link = item.link['@_href'];
  } else if (typeof item.guid === 'string') {
    // no usable link: fall back to the guid
    link = item.guid;
  } else {
    link = 'UNKNOWN';
  }

  if (typeof item.guid === 'string') {
    id = item.guid;
  } else if (typeof item.guid === 'object' && item.guid['#text']) {
    id = item.guid['#text'];
  } else {
    id = 'UNKNOWN';
  }

  if (typeof item.pubDate === 'string') {
    pubDateString = item.pubDate;
  } else if (typeof item.published === 'string') {
    pubDateString = item.published;
  } else if (typeof item.updated === 'string') {
    pubDateString = item.updated;
  }

  let pubDate: Date | null = null;
  if (pubDateString) {
    pubDate = new Date(pubDateString);
    // unparseable date strings yield an invalid Date; treat them as missing
    if (Number.isNaN(pubDate.getTime())) {
      pubDate = null;
    }
  }

  // NOTE(review): `id` is not declared in rssFeedSchema, so it looks like it is
  // dropped from the validated output — confirm whether that is intended.
  return {
    title: cleanString(title),
    link: cleanUrl(cleanString(link)),
    id: cleanString(id),
    pubDate,
  };
}

/**
 * Parses an RSS/XML feed content to extract article information
 *
 * Handles various RSS feed formats and structures while normalizing the output.
 * Extracts titles, links, and publication dates from the feed items.
 *
 * Failures while parsing the XML, normalizing an item (for example a link that
 * is not a valid URL) or validating the result are all reported through the
 * returned Result rather than by rejecting the promise.
 *
 * @param xml The XML content of the RSS feed as a string
 * @returns A Result containing either an array of parsed feed items or an error
 */
export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> {
  const safeParser = Result.fromThrowable(
    (source: string) => new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_' }).parse(source),
    toError
  );

  const parsedXml = safeParser(xml);
  if (parsedXml.isErr()) {
    return err(new Error(`Parse error: ${parsedXml.error.message}`));
  }
  const result = parsedXml.value;

  // handle various feed structures
  let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || [];

  // handle single item case
  items = Array.isArray(items) ? items : [items];

  // normalization can throw (e.g. `new URL` on a placeholder or relative link),
  // so run it through a throwable wrapper to keep the Result contract
  const safeNormalize = Result.fromThrowable(() => items.map(normalizeFeedItem), toError);
  const properItems = safeNormalize();
  if (properItems.isErr()) {
    return err(new Error(`Validation error: ${properItems.error.message}`));
  }

  // standardize the items
  const parsedItems = z.array(rssFeedSchema).safeParse(properItems.value);
  if (parsedItems.success === false) {
    return err(new Error(`Validation error: ${parsedItems.error.message}`));
  }

  return ok(parsedItems.data);
}

/**
 * Parses HTML content to extract article text and metadata
 *
 * Uses Mozilla Readability to identify and extract the main content
 * from an HTML document, ignoring navigation, ads, and other non-content elements.
 *
 * @param opts Object containing the HTML content to parse
 * @returns A Result containing either the parsed article data or an error object
 */
export function parseArticle(opts: { html: string }) {
  const safeReadability = Result.fromThrowable(
    (html: string) => new Readability(parseHTML(html).document).parse(),
    toError
  );

  const articleResult = safeReadability(opts.html);
  if (articleResult.isErr()) {
    return err({ type: 'READABILITY_ERROR', error: articleResult.error });
  }

  // if we can't parse the article or there is no article, not much we can do
  const article = articleResult.value;
  if (article === null || !article.title || !article.textContent) {
    return err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') });
  }

  return ok({
    title: article.title,
    text: cleanString(article.textContent),
    publishedTime: article.publishedTime || undefined,
  });
}