yunlonggong's picture
Initial project upload
1b44660
import { Readability } from '@mozilla/readability';
import { XMLParser } from 'fast-xml-parser';
import { parseHTML } from 'linkedom';
import { Result, err, ok } from 'neverthrow';
import { z } from 'zod';
// Validation schema for one normalized feed item as returned by parseRSSFeed:
// a non-empty title, a link string, and a publication date that is either a
// Date or null.
// NOTE(review): parseRSSFeed also builds an `id` for every item, but `id` is not
// declared here. zod objects strip undeclared keys by default, so `id` presumably
// never reaches callers — confirm whether that is intended.
const rssFeedSchema = z.object({
title: z.string().min(1),
link: z.string(),
pubDate: z.date().nullable(),
});
/**
 * Normalizes whitespace in free text.
 *
 * - runs of spaces/tabs become a single space
 * - horizontal whitespace touching a newline is removed
 * - three or more consecutive newlines are reduced to two, so a paragraph
 *   break (one blank line) survives
 * - leading and trailing whitespace is trimmed
 *
 * @param text Raw text to normalize
 * @returns The normalized text
 */
function cleanString(text: string) {
  return (
    text
      .replace(/[ \t]+/g, ' ') // collapse spaces/tabs
      // `[^\S\n]` is "any whitespace except newline". Plain `\s` would also
      // swallow the following newlines, flattening every blank line and
      // making the max-two-newlines step below unreachable.
      .replace(/\n[^\S\n]+/g, '\n') // clean spaces after newlines
      .replace(/[^\S\n]+\n/g, '\n') // clean spaces (and \r) before newlines
      .replace(/\n{3,}/g, '\n\n') // keep max 2 consecutive newlines
      .trim() // clean edges
  );
}
/**
 * Removes common tracking query parameters (utm_*, fbclid, gclid) from a URL.
 *
 * Input that cannot be parsed as an absolute URL (empty string, relative path,
 * placeholder such as 'UNKNOWN') is returned unchanged instead of throwing, so
 * callers that pass best-effort link text do not crash.
 *
 * @param url The URL text to clean
 * @returns The serialized URL without tracking parameters, or the input as-is
 *          when it is not a parseable absolute URL
 */
function cleanUrl(url: string) {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // `new URL` throws a TypeError for anything that is not an absolute URL.
    return url;
  }

  const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid'];
  for (const param of paramsToRemove) {
    parsed.searchParams.delete(param);
  }
  return parsed.toString();
}
/**
 * Converts a scalar produced by the XML parser to a string.
 * Returns null for anything that is not a string, number or boolean.
 */
function feedScalarToString(value: unknown): string | null {
  if (typeof value === 'string') return value;
  if (typeof value === 'number' || typeof value === 'boolean') return String(value);
  return null;
}

/**
 * Reads the text of a parsed XML node: either a bare scalar, or an object
 * carrying a non-empty `#text` (the shape used for elements with attributes).
 */
function feedNodeText(node: unknown): string | null {
  const direct = feedScalarToString(node);
  if (direct !== null) return direct;
  if (typeof node === 'object' && node !== null) {
    const inner = feedScalarToString((node as Record<string, unknown>)['#text']);
    if (inner) return inner;
  }
  return null;
}

/**
 * Picks an href from an attribute-style link node or an array of such nodes.
 * Prefers a link with rel="alternate" or no rel; otherwise the first href found.
 */
function feedLinkHref(node: unknown): string | null {
  const candidates: unknown[] = Array.isArray(node) ? node : [node];
  let firstHref: string | null = null;
  for (const candidate of candidates) {
    if (typeof candidate !== 'object' || candidate === null) continue;
    const attrs = candidate as Record<string, unknown>;
    const href = attrs['@_href'];
    if (typeof href !== 'string' || href === '') continue;
    const rel = attrs['@_rel'];
    if (rel === undefined || rel === 'alternate') return href;
    if (firstHref === null) firstHref = href;
  }
  return firstHref;
}

/**
 * Parses an RSS/XML feed content to extract article information
 *
 * Accepts items found under `rss.channel.item`, `feed.entry`, a root-level
 * `item`, or `rdf:RDF.item`, and normalizes each one to a title, a link and a
 * publication date. Missing titles/links become the placeholder 'UNKNOWN';
 * missing or unparseable dates become null.
 *
 * Item-level oddities are handled without throwing: numeric text content
 * (fast-xml-parser converts numeric-looking text to numbers by default) is
 * stringified, and a link that is not a parseable absolute URL is kept as
 * cleaned text rather than aborting the whole feed.
 *
 * @param xml The XML content of the RSS feed as a string
 * @returns A Result containing either an array of parsed feed items or an error
 *          ("Parse error: ..." when the XML cannot be parsed,
 *          "Validation error: ..." when an item fails schema validation)
 */
export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> {
  const safeParser = Result.fromThrowable(
    (xml: string) => new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_' }).parse(xml),
    e => (e instanceof Error ? e : new Error(String(e)))
  );

  const parsedXml = safeParser(xml);
  if (parsedXml.isErr()) {
    return err(new Error(`Parse error: ${parsedXml.error.message}`));
  }
  const result = parsedXml.value;

  // handle various feed structures
  let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || [];
  // handle single item case (the parser yields an object, not an array, for one child)
  items = Array.isArray(items) ? items : [items];

  // biome-ignore lint/suspicious/noExplicitAny: the XML parser output is untyped
  const properItems = items.map((item: any) => {
    const title = feedNodeText(item?.title) ?? 'UNKNOWN';

    // Link resolution order: plain-text link, attribute-style href (single node
    // or array of nodes), plain-text guid, placeholder.
    let link = 'UNKNOWN';
    if (typeof item?.link === 'string' && item.link.trim() !== '') {
      link = item.link;
    } else {
      const href = feedLinkHref(item?.link);
      if (href !== null) {
        link = href;
      } else if (typeof item?.guid === 'string' && item.guid.trim() !== '') {
        link = item.guid;
      }
    }

    // NOTE(review): rssFeedSchema does not declare `id`, so this value is
    // presumably dropped during validation below — confirm intent.
    const id = feedNodeText(item?.guid) ?? 'UNKNOWN';

    // First string among the RSS, Atom and Dublin Core (RDF) date fields.
    const pubDateString: string | null =
      [item?.pubDate, item?.published, item?.updated, item?.['dc:date']].find(
        (candidate): candidate is string => typeof candidate === 'string'
      ) ?? null;

    let pubDate: Date | null = null;
    if (pubDateString) {
      const parsedDate = new Date(pubDateString);
      pubDate = Number.isNaN(parsedDate.getTime()) ? null : parsedDate;
    }

    // URL cleanup is best-effort: placeholders and relative links are not
    // parseable as absolute URLs, and must not abort the whole feed.
    let cleanedLink = cleanString(link);
    try {
      cleanedLink = cleanUrl(cleanedLink);
    } catch {
      // keep the whitespace-cleaned link text as-is
    }

    return {
      title: cleanString(title),
      link: cleanedLink,
      id: cleanString(id),
      pubDate,
    };
  });

  // standardize the items
  const parsedItems = z.array(rssFeedSchema).safeParse(properItems);
  if (parsedItems.success === false) {
    return err(new Error(`Validation error: ${parsedItems.error.message}`));
  }

  return ok(parsedItems.data);
}
/**
 * Extracts the main article from an HTML document.
 *
 * The markup is turned into a DOM with linkedom and passed to Mozilla
 * Readability. Any exception thrown along the way is captured and reported as
 * an error value instead of propagating.
 *
 * @param opts Object containing the HTML content to parse
 * @returns ok({ title, text, publishedTime }) on success, where `text` is the
 *          whitespace-normalized article text and `publishedTime` is undefined
 *          when Readability reports none; otherwise err({ type, error }) with
 *          type 'READABILITY_ERROR' (extraction threw) or 'NO_ARTICLE_FOUND'
 *          (no result, or a result without title or text)
 */
export function parseArticle(opts: { html: string }) {
  const toError = (cause: unknown) => (cause instanceof Error ? cause : new Error(String(cause)));

  const extractArticle = Result.fromThrowable((html: string) => {
    const { document: dom } = parseHTML(html);
    return new Readability(dom).parse();
  }, toError);

  const extraction = extractArticle(opts.html);
  if (extraction.isErr()) {
    return err({ type: 'READABILITY_ERROR', error: extraction.error });
  }

  const article = extraction.value;

  // Without both a title and body text there is nothing useful to return.
  if (!(article !== null && article.title && article.textContent)) {
    return err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') });
  }

  return ok({
    title: article.title,
    text: cleanString(article.textContent),
    publishedTime: article.publishedTime || undefined,
  });
}