import { Readability } from '@mozilla/readability';
import { XMLParser } from 'fast-xml-parser';
import { parseHTML } from 'linkedom';
import { Result, err, ok } from 'neverthrow';
import { z } from 'zod';
const rssFeedSchema = z.object({
  title: z.string().min(1),
  link: z.string(),
  id: z.string(),
  pubDate: z.date().nullable(),
});
function cleanString(text: string) {
  return text
    .replace(/[ \t]+/g, ' ') // collapse runs of spaces/tabs
    .replace(/\n[ \t]+/g, '\n') // strip spaces/tabs after newlines
    .replace(/[ \t]+\n/g, '\n') // strip spaces/tabs before newlines
    .replace(/\n{3,}/g, '\n\n') // keep max 2 consecutive newlines
    .trim(); // trim leading/trailing whitespace
}
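// Usage sketch (illustrative): collapses runs of whitespace within lines while
// preserving paragraph breaks, e.g.
//   cleanString('  Hello   world  \n\n\n\nBye  ') === 'Hello world\n\nBye'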
function cleanUrl(url: string) {
  // new URL() throws on strings that are not valid URLs (e.g. the 'UNKNOWN' fallback
  // or a bare guid used as a link), so return those inputs unchanged instead of crashing
  let u: URL;
  try {
    u = new URL(url);
  } catch {
    return url;
  }
  const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid'];
  for (const param of paramsToRemove) {
    u.searchParams.delete(param);
  }
  return u.toString();
}
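// Usage sketch (illustrative): strips common tracking parameters, e.g.
//   cleanUrl('https://example.com/post?id=1&utm_source=rss') === 'https://example.com/post?id=1'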
/**
 * Parses RSS/XML feed content to extract article information
 *
 * Handles RSS 2.0, Atom, and RDF/RSS 1.0 feed structures while normalizing the output.
 * Extracts titles, links, ids, and publication dates from the feed items.
 *
 * @param xml The XML content of the RSS feed as a string
 * @returns A Result containing either an array of parsed feed items or an error
 */
export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> {
  // wrap the throwing XML parser so failures surface as an err Result instead of an exception
  const safeParser = Result.fromThrowable(
    (xml: string) => new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_' }).parse(xml),
    e => (e instanceof Error ? e : new Error(String(e)))
  );
  const parsedXml = safeParser(xml);
  if (parsedXml.isErr()) {
    return err(new Error(`Parse error: ${parsedXml.error.message}`));
  }
  const result = parsedXml.value;
  // handle the common feed structures: RSS 2.0 (rss.channel.item), Atom (feed.entry),
  // bare items, and RDF/RSS 1.0 (rdf:RDF.item)
  let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || [];
  // a feed with a single entry parses as a bare object rather than an array
  items = Array.isArray(items) ? items : [items];
  // biome-ignore lint/suspicious/noExplicitAny: feed item shape varies across RSS/Atom/RDF
  const properItems = items.map((item: any) => {
    let title = '';
    let link = '';
    let id = '';
    let pubDateString: string | null = null;
    // titles may be plain strings or objects with a '#text' node (e.g. CDATA titles)
    if (typeof item.title === 'string') {
      title = item.title;
    } else if (typeof item.title === 'object' && item.title['#text']) {
      title = item.title['#text'];
    } else {
      title = 'UNKNOWN';
    }
    // Atom links carry the URL in an href attribute; fall back to the guid when <link> is missing
    if (typeof item.link === 'string') {
      link = item.link;
    } else if (typeof item.link === 'object' && item.link['@_href']) {
      link = item.link['@_href'];
    } else if (typeof item.guid === 'string') {
      link = item.guid;
    } else {
      link = 'UNKNOWN';
    }
    if (typeof item.guid === 'string') {
      id = item.guid;
    } else if (typeof item.guid === 'object' && item.guid['#text']) {
      id = item.guid['#text'];
    } else {
      id = 'UNKNOWN';
    }
    // RSS 2.0 uses pubDate; Atom uses published/updated
    if (typeof item.pubDate === 'string') {
      pubDateString = item.pubDate;
    } else if (typeof item.published === 'string') {
      pubDateString = item.published;
    } else if (typeof item.updated === 'string') {
      pubDateString = item.updated;
    }
    // unparseable or missing date strings normalize to null
    let pubDate: Date | null = null;
    if (pubDateString) {
      pubDate = new Date(pubDateString);
      if (Number.isNaN(pubDate.getTime())) {
        pubDate = null;
      }
    }
    return {
      title: cleanString(title),
      link: cleanUrl(cleanString(link)),
      id: cleanString(id),
      pubDate,
    };
  });
  // validate the mapped items against the schema (zod strips any unknown keys)
  const parsedItems = z.array(rssFeedSchema).safeParse(properItems);
  if (!parsedItems.success) {
    return err(new Error(`Validation error: ${parsedItems.error.message}`));
  }
  return ok(parsedItems.data);
}
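// Usage sketch (illustrative; the feed URL is a hypothetical placeholder):
//   const response = await fetch('https://example.com/feed.xml');
//   const feed = await parseRSSFeed(await response.text());
//   if (feed.isOk()) {
//     for (const item of feed.value) {
//       console.log(item.title, item.link, item.pubDate?.toISOString() ?? 'no date');
//     }
//   } else {
//     console.error(feed.error.message);
//   }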
/**
 * Parses HTML content to extract the main article text and metadata
 *
 * Uses Mozilla Readability to identify and extract the main content
 * from an HTML document, ignoring navigation, ads, and other non-content elements.
 *
 * @param opts Object containing the HTML content to parse
 * @returns A Result containing either the parsed article data or a typed error
 *          object ('READABILITY_ERROR' or 'NO_ARTICLE_FOUND')
 */
export function parseArticle(opts: { html: string }) {
  // Readability can throw on malformed documents; wrap it so failures surface as an err Result
  const safeReadability = Result.fromThrowable(
    (html: string) => new Readability(parseHTML(html).document).parse(),
    e => (e instanceof Error ? e : new Error(String(e)))
  );
  const articleResult = safeReadability(opts.html);
  if (articleResult.isErr()) {
    return err({ type: 'READABILITY_ERROR', error: articleResult.error });
  }
  // if we can't parse the article or there is no article, not much we can do
  const article = articleResult.value;
  if (article === null || !article.title || !article.textContent) {
    return err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') });
  }
  return ok({
    title: article.title,
    text: cleanString(article.textContent),
    publishedTime: article.publishedTime || undefined,
  });
}
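// Usage sketch (illustrative; the HTML string stands in for a fetched page body):
//   const result = parseArticle({ html: '<html><body><article>…</article></body></html>' });
//   result.match(
//     article => console.log(article.title, article.text.slice(0, 100)),
//     error => console.error(error.type, error.error.message)
//   );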