File size: 4,980 Bytes
1b44660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import { Readability } from '@mozilla/readability';
import { XMLParser } from 'fast-xml-parser';
import { parseHTML } from 'linkedom';
import { Result, err, ok } from 'neverthrow';
import { z } from 'zod';

// Validation schema for a single normalized feed entry; parseRSSFeed validates
// its mapped items against an array of this shape before returning them.
const rssFeedSchema = z.object({
  title: z.string().min(1), // title must be a non-empty string
  link: z.string(), // any string is accepted here; no URL-format check is applied
  pubDate: z.date().nullable(), // a Date instance, or null when no usable date was found
});

/**
 * Normalizes whitespace in a block of text.
 *
 * Runs of spaces/tabs become a single space, whitespace hugging a line break
 * is removed, and blank-line runs are capped at one empty line (two
 * consecutive newlines) so paragraph breaks survive.
 *
 * @param text Raw text to normalize
 * @returns The normalized text with leading/trailing whitespace removed
 */
function cleanString(text: string): string {
  return (
    text
      .replace(/[ \t]+/g, ' ') // collapse spaces/tabs
      // `[^\S\n]` is "whitespace except newline": using plain `\s` here would
      // also swallow the newlines themselves and flatten every paragraph break.
      .replace(/\n[^\S\n]+/g, '\n') // clean spaces after newlines
      .replace(/[^\S\n]+\n/g, '\n') // clean spaces (and stray \r) before newlines
      .replace(/\n{3,}/g, '\n\n') // keep max 2 consecutive newlines
      .trim() // clean edges
  );
}

/**
 * Strips common tracking query parameters (UTM tags, fbclid, gclid) from a URL.
 *
 * @param url The URL to clean
 * @returns The serialized URL without tracking parameters. Input that is not a
 *   parseable absolute URL (empty string, relative path, placeholder text) is
 *   returned unchanged instead of throwing.
 */
function cleanUrl(url: string): string {
  const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid'];

  try {
    const u = new URL(url);
    for (const param of paramsToRemove) {
      u.searchParams.delete(param);
    }
    return u.toString();
  } catch {
    // `new URL` throws a TypeError on anything that is not an absolute URL;
    // there is nothing to strip in that case, so hand the input back as-is.
    return url;
  }
}

/**
 * Parses an RSS/Atom/RDF feed and extracts a normalized list of entries.
 *
 * The entry list is looked up under `rss.channel.item`, `feed.entry`, a
 * top-level `item`, or `rdf:RDF.item` (in that order). Each entry is reduced
 * to a title, a link and a publication date, then validated against
 * `rssFeedSchema`.
 *
 * Fallbacks: a missing title or link becomes the literal `'UNKNOWN'`; a
 * missing or unparsable date becomes `null`. A link that is not a parseable
 * absolute URL is kept as cleaned text rather than aborting the whole feed.
 *
 * @param xml The XML content of the feed as a string
 * @returns A Result containing the validated entries, or an Error whose
 *   message starts with `Parse error:` (XML parsing threw) or
 *   `Validation error:` (schema validation failed)
 */
export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> {
  const toError = (e: unknown) => (e instanceof Error ? e : new Error(String(e)));

  const safeParser = Result.fromThrowable(
    (source: string) =>
      new XMLParser({
        ignoreAttributes: false,
        attributeNamePrefix: '@_',
        // keep element text as strings: with the default value parsing a title
        // or guid such as "2024" is turned into a number and fails the
        // `typeof ... === 'string'` checks below
        parseTagValue: false,
      }).parse(source),
    toError
  );

  // cleanUrl relies on `new URL`, which throws for non-absolute input; wrapping
  // it keeps one bad link from rejecting the whole call
  const safeCleanUrl = Result.fromThrowable(cleanUrl, toError);

  // Picks the best link out of a string, a `<link href>` object, or an array of
  // either (Atom entries may carry several <link> elements). Links with
  // rel="alternate" or no rel win; any other href is only used as a fallback.
  const pickLink = (raw: unknown): string | null => {
    const candidates: unknown[] = Array.isArray(raw) ? raw : [raw];
    let fallback: string | null = null;

    for (const candidate of candidates) {
      if (typeof candidate === 'string') {
        if (candidate) return candidate;
        continue;
      }
      if (typeof candidate !== 'object' || candidate === null) continue;

      const attrs = candidate as Record<string, unknown>;
      const href = attrs['@_href'];
      if (typeof href !== 'string' || !href) continue;

      const rel = attrs['@_rel'];
      if (rel === undefined || rel === 'alternate') return href;
      if (fallback === null) fallback = href;
    }

    return fallback;
  };

  const parsedXml = safeParser(xml);
  if (parsedXml.isErr()) {
    return err(new Error(`Parse error: ${parsedXml.error.message}`));
  }

  const result = parsedXml.value;

  // handle various feed structures
  let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || [];

  // handle single item case (the parser yields an object, not an array, for a lone element)
  items = Array.isArray(items) ? items : [items];

  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  const properItems = items.map((item: any) => {
    let title = '';
    let link = '';
    let id = '';
    let pubDateString: string | null = null;

    // elements carrying attributes are parsed into objects with the text under '#text'
    if (typeof item.title === 'string') {
      title = item.title;
    } else if (typeof item.title === 'object' && item.title['#text']) {
      title = item.title['#text'];
    } else {
      title = 'UNKNOWN';
    }

    const pickedLink = pickLink(item.link);
    if (pickedLink !== null) {
      link = pickedLink;
    } else if (typeof item.guid === 'string') {
      link = item.guid;
    } else {
      link = 'UNKNOWN';
    }

    if (typeof item.guid === 'string') {
      id = item.guid;
    } else if (typeof item.guid === 'object' && item.guid['#text']) {
      id = item.guid['#text'];
    } else {
      id = 'UNKNOWN';
    }

    // RSS uses pubDate; Atom uses published/updated
    if (typeof item.pubDate === 'string') {
      pubDateString = item.pubDate;
    } else if (typeof item.published === 'string') {
      pubDateString = item.published;
    } else if (typeof item.updated === 'string') {
      pubDateString = item.updated;
    }

    let pubDate: Date | null = null;
    if (pubDateString) {
      pubDate = new Date(pubDateString);
      // an unparsable date string yields an Invalid Date; treat it as missing
      if (Number.isNaN(pubDate.getTime())) {
        pubDate = null;
      }
    }

    const cleanedLink = cleanString(link);

    return {
      title: cleanString(title),
      link: safeCleanUrl(cleanedLink).unwrapOr(cleanedLink),
      // NOTE(review): rssFeedSchema declares no `id` field, so this value is
      // presumably dropped by validation — confirm whether the schema should expose it
      id: cleanString(id),
      pubDate,
    };
  });

  // standardize the items
  const parsedItems = z.array(rssFeedSchema).safeParse(properItems);
  if (parsedItems.success === false) {
    return err(new Error(`Validation error: ${parsedItems.error.message}`));
  }

  return ok(parsedItems.data);
}

/**
 * Extracts the main article (title, body text, published time) from raw HTML.
 *
 * The HTML is turned into a DOM with linkedom and handed to Mozilla
 * Readability. Exceptions thrown along the way are captured rather than
 * propagated.
 *
 * @param opts Object containing the HTML content to parse
 * @returns `ok` with `{ title, text, publishedTime }`, or `err` with
 *   `{ type, error }` where `type` is `'READABILITY_ERROR'` (parsing threw) or
 *   `'NO_ARTICLE_FOUND'` (no result, or a result lacking a title or text)
 */
export function parseArticle(opts: { html: string }) {
  const toError = (cause: unknown) => (cause instanceof Error ? cause : new Error(String(cause)));
  const noArticle = () => err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') });

  const extract = Result.fromThrowable((markup: string) => {
    const { document } = parseHTML(markup);
    return new Readability(document).parse();
  }, toError);

  const extraction = extract(opts.html);
  if (extraction.isErr()) {
    return err({ type: 'READABILITY_ERROR', error: extraction.error });
  }

  // Readability returns null when it cannot identify an article at all
  const parsed = extraction.value;
  if (parsed === null) {
    return noArticle();
  }

  // an article without a title or body text is of no use to callers either
  const { title, textContent, publishedTime } = parsed;
  if (!title || !textContent) {
    return noArticle();
  }

  return ok({
    title,
    text: cleanString(textContent),
    publishedTime: publishedTime || undefined,
  });
}