broadfield-dev committed
Commit 41abbcb · verified · Parent(s): 9271377

Update rss_processor.py

Files changed (1):
  1. rss_processor.py +77 -54
rss_processor.py CHANGED
@@ -39,62 +39,85 @@ def clean_text(text):
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
-    for feed_url in RSS_FEEDS:
-        try:
-            logger.info(f"Fetching {feed_url}")
-            feed = feedparser.parse(feed_url)
-            if feed.bozo:
-                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
+
+    try:
+        with open(FEEDS_FILE, 'r') as f:
+            feed_categories = json.load(f)
+    except FileNotFoundError:
+        logger.error(f"{FEEDS_FILE} not found. No feeds to process.")
+        return []
+
+    for category, feeds in feed_categories.items():
+        for feed_info in feeds:
+            feed_url = feed_info.get("url")
+            if not feed_url:
+                logger.warning(f"Skipping feed with no URL in category '{category}'")
                 continue
-            article_count = 0
-            for entry in feed.entries:
-                if article_count >= 10:
-                    break
-                title = entry.get("title", "No Title")
-                link = entry.get("link", "")
-                description = entry.get("summary", entry.get("description", ""))
-
-                cleaned_title = clean_text(title)
-                cleaned_link = clean_text(link)
-
-                published = "Unknown Date"
-                for date_field in ["published", "updated", "created", "pubDate"]:
-                    if date_field in entry:
-                        try:
-                            parsed_date = dateutil.parser.parse(entry[date_field])
-                            published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
-                            break
-                        except (ValueError, TypeError):
-                            continue
-
-                key = f"{cleaned_title}|{cleaned_link}|{published}"
-                if key not in seen_keys:
-                    seen_keys.add(key)
-                    image = "svg"
-                    for img_source in [
-                        lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
-                        lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
-                    ]:
-                        try:
-                            img = img_source(entry)
-                            if img and img.strip():
-                                image = img
+
+            try:
+                logger.info(f"Fetching {feed_url}")
+                feed = feedparser.parse(feed_url)
+                if feed.bozo:
+                    logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
+                    continue
+                article_count = 0
+                for entry in feed.entries:
+                    if article_count >= MAX_ARTICLES_PER_FEED:
+                        break
+
+                    title_raw = entry.get("title", "No Title")
+                    link = entry.get("link", "")
+                    description = entry.get("summary", entry.get("description", ""))
+
+                    clean_title_val = clean_text(title_raw)
+                    clean_desc_val = clean_text(description)
+
+                    if not clean_desc_val:
+                        continue
+
+                    published = "Unknown Date"
+                    for date_field in ["published", "updated", "created", "pubDate"]:
+                        if date_field in entry:
+                            try:
+                                parsed_date = dateutil.parser.parse(entry[date_field])
+                                published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
                                 break
-                        except (IndexError, AttributeError, TypeError):
-                            continue
-
-                    articles.append({
-                        "title": title,
-                        "link": link,
-                        "description": description,
-                        "published": published,
-                        "category": categorize_feed(feed_url),
-                        "image": image,
-                    })
-                    article_count += 1
-        except Exception as e:
-            logger.error(f"Error fetching {feed_url}: {e}")
-    logger.info(f"Total articles fetched: {len(articles)}")
+                            except (ValueError, TypeError):
+                                continue
+
+                    description_hash = hashlib.sha256(clean_desc_val.encode('utf-8')).hexdigest()
+                    key = f"{clean_title_val}|{link}|{published}|{description_hash}"
+
+                    if key not in seen_keys:
+                        seen_keys.add(key)
+
+                        image = "svg"
+                        for img_source in [
+                            lambda e: e.get("media_content", [{}])[0].get("url") if e.get("media_content") else "",
+                            lambda e: e.get("media_thumbnail", [{}])[0].get("url") if e.get("media_thumbnail") else "",
+                            lambda e: e.get("enclosure", {}).get("url") if e.get("enclosure") else "",
+                            lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), ""),
+                        ]:
+                            try:
+                                img = img_source(entry)
+                                if img and img.strip():
+                                    image = img
+                                    break
+                            except (IndexError, AttributeError, TypeError):
+                                continue
+
+                        articles.append({
+                            "title": title_raw,
+                            "link": link,
+                            "description": clean_desc_val,
+                            "published": published,
+                            "category": category,
+                            "image": image,
+                        })
+                        article_count += 1
+            except Exception as e:
+                logger.error(f"Error fetching {feed_url}: {e}")
+    logger.info(f"Total unique articles fetched: {len(articles)}")
     return articles
 
 def categorize_feed(url):
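
For reference, the loader at the top of the new fetch_rss_feeds() implies that FEEDS_FILE names a JSON object mapping category names to lists of feed records, each carrying a "url" field (inferred from feed_categories.items() and feed_info.get("url")). The sketch below writes such a file and rebuilds the new deduplication key; the filename feeds.json, the category names, and the sample URLs are assumptions for illustration, not part of the commit.

import hashlib
import json

# Hypothetical feeds file in the shape fetch_rss_feeds() iterates over:
# a dict of category -> list of {"url": ...} records.
sample_feeds = {
    "Technology": [
        {"url": "https://example.com/tech.rss"},
        {"url": "https://example.com/ai.rss"},
    ],
    "Science": [
        {"url": "https://example.com/science.rss"},
    ],
}
with open("feeds.json", "w") as f:  # assumed value of FEEDS_FILE
    json.dump(sample_feeds, f, indent=2)

# The commit widens the dedup key with a SHA-256 of the cleaned description,
# so two entries sharing title/link/date but differing in body text are both kept.
clean_title_val = "Example headline"
link = "https://example.com/post"
published = "2024-01-01 00:00:00"
clean_desc_val = "Example description text."
description_hash = hashlib.sha256(clean_desc_val.encode('utf-8')).hexdigest()
key = f"{clean_title_val}|{link}|{published}|{description_hash}"
print(key)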