broadfield-dev committed
Commit a9254a4 · verified · 1 parent: ec48712

Update app.py

Files changed (1)
app.py  +215 -325
app.py CHANGED
@@ -1,355 +1,245 @@
  import os
- import threading
- from flask import Flask, render_template, request, jsonify
- from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
- import logging
- import time
- from datetime import datetime
- import hashlib
- import glob
  from langchain.vectorstores import Chroma
  from langchain.embeddings import HuggingFaceEmbeddings
-
- app = Flask(__name__)

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Global flag to track background loading
- loading_complete = True  # Start as True to allow initial rendering
- last_update_time = time.time()
- last_data_hash = None  # Track the hash of the last data to detect changes
-
- def load_feeds_in_background():
-     global loading_complete, last_update_time
-     try:
-         logger.info("Starting background RSS feed fetch")
-         articles = fetch_rss_feeds()
-         logger.info(f"Fetched {len(articles)} articles")
-         process_and_store_articles(articles)
-         last_update_time = time.time()
-         logger.info("Background feed processing complete")
-         upload_to_hf_hub()
-     except Exception as e:
-         logger.error(f"Error in background feed loading: {e}")
-     finally:
-         loading_complete = True
-
- def get_all_docs_from_dbs():
-     """Aggregate documents and metadata from all Chroma DB folders."""
-     all_docs = {'documents': [], 'metadatas': []}
-     seen_ids = set()
-
-     for db_path in glob.glob("chroma_db*"):
-         if not os.path.isdir(db_path):
-             continue
-         try:
-             temp_vector_db = Chroma(
-                 persist_directory=db_path,
-                 embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
-                 collection_name="news_articles"
-             )
-             db_data = temp_vector_db.get(include=['documents', 'metadatas'])
-             if db_data.get('documents') and db_data.get('metadatas'):
-                 for doc, meta in zip(db_data['documents'], db_data['metadatas']):
-                     doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
-                     if doc_id not in seen_ids:
-                         seen_ids.add(doc_id)
-                         all_docs['documents'].append(doc)
-                         all_docs['metadatas'].append(meta)
-         except Exception as e:
-             logger.error(f"Error loading DB {db_path}: {e}")
-
-     return all_docs
-
- def compute_data_hash(categorized_articles):
-     """Compute a hash of the current articles to detect changes."""
-     if not categorized_articles:
          return ""
-     # Create a sorted string representation of the articles for consistent hashing
-     data_str = ""
-     for cat, articles in sorted(categorized_articles.items()):
-         for article in sorted(articles, key=lambda x: x["published"]):
-             data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
-     return hashlib.sha256(data_str.encode('utf-8')).hexdigest()
-
- @app.route('/')
- def index():
-     global loading_complete, last_update_time, last_data_hash
-
-     # Check if any DB exists; if not, download from Hugging Face
-     db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
-     if not db_exists:
-         logger.info("No Chroma DB found, downloading from Hugging Face Hub...")
-         download_from_hf_hub()
-
-     # Start background RSS feed update
-     loading_complete = False
-     threading.Thread(target=load_feeds_in_background, daemon=True).start()
-
-     # Load existing data immediately
-     try:
-         all_docs = get_all_docs_from_dbs()
-         total_docs = len(all_docs['documents'])
-         logger.info(f"Total articles across all DBs at startup: {total_docs}")
-         if not all_docs.get('metadatas'):
-             logger.info("No articles in any DB yet")
-             return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
-
-         # Process and categorize articles with deduplication
-         enriched_articles = []
-         seen_keys = set()
-         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
-             if not meta:
-                 continue
-             title = meta.get("title", "No Title")
-             link = meta.get("link", "")
-             description = meta.get("original_description", "No Description")
-             published = meta.get("published", "Unknown Date").strip()
-
-             title = clean_text(title)
-             link = clean_text(link)
-             description = clean_text(description)
-
-             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-             key = f"{title}|{link}|{published}|{description_hash}"
-             if key not in seen_keys:
-                 seen_keys.add(key)
-                 try:
-                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
-                 except (ValueError, TypeError):
-                     published = "1970-01-01T00:00:00"
-                 enriched_articles.append({
-                     "title": title,
-                     "link": link,
-                     "description": description,
-                     "category": meta.get("category", "Uncategorized"),
-                     "published": published,
-                     "image": meta.get("image", "svg"),
-                 })
-
-         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
-
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             if cat not in categorized_articles:
-                 categorized_articles[cat] = []
-             categorized_articles[cat].append(article)
-
-         categorized_articles = dict(sorted(categorized_articles.items(), key=lambda x: x[0].lower()))
-
-         for cat in categorized_articles:
-             categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
-             if len(categorized_articles[cat]) >= 2:
-                 logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
-
-         # Compute initial data hash
-         last_data_hash = compute_data_hash(categorized_articles)
-
-         logger.info(f"Displaying articles at startup: {sum(len(articles) for articles in categorized_articles.values())} total")
-         return render_template("index.html",
-                                categorized_articles=categorized_articles,
-                                has_articles=True,
-                                loading=True)
-     except Exception as e:
-         logger.error(f"Error retrieving articles at startup: {e}")
-         return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
-
- @app.route('/search', methods=['POST'])
- def search():
-     query = request.form.get('search')
-     if not query:
-         logger.info("Empty search query received")
-         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
-
-     try:
-         logger.info(f"Searching for: {query}")
-         all_docs = get_all_docs_from_dbs()
-         if not all_docs.get('metadatas'):
-             return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
-
-         enriched_articles = []
-         seen_keys = set()
-         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
-             if not meta:
                  continue
-             title = meta.get("title", "No Title")
-             link = meta.get("link", "")
-             description = meta.get("original_description", "No Description")
-             published = meta.get("published", "Unknown Date").strip()

-             title = clean_text(title)
-             link = clean_text(link)
-             description = clean_text(description)
-
-             if query.lower() in title or query.lower() in description:
                  description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
                  key = f"{title}|{link}|{published}|{description_hash}"
                  if key not in seen_keys:
                      seen_keys.add(key)
-                     enriched_articles.append({
                          "title": title,
                          "link": link,
                          "description": description,
-                         "category": meta.get("category", "Uncategorized"),
                          "published": published,
-                         "image": meta.get("image", "svg"),
                      })
-
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             categorized_articles.setdefault(cat, []).append(article)
-
-         logger.info(f"Found {len(enriched_articles)} unique articles across {len(categorized_articles)} categories")
-         return jsonify({
-             "categorized_articles": categorized_articles,
-             "has_articles": bool(enriched_articles),
-             "loading": False
-         })
-     except Exception as e:
-         logger.error(f"Search error: {e}")
-         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False}), 500
-
- @app.route('/check_loading')
- def check_loading():
-     global loading_complete, last_update_time
-     if loading_complete:
-         return jsonify({"status": "complete", "last_update": last_update_time})
-     return jsonify({"status": "loading"}), 202
-
- @app.route('/get_updates')
- def get_updates():
-     global last_update_time, last_data_hash
      try:
-         all_docs = get_all_docs_from_dbs()
-         if not all_docs.get('metadatas'):
-             return jsonify({"articles": [], "last_update": last_update_time, "has_updates": False})
-
-         enriched_articles = []
-         seen_keys = set()
-         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
-             if not meta:
-                 continue
-             title = meta.get("title", "No Title")
-             link = meta.get("link", "")
-             description = meta.get("original_description", "No Description")
-             published = meta.get("published", "Unknown Date").strip()
-
-             title = clean_text(title)
-             link = clean_text(link)
-             description = clean_text(description)
-
              description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-             key = f"{title}|{link}|{published}|{description_hash}"
-             if key not in seen_keys:
-                 seen_keys.add(key)
-                 try:
-                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
-                 except (ValueError, TypeError):
-                     published = "1970-01-01T00:00:00"
-                 enriched_articles.append({
-                     "title": title,
-                     "link": link,
-                     "description": description,
-                     "category": meta.get("category", "Uncategorized"),
-                     "published": published,
-                     "image": meta.get("image", "svg"),
-                 })
-
-         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             if cat not in categorized_articles:
-                 categorized_articles[cat] = []
-             key = f"{article['title']}|{article['link']}|{article['published']}"
-             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
-                 categorized_articles[cat].append(article)
-
-         for cat in categorized_articles:
-             unique_articles = []
-             seen_cat_keys = set()
-             for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
-                 key = f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"
-                 if key not in seen_cat_keys:
-                     seen_cat_keys.add(key)
-                     unique_articles.append(article)
-             categorized_articles[cat] = unique_articles[:10]
-
-         # Compute hash of new data
-         current_data_hash = compute_data_hash(categorized_articles)
-
-         # Compare with last data hash to determine if there are updates
-         has_updates = last_data_hash != current_data_hash
-         if has_updates:
-             logger.info("New RSS data detected, sending updates to frontend")
-             last_data_hash = current_data_hash
-             return jsonify({
-                 "articles": categorized_articles,
-                 "last_update": last_update_time,
-                 "has_updates": True
-             })
-         else:
-             logger.info("No new RSS data, skipping update")
-             return jsonify({
-                 "articles": {},
-                 "last_update": last_update_time,
-                 "has_updates": False
-             })
-     except Exception as e:
-         logger.error(f"Error fetching updates: {e}")
-         return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False}), 500
-
- @app.route('/get_all_articles/<category>')
- def get_all_articles(category):
-     try:
-         all_docs = get_all_docs_from_dbs()
-         if not all_docs.get('metadatas'):
-             return jsonify({"articles": [], "category": category})
-
-         enriched_articles = []
-         seen_keys = set()
-         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
-             if not meta or meta.get("category") != category:
                  continue
-             title = meta.get("title", "No Title")
-             link = meta.get("link", "")
-             description = meta.get("original_description", "No Description")
-             published = meta.get("published", "Unknown Date").strip()

-             title = clean_text(title)
-             link = clean_text(link)
-             description = clean_text(description)
-
-             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-             key = f"{title}|{link}|{published}|{description_hash}"
-             if key not in seen_keys:
-                 seen_keys.add(key)
-                 try:
-                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
-                 except (ValueError, TypeError):
-                     published = "1970-01-01T00:00:00"
-                 enriched_articles.append({
-                     "title": title,
-                     "link": link,
-                     "description": description,
-                     "category": meta.get("category", "Uncategorized"),
-                     "published": published,
-                     "image": meta.get("image", "svg"),
-                 })

-         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
-         return jsonify({"articles": enriched_articles, "category": category})
      except Exception as e:
-         logger.error(f"Error fetching all articles for category {category}: {e}")
-         return jsonify({"articles": [], "category": category}), 500

- @app.route('/card')
- def card_load():
-     return render_template("card.html")

  if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860)

  import os
+ import feedparser
  from langchain.vectorstores import Chroma
  from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+ import logging
+ from huggingface_hub import HfApi, login, snapshot_download
+ import shutil
+ import rss_feeds
+ from datetime import datetime, date
+ import dateutil.parser
+ import hashlib
+ import re

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Constants
+ MAX_ARTICLES_PER_FEED = 10
+ RSS_FEEDS = rss_feeds.RSS_FEEDS
+ COLLECTION_NAME = "news_articles"
+ HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
+ REPO_ID = "broadfield-dev/news-rag-db"
+
+ # Initialize Hugging Face API
+ login(token=HF_API_TOKEN)
+ hf_api = HfApi()
+
+ def get_embedding_model():
+     """Returns a singleton instance of the embedding model to avoid reloading."""
+     if not hasattr(get_embedding_model, "model"):
+         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     return get_embedding_model.model
+
+ def get_daily_db_dir():
+     """Returns the path for today's Chroma DB."""
+     return f"chroma_db_{date.today().isoformat()}"
+
+ def clean_text(text):
+     """Clean text by removing HTML tags and extra whitespace."""
+     if not text or not isinstance(text, str):
          return ""
+     text = re.sub(r'<.*?>', '', text)
+     text = ' '.join(text.split())
+     return text.strip().lower()
+
+ def fetch_rss_feeds():
+     articles = []
+     seen_keys = set()
+     for feed_url in RSS_FEEDS:
+         try:
+             logger.info(f"Fetching {feed_url}")
+             feed = feedparser.parse(feed_url)
+             if feed.bozo:
+                 logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                  continue
+             article_count = 0
+             for entry in feed.entries:
+                 if article_count >= MAX_ARTICLES_PER_FEED:
+                     break
+                 title = entry.get("title", "No Title")
+                 link = entry.get("link", "")
+                 description = entry.get("summary", entry.get("description", ""))
+
+                 title = clean_text(title)
+                 link = clean_text(link)
+                 description = clean_text(description)
+
+                 published = "Unknown Date"
+                 for date_field in ["published", "updated", "created", "pubDate"]:
+                     if date_field in entry:
+                         try:
+                             parsed_date = dateutil.parser.parse(entry[date_field])
+                             published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
+                             break
+                         except (ValueError, TypeError) as e:
+                             logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
+                             continue

                  description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
                  key = f"{title}|{link}|{published}|{description_hash}"
                  if key not in seen_keys:
                      seen_keys.add(key)
+                     image = "svg"
+                     for img_source in [
+                         lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
+                         lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
+                         lambda e: clean_text(e.get("enclosure", {}).get("url")) if e.get("enclosure") else "",
+                         lambda e: clean_text(next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), "")),
+                     ]:
+                         try:
+                             img = img_source(entry)
+                             if img and img.strip():
+                                 image = img
+                                 break
+                         except (IndexError, AttributeError, TypeError):
+                             continue
+
+                     articles.append({
                          "title": title,
                          "link": link,
                          "description": description,
                          "published": published,
+                         "category": categorize_feed(feed_url),
+                         "image": image,
                      })
+                     article_count += 1
+         except Exception as e:
+             logger.error(f"Error fetching {feed_url}: {e}")
+     logger.info(f"Total articles fetched: {len(articles)}")
+     return articles
+
+ def categorize_feed(url):
+     """Categorize an RSS feed based on its URL."""
+     if not url or not isinstance(url, str):
+         logger.warning(f"Invalid URL provided for categorization: {url}")
+         return "Uncategorized"
+
+     url = url.lower().strip()
+
+     logger.debug(f"Categorizing URL: {url}")
+
+     if any(keyword in url for keyword in ["nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org", "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org", "lancet.com"]):
+         return "Academic Papers"
+     elif any(keyword in url for keyword in ["reuters.com/business", "bloomberg.com", "ft.com", "marketwatch.com", "cnbc.com", "foxbusiness.com", "wsj.com", "bworldonline.com", "economist.com", "forbes.com"]):
+         return "Business"
+     elif any(keyword in url for keyword in ["investing.com", "cnbc.com/market", "marketwatch.com/market", "fool.co.uk", "zacks.com", "seekingalpha.com", "barrons.com", "yahoofinance.com"]):
+         return "Stocks & Markets"
+     elif any(keyword in url for keyword in ["whitehouse.gov", "state.gov", "commerce.gov", "transportation.gov", "ed.gov", "dol.gov", "justice.gov", "federalreserve.gov", "occ.gov", "sec.gov", "bls.gov", "usda.gov", "gao.gov", "cbo.gov", "fema.gov", "defense.gov", "hhs.gov", "energy.gov", "interior.gov"]):
+         return "Federal Government"
+     elif any(keyword in url for keyword in ["weather.gov", "metoffice.gov.uk", "accuweather.com", "weatherunderground.com", "noaa.gov", "wunderground.com", "climate.gov", "ecmwf.int", "bom.gov.au"]):
+         return "Weather"
+     elif any(keyword in url for keyword in ["data.worldbank.org", "imf.org", "un.org", "oecd.org", "statista.com", "kff.org", "who.int", "cdc.gov", "bea.gov", "census.gov", "fdic.gov"]):
+         return "Data & Statistics"
+     elif any(keyword in url for keyword in ["nasa", "spaceweatherlive", "space", "universetoday", "skyandtelescope", "esa"]):
+         return "Space"
+     elif any(keyword in url for keyword in ["sciencedaily", "quantamagazine", "smithsonianmag", "popsci", "discovermagazine", "scientificamerican", "newscientist", "livescience", "atlasobscura"]):
+         return "Science"
+     elif any(keyword in url for keyword in ["wired", "techcrunch", "arstechnica", "gizmodo", "theverge"]):
+         return "Tech"
+     elif any(keyword in url for keyword in ["horoscope", "astrostyle"]):
+         return "Astrology"
+     elif any(keyword in url for keyword in ["cnn_allpolitics", "bbci.co.uk/news/politics", "reuters.com/arc/outboundfeeds/newsletter-politics", "politico.com/rss/politics", "thehill"]):
+         return "Politics"
+     elif any(keyword in url for keyword in ["weather", "swpc.noaa.gov", "foxweather"]):
+         return "Earth Weather"
+     elif "vogue" in url:
+         return "Lifestyle"
+     elif any(keyword in url for keyword in ["phys.org", "aps.org", "physicsworld"]):
+         return "Physics"
+     else:
+         logger.warning(f"No matching category found for URL: {url}")
+         return "Uncategorized"
+
+ def process_and_store_articles(articles):
+     db_path = get_daily_db_dir()
+     vector_db = Chroma(
+         persist_directory=db_path,
+         embedding_function=get_embedding_model(),
+         collection_name=COLLECTION_NAME
+     )
+
      try:
+         existing_ids = set(vector_db.get(include=[])["ids"])
+     except Exception:
+         existing_ids = set()

+     docs_to_add = []
+     ids_to_add = []
+
+     for article in articles:
+         try:
+             title = clean_text(article["title"])
+             link = clean_text(article["link"])
+             description = clean_text(article["description"])
+             published = article["published"]
              description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+
+             doc_id = f"{title}|{link}|{published}|{description_hash}"
+
+             if doc_id in existing_ids:
+                 logger.debug(f"Skipping duplicate in DB {db_path}: {doc_id}")
                  continue

+             metadata = {
+                 "title": article["title"],
+                 "link": article["link"],
+                 "original_description": article["description"],
+                 "published": article["published"],
+                 "category": article["category"],
+                 "image": article["image"],
+             }
+             doc = Document(page_content=description, metadata=metadata)
+             docs_to_add.append(doc)
+             ids_to_add.append(doc_id)
+             existing_ids.add(doc_id)
+         except Exception as e:
+             logger.error(f"Error processing article {article.get('title', 'N/A')}: {e}")
+
+     if docs_to_add:
+         try:
+             vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
+             vector_db.persist()
+             logger.info(f"Added {len(docs_to_add)} new articles to DB {db_path}. Total in DB: {vector_db._collection.count()}")
+         except Exception as e:
+             logger.error(f"Error storing articles in {db_path}: {e}")

+ def download_from_hf_hub():
+     try:
+         hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
+         logger.info(f"Downloading all DBs from {REPO_ID}...")
+         snapshot_download(
+             repo_id=REPO_ID,
+             repo_type="dataset",
+             local_dir=".",
+             local_dir_use_symlinks=False,
+             allow_patterns="chroma_db_*/**",
+             token=HF_API_TOKEN
+         )
+         logger.info("Finished downloading DBs.")
      except Exception as e:
+         logger.error(f"Error downloading from Hugging Face Hub: {e}")

+ def upload_to_hf_hub():
+     db_path = get_daily_db_dir()
+     if os.path.exists(db_path):
+         try:
+             logger.info(f"Uploading updated Chroma DB '{db_path}' to {REPO_ID}...")
+             hf_api.upload_folder(
+                 folder_path=db_path,
+                 path_in_repo=db_path,
+                 repo_id=REPO_ID,
+                 repo_type="dataset",
+                 token=HF_API_TOKEN
+             )
+             logger.info(f"Database folder '{db_path}' uploaded to: {REPO_ID}")
+         except Exception as e:
+             logger.error(f"Error uploading to Hugging Face Hub: {e}")

  if __name__ == "__main__":
+     download_from_hf_hub()
+     articles = fetch_rss_feeds()
+     process_and_store_articles(articles)
+     upload_to_hf_hub()
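
For reference, a minimal sketch of how a caller (for example, the Flask routes that previously lived in app.py) could read back the per-day database that process_and_store_articles() writes, reusing get_daily_db_dir(), get_embedding_model(), and COLLECTION_NAME from this file. It assumes this revision of app.py is importable as `app` and that the installed langchain release exposes Chroma.similarity_search; the helper name search_todays_articles is illustrative and not part of the commit.

# Sketch only, not part of the commit: query today's Chroma DB built by
# process_and_store_articles(). Assumes this revision of app.py is importable
# as `app`; note that importing it runs login(token=HF_API_TOKEN) at import time.
from langchain.vectorstores import Chroma

from app import COLLECTION_NAME, get_daily_db_dir, get_embedding_model

def search_todays_articles(query, k=5):
    """Return up to k stored articles whose cleaned descriptions are closest to `query`."""
    db = Chroma(
        persist_directory=get_daily_db_dir(),      # e.g. "chroma_db_YYYY-MM-DD"
        embedding_function=get_embedding_model(),  # same MiniLM model used at ingest time
        collection_name=COLLECTION_NAME,
    )
    return db.similarity_search(query, k=k)

if __name__ == "__main__":
    for doc in search_todays_articles("space weather"):
        meta = doc.metadata
        print(meta.get("published"), meta.get("title"), meta.get("link"))

Because the DB directory is dated, articles ingested on earlier days live in other chroma_db_* folders, which is why the removed Flask code aggregated results across every directory matching glob("chroma_db*").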