broadfield-dev committed
Commit 4624af3 · verified · 1 Parent(s): 3aa40bc

Update app.py

Files changed (1):
  1. app.py  +357 -213

app.py CHANGED
@@ -1,31 +1,25 @@
  import os
- import feedparser
- from langchain.vectorstores import Chroma
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.docstore.document import Document
  import logging
- from huggingface_hub import HfApi, login, snapshot_download
- import shutil
- import rss_feeds
- from datetime import datetime, date
- import dateutil.parser
  import hashlib
- import re

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Constants
- MAX_ARTICLES_PER_FEED = 10
- RSS_FEEDS = rss_feeds.RSS_FEEDS
- COLLECTION_NAME = "news_articles"
- HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
- REPO_ID = "broadfield-dev/news-rag-db"
-
- # Initialize Hugging Face API
- login(token=HF_API_TOKEN)
- hf_api = HfApi()

  def get_embedding_model():
      """Returns a singleton instance of the embedding model to avoid reloading."""
@@ -33,213 +27,363 @@ def get_embedding_model():
      get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
      return get_embedding_model.model

- def get_daily_db_dir():
-     """Returns the path for today's Chroma DB."""
-     return f"chroma_db_{date.today().isoformat()}"

- def clean_text(text):
-     """Clean text by removing HTML tags and extra whitespace."""
-     if not text or not isinstance(text, str):
-         return ""
-     text = re.sub(r'<.*?>', '', text)
-     text = ' '.join(text.split())
-     return text.strip().lower()
-
- def fetch_rss_feeds():
-     articles = []
-     seen_keys = set()
-     for feed_url in RSS_FEEDS:
          try:
-             logger.info(f"Fetching {feed_url}")
-             feed = feedparser.parse(feed_url)
-             if feed.bozo:
-                 logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                  continue
-             article_count = 0
-             for entry in feed.entries:
-                 if article_count >= MAX_ARTICLES_PER_FEED:
-                     break
-                 title = entry.get("title", "No Title")
-                 link = entry.get("link", "")
-                 description = entry.get("summary", entry.get("description", ""))
-
-                 title = clean_text(title)
-                 link = clean_text(link)
-                 description = clean_text(description)
-
-                 published = "Unknown Date"
-                 for date_field in ["published", "updated", "created", "pubDate"]:
-                     if date_field in entry:
-                         try:
-                             parsed_date = dateutil.parser.parse(entry[date_field])
-                             published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
-                             break
-                         except (ValueError, TypeError) as e:
-                             logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
-                             continue
-
-                 description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-                 key = f"{title}|{link}|{published}|{description_hash}"
-                 if key not in seen_keys:
-                     seen_keys.add(key)
-                     image = "svg"
-                     for img_source in [
-                         lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
-                         lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
-                         lambda e: clean_text(e.get("enclosure", {}).get("url")) if e.get("enclosure") else "",
-                         lambda e: clean_text(next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), "")),
-                     ]:
-                         try:
-                             img = img_source(entry)
-                             if img and img.strip():
-                                 image = img
-                                 break
-                         except (IndexError, AttributeError, TypeError):
-                             continue
-
-                     articles.append({
-                         "title": title,
-                         "link": link,
-                         "description": description,
-                         "published": published,
-                         "category": categorize_feed(feed_url),
-                         "image": image,
-                     })
-                     article_count += 1
          except Exception as e:
-             logger.error(f"Error fetching {feed_url}: {e}")
-     logger.info(f"Total articles fetched: {len(articles)}")
-     return articles
-
- def categorize_feed(url):
-     """Categorize an RSS feed based on its URL."""
-     if not url or not isinstance(url, str):
-         logger.warning(f"Invalid URL provided for categorization: {url}")
-         return "Uncategorized"
-
-     url = url.lower().strip()
-
-     logger.debug(f"Categorizing URL: {url}")
-
-     if any(keyword in url for keyword in ["nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org", "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org", "lancet.com"]):
-         return "Academic Papers"
-     elif any(keyword in url for keyword in ["reuters.com/business", "bloomberg.com", "ft.com", "marketwatch.com", "cnbc.com", "foxbusiness.com", "wsj.com", "bworldonline.com", "economist.com", "forbes.com"]):
-         return "Business"
-     elif any(keyword in url for keyword in ["investing.com", "cnbc.com/market", "marketwatch.com/market", "fool.co.uk", "zacks.com", "seekingalpha.com", "barrons.com", "yahoofinance.com"]):
-         return "Stocks & Markets"
-     elif any(keyword in url for keyword in ["whitehouse.gov", "state.gov", "commerce.gov", "transportation.gov", "ed.gov", "dol.gov", "justice.gov", "federalreserve.gov", "occ.gov", "sec.gov", "bls.gov", "usda.gov", "gao.gov", "cbo.gov", "fema.gov", "defense.gov", "hhs.gov", "energy.gov", "interior.gov"]):
-         return "Federal Government"
-     elif any(keyword in url for keyword in ["weather.gov", "metoffice.gov.uk", "accuweather.com", "weatherunderground.com", "noaa.gov", "wunderground.com", "climate.gov", "ecmwf.int", "bom.gov.au"]):
-         return "Weather"
-     elif any(keyword in url for keyword in ["data.worldbank.org", "imf.org", "un.org", "oecd.org", "statista.com", "kff.org", "who.int", "cdc.gov", "bea.gov", "census.gov", "fdic.gov"]):
-         return "Data & Statistics"
-     elif any(keyword in url for keyword in ["nasa", "spaceweatherlive", "space", "universetoday", "skyandtelescope", "esa"]):
-         return "Space"
-     elif any(keyword in url for keyword in ["sciencedaily", "quantamagazine", "smithsonianmag", "popsci", "discovermagazine", "scientificamerican", "newscientist", "livescience", "atlasobscura"]):
-         return "Science"
-     elif any(keyword in url for keyword in ["wired", "techcrunch", "arstechnica", "gizmodo", "theverge"]):
-         return "Tech"
-     elif any(keyword in url for keyword in ["horoscope", "astrostyle"]):
-         return "Astrology"
-     elif any(keyword in url for keyword in ["cnn_allpolitics", "bbci.co.uk/news/politics", "reuters.com/arc/outboundfeeds/newsletter-politics", "politico.com/rss/politics", "thehill"]):
-         return "Politics"
-     elif any(keyword in url for keyword in ["weather", "swpc.noaa.gov", "foxweather"]):
-         return "Earth Weather"
-     elif "vogue" in url:
-         return "Lifestyle"
-     elif any(keyword in url for keyword in ["phys.org", "aps.org", "physicsworld"]):
-         return "Physics"
-     else:
-         logger.warning(f"No matching category found for URL: {url}")
-         return "Uncategorized"
-
- def process_and_store_articles(articles):
-     db_path = get_daily_db_dir()
-     vector_db = Chroma(
-         persist_directory=db_path,
-         embedding_function=get_embedding_model(),
-         collection_name=COLLECTION_NAME
-     )

      try:
-         existing_ids = set(vector_db.get(include=[])["ids"])
-     except Exception:
-         existing_ids = set()

-     docs_to_add = []
-     ids_to_add = []
-
-     for article in articles:
-         try:
-             title = clean_text(article["title"])
-             link = clean_text(article["link"])
-             description = clean_text(article["description"])
-             published = article["published"]
              description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()

-             doc_id = f"{title}|{link}|{published}|{description_hash}"
-
-             if doc_id in existing_ids:
-                 logger.debug(f"Skipping duplicate in DB {db_path}: {doc_id}")
                  continue

-             metadata = {
-                 "title": article["title"],
-                 "link": article["link"],
-                 "original_description": article["description"],
-                 "published": article["published"],
-                 "category": article["category"],
-                 "image": article["image"],
-             }
-             doc = Document(page_content=description, metadata=metadata)
-             docs_to_add.append(doc)
-             ids_to_add.append(doc_id)
-             existing_ids.add(doc_id)
-         except Exception as e:
-             logger.error(f"Error processing article {article.get('title', 'N/A')}: {e}")
-
-     if docs_to_add:
-         try:
-             vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
-             vector_db.persist()
-             logger.info(f"Added {len(docs_to_add)} new articles to DB {db_path}. Total in DB: {vector_db._collection.count()}")
-         except Exception as e:
-             logger.error(f"Error storing articles in {db_path}: {e}")

- def download_from_hf_hub():
      try:
-         hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-         logger.info(f"Downloading all DBs from {REPO_ID}...")
-         snapshot_download(
-             repo_id=REPO_ID,
-             repo_type="dataset",
-             local_dir=".",
-             local_dir_use_symlinks=False,
-             allow_patterns="chroma_db_*/**",
-             token=HF_API_TOKEN
-         )
-         logger.info("Finished downloading DBs.")
      except Exception as e:
-         logger.error(f"Error downloading from Hugging Face Hub: {e}")

- def upload_to_hf_hub():
-     db_path = get_daily_db_dir()
-     if os.path.exists(db_path):
-         try:
-             logger.info(f"Uploading updated Chroma DB '{db_path}' to {REPO_ID}...")
-             hf_api.upload_folder(
-                 folder_path=db_path,
-                 path_in_repo=db_path,
-                 repo_id=REPO_ID,
-                 repo_type="dataset",
-                 token=HF_API_TOKEN
-             )
-             logger.info(f"Database folder '{db_path}' uploaded to: {REPO_ID}")
-         except Exception as e:
-             logger.error(f"Error uploading to Hugging Face Hub: {e}")

  if __name__ == "__main__":
-     download_from_hf_hub()
-     articles = fetch_rss_feeds()
-     process_and_store_articles(articles)
-     upload_to_hf_hub()
 
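Everything removed above (feed fetching, categorization, the daily Chroma store, and the Hub sync) is imported by the new app.py below from a separate rss_processor module, so the batch helpers presumably move there in this commit; that module is not part of this diff. A minimal sketch of the batch flow they implemented, assuming the function names stay the same after the move:

# Hypothetical standalone run of the pipeline after the refactor, assuming the
# removed helpers keep their names inside rss_processor.py (not shown here).
from rss_processor import (
    download_from_hf_hub,        # pull existing chroma_db_* folders from the HF dataset repo
    fetch_rss_feeds,             # parse the configured feeds into cleaned article dicts
    process_and_store_articles,  # embed and upsert the articles into today's Chroma DB
    upload_to_hf_hub,            # push the updated DB folder back to the Hub
)

if __name__ == "__main__":
    download_from_hf_hub()
    articles = fetch_rss_feeds()
    process_and_store_articles(articles)
    upload_to_hf_hub()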
  import os
+ import threading
+ from flask import Flask, render_template, request, jsonify
+ from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text
  import logging
+ import time
+ from datetime import datetime
  import hashlib
+ import glob
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import HuggingFaceEmbeddings
+
+ app = Flask(__name__)

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Global flag to track background loading
+ loading_complete = True  # Start as True to allow initial rendering
+ last_update_time = time.time()
+ last_data_hash = None  # Track the hash of the last data to detect changes

  def get_embedding_model():
      """Returns a singleton instance of the embedding model to avoid reloading."""
      get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
      return get_embedding_model.model
 
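get_embedding_model() caches the HuggingFaceEmbeddings instance on the function object; the guard that makes it a singleton sits on a line the diff does not display. For reference, the usual shape of the pattern (the hasattr check is an assumption about the hidden line, not something shown in this diff):

from langchain.embeddings import HuggingFaceEmbeddings

def get_embedding_model():
    """Returns a singleton instance of the embedding model to avoid reloading."""
    if not hasattr(get_embedding_model, "model"):  # assumed guard on the hidden line
        get_embedding_model.model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    return get_embedding_model.model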
+ def load_feeds_in_background():
+     global loading_complete, last_update_time
+     try:
+         logger.info("Starting background RSS feed fetch")
+         articles = fetch_rss_feeds()
+         logger.info(f"Fetched {len(articles)} articles")
+         process_and_store_articles(articles)
+         last_update_time = time.time()
+         logger.info("Background feed processing complete")
+         upload_to_hf_hub()
+     except Exception as e:
+         logger.error(f"Error in background feed loading: {e}")
+     finally:
+         loading_complete = True

+ def get_all_docs_from_dbs():
+     """Aggregate documents and metadata from all Chroma DB folders."""
+     all_docs = {'documents': [], 'metadatas': []}
+     seen_ids = set()
+     embedding_function = get_embedding_model()
+
+     for db_path in glob.glob("chroma_db_*"):
+         if not os.path.isdir(db_path):
+             continue
          try:
+             temp_vector_db = Chroma(
+                 persist_directory=db_path,
+                 embedding_function=embedding_function,
+                 collection_name="news_articles"
+             )
+             # Skip empty databases
+             if temp_vector_db._collection.count() == 0:
                  continue
+
+             db_data = temp_vector_db.get(include=['documents', 'metadatas'])
+             if db_data.get('documents') and db_data.get('metadatas'):
+                 for doc, meta in zip(db_data['documents'], db_data['metadatas']):
+                     # Use a more robust unique identifier
+                     doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
+                     if doc_id not in seen_ids:
+                         seen_ids.add(doc_id)
+                         all_docs['documents'].append(doc)
+                         all_docs['metadatas'].append(meta)
          except Exception as e:
+             logger.error(f"Error loading DB {db_path}: {e}")

+     return all_docs
+
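get_all_docs_from_dbs() walks every chroma_db_* folder on disk and merges their contents into one documents/metadatas dict, deduplicating on title|link|published. A small usage sketch (the category counting is only an illustration, not part of the app):

# Hypothetical caller: summarize how many stored articles each category has.
from collections import Counter

from app import get_all_docs_from_dbs  # assuming this file is importable as app.py

all_docs = get_all_docs_from_dbs()
counts = Counter(meta.get("category", "Uncategorized") for meta in all_docs["metadatas"])
for category, n in counts.most_common():
    print(f"{category}: {n} articles")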
+ def compute_data_hash(categorized_articles):
+     """Compute a hash of the current articles to detect changes."""
+     if not categorized_articles:
+         return ""
+     # Create a sorted string representation of the articles for consistent hashing
+     data_str = ""
+     for cat, articles in sorted(categorized_articles.items()):
+         for article in sorted(articles, key=lambda x: x["published"]):
+             data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
+     return hashlib.sha256(data_str.encode('utf-8')).hexdigest()
+
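compute_data_hash() turns the rendered category/article structure into a stable SHA-256 fingerprint, which /get_updates later compares against last_data_hash to decide whether the frontend needs a refresh. A quick illustration with made-up articles (hypothetical data, same keys the function reads):

from app import compute_data_hash  # assuming this file is importable as app.py

sample = {
    "Tech": [
        {"title": "example article", "link": "https://example.com/a", "published": "2025-01-01T00:00:00"},
    ],
}
before = compute_data_hash(sample)

# Adding (or removing) an article changes the fingerprint...
sample["Tech"].append(
    {"title": "another article", "link": "https://example.com/b", "published": "2025-01-02T00:00:00"}
)
after = compute_data_hash(sample)
assert before != after

# ...while hashing identical content again is stable.
assert after == compute_data_hash(sample)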
+ @app.route('/')
+ def index():
+     global loading_complete, last_update_time, last_data_hash
+
+     # Check if any DB exists; if not, download from Hugging Face
+     if not glob.glob("chroma_db_*"):
+         logger.info("No Chroma DBs found, downloading from Hugging Face Hub...")
+         download_from_hf_hub()
+
+     # Start background RSS feed update
+     loading_complete = False
+     threading.Thread(target=load_feeds_in_background, daemon=True).start()
+
+     # Load existing data immediately
      try:
+         all_docs = get_all_docs_from_dbs()
+         total_docs = len(all_docs['documents'])
+         logger.info(f"Total articles across all DBs at startup: {total_docs}")
+         if not all_docs.get('metadatas'):
+             logger.info("No articles in any DB yet")
+             return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+
+         # Process and categorize articles with deduplication
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta:
+                 continue
+             title = meta.get("title", "No Title")
+             link = meta.get("link", "")
+             description = meta.get("original_description", "No Description")
+             published = meta.get("published", "Unknown Date").strip()
+
+             title = clean_text(title)
+             link = clean_text(link)
+             description = clean_text(description)

              description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+             key = f"{title}|{link}|{published}|{description_hash}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     published = "1970-01-01T00:00:00"
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": description,
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             if cat not in categorized_articles:
+                 categorized_articles[cat] = []
+             categorized_articles[cat].append(article)
+
+         categorized_articles = dict(sorted(categorized_articles.items(), key=lambda x: x[0].lower()))
+
+         for cat in categorized_articles:
+             categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
+             if len(categorized_articles[cat]) >= 2:
+                 logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
+
+         # Compute initial data hash
+         last_data_hash = compute_data_hash(categorized_articles)
+
+         logger.info(f"Displaying articles at startup: {sum(len(articles) for articles in categorized_articles.values())} total")
+         return render_template("index.html",
+                                categorized_articles=categorized_articles,
+                                has_articles=True,
+                                loading=True)
+     except Exception as e:
+         logger.error(f"Error retrieving articles at startup: {e}")
+         return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+
+ @app.route('/search', methods=['POST'])
+ def search():
+     query = request.form.get('search')
+     if not query:
+         logger.info("Empty search query received")
+         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+     try:
+         logger.info(f"Performing semantic search for: '{query}'")
+
+         embedding_function = get_embedding_model()
+         enriched_articles = []
+         seen_keys = set()
+         db_paths = glob.glob("chroma_db_*")
+
+         if not db_paths:
+             logger.warning("No Chroma DBs found for search.")
+             return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+         all_search_results = []
+         for db_path in db_paths:
+             if not os.path.isdir(db_path): continue
+             try:
+                 vector_db = Chroma(
+                     persist_directory=db_path,
+                     embedding_function=embedding_function,
+                     collection_name="news_articles"
+                 )
+                 if vector_db._collection.count() > 0:
+                     results = vector_db.similarity_search_with_relevance_scores(query, k=20)
+                     all_search_results.extend(results)
+             except Exception as e:
+                 logger.error(f"Error searching in DB {db_path}: {e}")
+
+         # Sort all results by relevance score (higher is better)
+         all_search_results.sort(key=lambda x: x[1], reverse=True)
+
+         # Process and deduplicate top results
+         for doc, score in all_search_results:
+             meta = doc.metadata
+             title = clean_text(meta.get("title", "No Title"))
+             link = clean_text(meta.get("link", ""))
+             description = clean_text(meta.get("original_description", "No Description"))
+             published = meta.get("published", "Unknown Date").strip()
+
+             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+             key = f"{title}|{link}|{published}|{description_hash}"

+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 enriched_articles.append({
+                     "title": meta.get("title", "No Title"),
+                     "link": meta.get("link", ""),
+                     "description": meta.get("original_description", "No Description"),
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         logger.info(f"Found {len(enriched_articles)} unique articles from semantic search.")
+         if not enriched_articles:
+             return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+         # Categorize the articles
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             categorized_articles.setdefault(cat, []).append(article)
+
+         return jsonify({
+             "categorized_articles": categorized_articles,
+             "has_articles": bool(enriched_articles),
+             "loading": False
+         })
+     except Exception as e:
+         logger.error(f"Semantic search error: {e}", exc_info=True)
+         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False}), 500
+
+
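/search reads a form field named search, queries every daily Chroma DB, and returns JSON keyed by category. A minimal client sketch, assuming the app is running locally on the port configured at the bottom of this file and that the requests package is available:

import requests  # third-party HTTP client, assumed installed on the client side

resp = requests.post("http://localhost:7860/search", data={"search": "space weather"})
resp.raise_for_status()
payload = resp.json()
for category, articles in payload["categorized_articles"].items():
    print(category, "->", len(articles), "hits")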
+ @app.route('/check_loading')
+ def check_loading():
+     global loading_complete, last_update_time
+     if loading_complete:
+         return jsonify({"status": "complete", "last_update": last_update_time})
+     return jsonify({"status": "loading"}), 202
+
+ @app.route('/get_updates')
+ def get_updates():
+     global last_update_time, last_data_hash
+     try:
+         all_docs = get_all_docs_from_dbs()
+         if not all_docs.get('metadatas'):
+             return jsonify({"articles": [], "last_update": last_update_time, "has_updates": False})
+
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta:
                  continue
+             title = meta.get("title", "No Title")
+             link = meta.get("link", "")
+             description = meta.get("original_description", "No Description")
+             published = meta.get("published", "Unknown Date").strip()

+             title = clean_text(title)
+             link = clean_text(link)
+             description = clean_text(description)
+
+             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+             key = f"{title}|{link}|{published}|{description_hash}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     published = "1970-01-01T00:00:00"
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": description,
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             if cat not in categorized_articles:
+                 categorized_articles[cat] = []
+             key = f"{article['title']}|{article['link']}|{article['published']}"
+             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
+                 categorized_articles[cat].append(article)
+
+         for cat in categorized_articles:
+             unique_articles = []
+             seen_cat_keys = set()
+             for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
+                 key = f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"
+                 if key not in seen_cat_keys:
+                     seen_cat_keys.add(key)
+                     unique_articles.append(article)
+             categorized_articles[cat] = unique_articles[:10]

+         # Compute hash of new data
+         current_data_hash = compute_data_hash(categorized_articles)
+
+         # Compare with last data hash to determine if there are updates
+         has_updates = last_data_hash != current_data_hash
+         if has_updates:
+             logger.info("New RSS data detected, sending updates to frontend")
+             last_data_hash = current_data_hash
+             return jsonify({
+                 "articles": categorized_articles,
+                 "last_update": last_update_time,
+                 "has_updates": True
+             })
+         else:
+             logger.info("No new RSS data, skipping update")
+             return jsonify({
+                 "articles": {},
+                 "last_update": last_update_time,
+                 "has_updates": False
+             })
+     except Exception as e:
+         logger.error(f"Error fetching updates: {e}")
+         return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False}), 500
+
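/check_loading and /get_updates are meant to be polled while the background refresh runs: the first reports completion, the second only ships articles when the data hash has changed. A hedged polling sketch against a locally running instance (host, port, and interval are assumptions):

import time
import requests  # assumed available in the polling client

BASE = "http://localhost:7860"

while True:
    updates = requests.get(f"{BASE}/get_updates").json()
    if updates.get("has_updates"):
        # "articles" is a dict of category -> list of article dicts
        print("new data at", updates["last_update"], "categories:", list(updates["articles"]))

    status = requests.get(f"{BASE}/check_loading").json()
    if status.get("status") == "complete":
        break
    time.sleep(5)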
+ @app.route('/get_all_articles/<category>')
+ def get_all_articles(category):
      try:
+         all_docs = get_all_docs_from_dbs()
+         if not all_docs.get('metadatas'):
+             return jsonify({"articles": [], "category": category})
+
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta or meta.get("category") != category:
+                 continue
+             title = meta.get("title", "No Title")
+             link = meta.get("link", "")
+             description = meta.get("original_description", "No Description")
+             published = meta.get("published", "Unknown Date").strip()
+
+             title = clean_text(title)
+             link = clean_text(link)
+             description = clean_text(description)
+
+             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+             key = f"{title}|{link}|{published}|{description_hash}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     published = "1970-01-01T00:00:00"
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": description,
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+         return jsonify({"articles": enriched_articles, "category": category})
      except Exception as e:
+         logger.error(f"Error fetching all articles for category {category}: {e}")
+         return jsonify({"articles": [], "category": category}), 500
 
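/get_all_articles/<category> returns every stored article for a single category, sorted newest first. A quick client sketch ("Tech" is only an example category value, not guaranteed to exist in the store):

import requests  # assumed available in the client environment

resp = requests.get("http://localhost:7860/get_all_articles/Tech")
for article in resp.json()["articles"]:
    print(article["published"], article["title"], article["link"])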
+ @app.route('/card')
+ def card_load():
+     return render_template("card.html")

  if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860)