broadfield-dev committed
Commit 5b77884 · verified · 1 Parent(s): 1252efa

Update app.py

Files changed (1)
  1. app.py +132 -98
app.py CHANGED
@@ -1,33 +1,29 @@
 import os
 import threading
-from flask import Flask, render_template, request, jsonify
-from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
+import hashlib
 import logging
 import time
 from datetime import datetime
+from flask import Flask, render_template, request, jsonify
+from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, LOCAL_DB_DIR
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 
-# --- Basic Flask App Setup ---
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# --- Global State Management ---
 loading_complete = True
-last_update_time = None
+last_update_time = time.time()
+last_data_hash = None
 
-# --- Embedding and Vector DB Management ---
 def get_embedding_model():
-    """Initializes and returns a singleton HuggingFace embedding model."""
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
 
 def get_vector_db():
-    """Initializes and returns a singleton Chroma DB client."""
     if not os.path.exists(LOCAL_DB_DIR):
-        logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
         return None
     try:
         if not hasattr(get_vector_db, "db_instance"):
@@ -40,67 +36,58 @@ def get_vector_db():
     except Exception as e:
         logger.error(f"Failed to load vector DB: {e}")
         if hasattr(get_vector_db, "db_instance"):
-            delattr(get_vector_db, "db_instance")
+            delattr(get_vector_db, "db_instance")
         return None
 
-# --- Background Processing ---
 def load_feeds_in_background():
-    """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
     global loading_complete, last_update_time
     if not loading_complete:
-        logger.info("An update is already in progress. Skipping.")
         return
-
     loading_complete = False
     try:
-        logger.info("Starting background RSS feed fetch and processing...")
+        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
-        logger.info(f"Fetched {len(articles)} articles from RSS feeds.")
-        if articles:
-            process_and_store_articles(articles)
-            upload_to_hf_hub()
-            last_update_time = datetime.now().isoformat()
-            logger.info("Background feed processing complete.")
+        logger.info(f"Fetched {len(articles)} articles")
+        process_and_store_articles(articles)
+        last_update_time = time.time()
+        logger.info("Background feed processing complete")
+        upload_to_hf_hub()
     except Exception as e:
         logger.error(f"Error in background feed loading: {e}")
     finally:
         loading_complete = True
 
-# --- Data Transformation Helper (Used by both SSR and API) ---
-def format_articles_from_db(docs):
-    """
-    Takes ChromaDB documents and formats them into a standardized list of article dictionaries.
-    """
+def get_all_docs_from_db():
+    vector_db = get_vector_db()
+    if not vector_db or vector_db._collection.count() == 0:
+        return {'documents': [], 'metadatas': []}
+    return vector_db.get(include=['documents', 'metadatas'])
+
+def format_articles_from_db_results(docs):
     enriched_articles = []
     seen_keys = set()
-
+
     items = []
-    # Handle .get() results (dict of lists)
     if isinstance(docs, dict) and 'metadatas' in docs:
         items = zip(docs.get('documents', []), docs.get('metadatas', []))
-    # Handle similarity_search results (list of (Document, score) tuples)
     elif isinstance(docs, list):
         items = [(doc.page_content, doc.metadata) for doc, score in docs]
 
     for doc_content, meta in items:
         if not meta: continue
-
         title = meta.get("title", "No Title")
         link = meta.get("link", "")
-        key = f"{title}|{link}"
+        published = meta.get("published", "Unknown Date").strip()
+        key = f"{title}|{link}|{published}"
 
         if key not in seen_keys:
             seen_keys.add(key)
-
-            published_str = meta.get("published", "").strip()
             try:
-                # The format from your original `process_and_store_articles`
-                published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
+                published_iso = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat()
             except (ValueError, TypeError):
-                published_iso = datetime.utcnow().isoformat()
+                published_iso = "1970-01-01T00:00:00"
 
             enriched_articles.append({
-                "id": meta.get("id", link),
                 "title": title,
                 "link": link,
                 "description": meta.get("original_description", "No Description"),
@@ -108,41 +95,34 @@ def format_articles_from_db(docs):
                 "published": published_iso,
                 "image": meta.get("image", "svg"),
             })
-
+
     enriched_articles.sort(key=lambda x: x["published"], reverse=True)
     return enriched_articles
 
-# --------------------------------------------------------------------------------
-# --- Web Application Route (Server-Side Rendered) ---
-# --------------------------------------------------------------------------------
+def compute_data_hash(categorized_articles):
+    if not categorized_articles: return ""
+    data_str = ""
+    for cat, articles in sorted(categorized_articles.items()):
+        for article in sorted(articles, key=lambda x: x["published"]):
+            data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
+    return hashlib.sha256(data_str.encode('utf-8')).hexdigest()
+
 @app.route('/')
 def index():
-    """
-    Renders the main web page by fetching, processing, and passing data
-    to the template on the server side. This preserves the original functionality.
-    """
-    # Perform startup checks
+    global loading_complete, last_update_time, last_data_hash
+
     if not os.path.exists(LOCAL_DB_DIR):
         logger.info(f"No Chroma DB found at '{LOCAL_DB_DIR}', downloading from Hugging Face Hub...")
         download_from_hf_hub()
 
-    # Trigger background update
     threading.Thread(target=load_feeds_in_background, daemon=True).start()
 
     try:
-        # Fetch all data from the DB for rendering
-        vector_db = get_vector_db()
-        if not vector_db:
-            raise ConnectionError("Database could not be loaded.")
-
-        all_docs = vector_db.get(include=['documents', 'metadatas'])
-
-        if not all_docs or not all_docs['metadatas']:
-            logger.info("No articles in the DB yet for initial render.")
+        all_docs = get_all_docs_from_db()
+        if not all_docs['metadatas']:
             return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
-        # Process and categorize articles for the template
-        enriched_articles = format_articles_from_db(all_docs)
+        enriched_articles = format_articles_from_db_results(all_docs)
 
         categorized_articles = {}
         for article in enriched_articles:
@@ -150,60 +130,94 @@ def index():
             categorized_articles.setdefault(cat, []).append(article)
 
         categorized_articles = dict(sorted(categorized_articles.items()))
-
-        # Limit to 10 articles per category for the main page view
         for cat in categorized_articles:
             categorized_articles[cat] = categorized_articles[cat][:10]
+
+        last_data_hash = compute_data_hash(categorized_articles)
 
-        return render_template(
-            "index.html",
-            categorized_articles=categorized_articles,
-            has_articles=True,
-            # The original code didn't pass loading, but it's good practice
-            loading=not loading_complete
-        )
+        return render_template("index.html", categorized_articles=categorized_articles, has_articles=True, loading=not loading_complete)
     except Exception as e:
-        logger.error(f"Error rendering index page: {e}", exc_info=True)
-        # Fallback render in case of error
-        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True, error="Could not load articles.")
-
+        logger.error(f"Error retrieving articles at startup: {e}", exc_info=True)
+        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
-# Your original search route, which was also server-side
-# We can keep it or decide to use the API for search on the frontend
 @app.route('/search', methods=['POST'])
 def search():
-    # This route returns a JSON payload to be handled by JavaScript.
-    # It functions like an API endpoint and is a good example of a hybrid approach.
     query = request.form.get('search')
     if not query:
         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
 
     vector_db = get_vector_db()
     if not vector_db:
-        return jsonify({"error": "Database not available"}), 503
-
-    results = vector_db.similarity_search_with_relevance_scores(query, k=50)
-    enriched_articles = format_articles_from_db(results)
+        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+    try:
+        results = vector_db.similarity_search_with_relevance_scores(query, k=50)
+        enriched_articles = format_articles_from_db_results(results)
 
-    categorized_articles = {}
-    for article in enriched_articles:
-        cat = article["category"]
-        categorized_articles.setdefault(cat, []).append(article)
+        categorized_articles = {}
+        for article in enriched_articles:
+            cat = article["category"]
+            categorized_articles.setdefault(cat, []).append(article)
 
-    return jsonify({
-        "categorized_articles": categorized_articles,
-        "has_articles": bool(enriched_articles),
-        "loading": False
-    })
+        return jsonify({
+            "categorized_articles": categorized_articles,
+            "has_articles": bool(enriched_articles),
+            "loading": False
+        })
+    except Exception as e:
+        logger.error(f"Semantic search error: {e}", exc_info=True)
+        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False}), 500
+
+@app.route('/get_all_articles/<category>')
+def get_all_articles(category):
+    try:
+        all_docs = get_all_docs_from_db()
+        enriched_articles = format_articles_from_db_results(all_docs)
+        category_articles = [article for article in enriched_articles if article["category"] == category]
+        return jsonify({"articles": category_articles, "category": category})
+    except Exception as e:
+        logger.error(f"Error fetching all articles for category {category}: {e}")
+        return jsonify({"articles": [], "category": category}), 500
+
+@app.route('/check_loading')
+def check_loading():
+    return jsonify({"status": "complete" if loading_complete else "loading", "last_update": last_update_time})
+
+@app.route('/get_updates')
+def get_updates():
+    global last_update_time, last_data_hash
+    try:
+        all_docs = get_all_docs_from_db()
+        if not all_docs['metadatas']:
+            return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
+
+        enriched_articles = format_articles_from_db_results(all_docs)
+        categorized_articles = {}
+        for article in enriched_articles:
+            cat = article["category"]
+            categorized_articles.setdefault(cat, []).append(article)
+
+        for cat in categorized_articles:
+            categorized_articles[cat] = categorized_articles[cat][:10]
 
+        current_data_hash = compute_data_hash(categorized_articles)
+        has_updates = last_data_hash != current_data_hash
+
+        if has_updates:
+            last_data_hash = current_data_hash
+            return jsonify({"articles": categorized_articles, "last_update": last_update_time, "has_updates": True})
+        else:
+            return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
+    except Exception as e:
+        logger.error(f"Error fetching updates: {e}")
+        return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False}), 500
 
-# --------------------------------------------------------------------------------
-# --- NEW: Standalone API v1 Endpoints (Return only JSON) ---
-# --------------------------------------------------------------------------------
+@app.route('/card')
+def card_load():
+    return render_template("card.html")
 
 @app.route('/api/v1/search', methods=['GET'])
 def api_search():
-    """API endpoint for semantic search."""
     query = request.args.get('q')
     limit = request.args.get('limit', default=20, type=int)
 
@@ -216,7 +230,7 @@ def api_search():
 
     try:
         results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
-        formatted_articles = format_articles_from_db(results)
+        formatted_articles = format_articles_from_db_results(results)
         return jsonify(formatted_articles)
     except Exception as e:
         logger.error(f"API Search error: {e}", exc_info=True)
@@ -224,7 +238,6 @@ def api_search():
 
 @app.route('/api/v1/articles/category/<string:category_name>', methods=['GET'])
 def api_get_articles_by_category(category_name):
-    """API endpoint to get articles filtered by category with pagination."""
     limit = request.args.get('limit', default=20, type=int)
     offset = request.args.get('offset', default=0, type=int)
 
@@ -234,7 +247,7 @@ def api_get_articles_by_category(category_name):
 
     try:
         results = vector_db.get(where={"category": category_name}, include=['documents', 'metadatas'])
-        formatted_articles = format_articles_from_db(results)
+        formatted_articles = format_articles_from_db_results(results)
         paginated_results = formatted_articles[offset : offset + limit]
 
         return jsonify({
@@ -246,8 +259,29 @@ def api_get_articles_by_category(category_name):
         logger.error(f"API Category fetch error: {e}", exc_info=True)
         return jsonify({"error": "An internal error occurred."}), 500
 
-# Other routes like /card, /get_updates, etc. from your original file would go here.
+@app.route('/api/v1/categories', methods=['GET'])
+def api_get_categories():
+    vector_db = get_vector_db()
+    if not vector_db:
+        return jsonify({"error": "Database not available."}), 503
+
+    try:
+        all_metadata = vector_db.get(include=['metadatas'])['metadatas']
+        if not all_metadata:
+            return jsonify([])
+
+        unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
+        return jsonify(unique_categories)
+    except Exception as e:
+        logger.error(f"API Categories fetch error: {e}", exc_info=True)
+        return jsonify({"error": "An internal error occurred."}), 500
+
+@app.route('/api/v1/status', methods=['GET'])
+def api_get_status():
+    return jsonify({
+        "status": "complete" if loading_complete else "loading",
+        "last_update_time": last_update_time
+    })
 
-# --- Main Application Runner ---
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=False)
+    app.run(host="0.0.0.0", port=7860)
 
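The most consequential addition in this commit is compute_data_hash(), which lets /get_updates return article data only when the rendered set has actually changed. Below is a self-contained sanity check of that idea: the function body is copied verbatim from the diff, while the two sample article dicts are invented for illustration.

import hashlib

def compute_data_hash(categorized_articles):
    if not categorized_articles: return ""
    data_str = ""
    for cat, articles in sorted(categorized_articles.items()):
        for article in sorted(articles, key=lambda x: x["published"]):
            data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
    return hashlib.sha256(data_str.encode('utf-8')).hexdigest()

# Invented sample data: the same article twice, differing only in "published".
a = {"Tech": [{"title": "A", "link": "http://example.com/a", "published": "2024-01-01T00:00:00"}]}
b = {"Tech": [{"title": "A", "link": "http://example.com/a", "published": "2024-01-02T00:00:00"}]}

print(compute_data_hash(a) == compute_data_hash(a))  # True: deterministic for identical data
print(compute_data_hash(a) == compute_data_hash(b))  # False: any field change flips the hash

Because categories are iterated in sorted order, the hash does not depend on dict insertion order.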
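The /check_loading and /get_updates routes pair with that hash to support cheap client-side polling. Here is a minimal polling sketch, assuming the third-party requests library and an instance reachable on the port configured in app.run() (7860); the paths and JSON keys come from the routes in the diff, while the 30-second interval is an arbitrary client-side choice.

import time
import requests  # third-party: pip install requests

BASE = "http://localhost:7860"  # port taken from app.run() in this commit

while True:
    # /check_loading reports whether the background RSS refresh is running.
    status = requests.get(f"{BASE}/check_loading").json()
    print("loader:", status["status"], "last_update:", status["last_update"])

    # /get_updates returns articles only when the server-side data hash changed.
    payload = requests.get(f"{BASE}/get_updates").json()
    if payload["has_updates"]:
        # Articles arrive grouped by category, capped at 10 per category.
        for category, articles in payload["articles"].items():
            print(category, len(articles))

    time.sleep(30)  # arbitrary polling interval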
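The API v1 surface also grows: /api/v1/categories and /api/v1/status are new, alongside the pre-existing search and category routes that now go through format_articles_from_db_results(). A sketch of each call under the same assumptions as above; the query string "climate" and the category name "Technology" are placeholders, not values taken from the repository.

import requests  # third-party: pip install requests

BASE = "http://localhost:7860"

# New in this commit: distinct categories present in the vector DB.
print(requests.get(f"{BASE}/api/v1/categories").json())

# New in this commit: loader status plus last_update_time (a Unix timestamp from time.time()).
print(requests.get(f"{BASE}/api/v1/status").json())

# Pre-existing route; /api/v1/search returns a flat JSON list of articles.
hits = requests.get(f"{BASE}/api/v1/search", params={"q": "climate", "limit": 5}).json()
print(len(hits))

# Pre-existing category route with offset/limit pagination.
page = requests.get(f"{BASE}/api/v1/articles/category/Technology",
                    params={"limit": 10, "offset": 0}).json()
print(page)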