broadfield-dev committed on
Commit
1252efa
·
verified ·
1 Parent(s): 679afad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -97
app.py CHANGED
@@ -4,7 +4,6 @@ from flask import Flask, render_template, request, jsonify
4
  from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
5
  import logging
6
  import time
7
- import json
8
  from datetime import datetime
9
  from langchain.vectorstores import Chroma
10
  from langchain.embeddings import HuggingFaceEmbeddings
@@ -21,7 +20,6 @@ last_update_time = None
21
  # --- Embedding and Vector DB Management ---
22
  def get_embedding_model():
23
  """Initializes and returns a singleton HuggingFace embedding model."""
24
- # Using a simple hasattr check for a singleton pattern
25
  if not hasattr(get_embedding_model, "model"):
26
  get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
27
  return get_embedding_model.model
@@ -32,7 +30,6 @@ def get_vector_db():
32
  logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
33
  return None
34
  try:
35
- # Using a simple hasattr check for a singleton pattern
36
  if not hasattr(get_vector_db, "db_instance"):
37
  get_vector_db.db_instance = Chroma(
38
  persist_directory=LOCAL_DB_DIR,
@@ -42,7 +39,6 @@ def get_vector_db():
42
  return get_vector_db.db_instance
43
  except Exception as e:
44
  logger.error(f"Failed to load vector DB: {e}")
45
- # Invalidate instance on failure
46
  if hasattr(get_vector_db, "db_instance"):
47
  delattr(get_vector_db, "db_instance")
48
  return None
@@ -51,7 +47,6 @@ def get_vector_db():
51
  def load_feeds_in_background():
52
  """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
53
  global loading_complete, last_update_time
54
- # Ensure only one background process runs at a time
55
  if not loading_complete:
56
  logger.info("An update is already in progress. Skipping.")
57
  return
@@ -71,19 +66,19 @@ def load_feeds_in_background():
71
  finally:
72
  loading_complete = True
73
 
74
- # --- Data Transformation Helper ---
75
  def format_articles_from_db(docs):
76
  """
77
- Takes ChromaDB documents (with metadata) and formats them into a standardized list of article dictionaries.
78
- Handles deduplication based on title and link.
79
  """
80
  enriched_articles = []
81
  seen_keys = set()
82
 
83
- # The 'docs' can be a list of (Document, score) tuples or a dict from .get()
84
  items = []
 
85
  if isinstance(docs, dict) and 'metadatas' in docs:
86
- items = zip(docs['documents'], docs['metadatas'])
 
87
  elif isinstance(docs, list):
88
  items = [(doc.page_content, doc.metadata) for doc, score in docs]
89
 
@@ -92,21 +87,20 @@ def format_articles_from_db(docs):
92
 
93
  title = meta.get("title", "No Title")
94
  link = meta.get("link", "")
95
- # Use a composite key to identify unique articles
96
  key = f"{title}|{link}"
97
 
98
  if key not in seen_keys:
99
  seen_keys.add(key)
100
 
101
- # Safely parse the published date
102
  published_str = meta.get("published", "").strip()
103
  try:
 
104
  published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
105
  except (ValueError, TypeError):
106
- published_iso = datetime.utcnow().isoformat() # Default to now if format is wrong
107
 
108
  enriched_articles.append({
109
- "id": meta.get("id", link), # Provide a unique ID
110
  "title": title,
111
  "link": link,
112
  "description": meta.get("original_description", "No Description"),
@@ -115,33 +109,96 @@ def format_articles_from_db(docs):
115
  "image": meta.get("image", "svg"),
116
  })
117
 
118
- # Sort by date descending by default
119
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
120
  return enriched_articles
121
 
122
  # --------------------------------------------------------------------------------
123
- # --- API v1 Endpoints ---
124
  # --------------------------------------------------------------------------------
125
- #
126
- # API Usage Guide:
127
- #
128
- # GET /api/v1/search?q=<query>&limit=<n>
129
- # - Performs semantic search.
130
- # - `q`: The search term (required).
131
- # - `limit`: Max number of results to return (optional, default=20).
132
- #
133
- # GET /api/v1/articles/category/<name>?limit=<n>&offset=<o>
134
- # - Retrieves all articles for a given category.
135
- # - `name`: The category name (e.g., "Technology").
136
- # - `limit`: For pagination (optional, default=20).
137
- # - `offset`: For pagination (optional, default=0).
138
- #
139
- # GET /api/v1/categories
140
- # - Returns a list of all unique article categories.
141
- #
142
- # GET /api/v1/status
143
- # - Checks the status of the background data processing task.
144
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  # --------------------------------------------------------------------------------
146
 
147
  @app.route('/api/v1/search', methods=['GET'])
@@ -158,7 +215,6 @@ def api_search():
158
  return jsonify({"error": "Database not available."}), 503
159
 
160
  try:
161
- logger.info(f"API: Performing semantic search for: '{query}'")
162
  results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
163
  formatted_articles = format_articles_from_db(results)
164
  return jsonify(formatted_articles)
@@ -177,13 +233,7 @@ def api_get_articles_by_category(category_name):
177
  return jsonify({"error": "Database not available."}), 503
178
 
179
  try:
180
- logger.info(f"API: Fetching articles for category '{category_name}'")
181
- # Use Chroma's metadata filtering for efficiency
182
- results = vector_db.get(
183
- where={"category": category_name},
184
- include=['documents', 'metadatas']
185
- )
186
-
187
  formatted_articles = format_articles_from_db(results)
188
  paginated_results = formatted_articles[offset : offset + limit]
189
 
@@ -196,60 +246,8 @@ def api_get_articles_by_category(category_name):
196
  logger.error(f"API Category fetch error: {e}", exc_info=True)
197
  return jsonify({"error": "An internal error occurred."}), 500
198
 
199
- @app.route('/api/v1/categories', methods=['GET'])
200
- def api_get_categories():
201
- """API endpoint to get a list of all unique categories."""
202
- vector_db = get_vector_db()
203
- if not vector_db:
204
- return jsonify({"error": "Database not available."}), 503
205
-
206
- try:
207
- # Fetch only metadata to be efficient
208
- all_metadata = vector_db.get(include=['metadatas'])['metadatas']
209
- if not all_metadata:
210
- return jsonify([])
211
-
212
- unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
213
- return jsonify(unique_categories)
214
- except Exception as e:
215
- logger.error(f"API Categories fetch error: {e}", exc_info=True)
216
- return jsonify({"error": "An internal error occurred."}), 500
217
-
218
- @app.route('/api/v1/status', methods=['GET'])
219
- def api_get_status():
220
- """API endpoint to check the data processing status."""
221
- return jsonify({
222
- "status": "complete" if loading_complete else "loading",
223
- "last_update_time": last_update_time
224
- })
225
-
226
- # --------------------------------------------------------------------------------
227
- # --- Web Application Routes ---
228
- # --------------------------------------------------------------------------------
229
-
230
- @app.route('/')
231
- def index():
232
- """Renders the main web page. Data is fetched by frontend JavaScript."""
233
- return render_template("index.html")
234
-
235
- @app.route('/card')
236
- def card_load():
237
- """Renders a sample card component."""
238
- return render_template("card.html")
239
 
240
  # --- Main Application Runner ---
241
  if __name__ == "__main__":
242
- # On startup, ensure the database exists or download it.
243
- if not os.path.exists(LOCAL_DB_DIR):
244
- logger.info(f"No local DB found at '{LOCAL_DB_DIR}'. Downloading from Hugging Face Hub...")
245
- download_from_hf_hub()
246
-
247
- # Initialize the vector DB instance
248
- get_vector_db()
249
-
250
- # Start the first background update immediately.
251
- threading.Thread(target=load_feeds_in_background, daemon=True).start()
252
-
253
- # Note: For a production environment, use a proper WSGI server like Gunicorn or uWSGI
254
- # instead of Flask's built-in development server.
255
  app.run(host="0.0.0.0", port=7860, debug=False)
 
4
  from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
5
  import logging
6
  import time
 
7
  from datetime import datetime
8
  from langchain.vectorstores import Chroma
9
  from langchain.embeddings import HuggingFaceEmbeddings
 
20
  # --- Embedding and Vector DB Management ---
21
def get_embedding_model():
    """Return the process-wide HuggingFace embedding model (lazy singleton).

    The instance is cached as an attribute on the function object itself, so
    the sentence-transformer weights are loaded at most once per process.
    """
    try:
        # Fast path: model already built on a previous call.
        return get_embedding_model.model
    except AttributeError:
        get_embedding_model.model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        return get_embedding_model.model
 
30
  logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
31
  return None
32
  try:
 
33
  if not hasattr(get_vector_db, "db_instance"):
34
  get_vector_db.db_instance = Chroma(
35
  persist_directory=LOCAL_DB_DIR,
 
39
  return get_vector_db.db_instance
40
  except Exception as e:
41
  logger.error(f"Failed to load vector DB: {e}")
 
42
  if hasattr(get_vector_db, "db_instance"):
43
  delattr(get_vector_db, "db_instance")
44
  return None
 
47
  def load_feeds_in_background():
48
  """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
49
  global loading_complete, last_update_time
 
50
  if not loading_complete:
51
  logger.info("An update is already in progress. Skipping.")
52
  return
 
66
  finally:
67
  loading_complete = True
68
 
69
+ # --- Data Transformation Helper (Used by both SSR and API) ---
70
  def format_articles_from_db(docs):
71
  """
72
+ Takes ChromaDB documents and formats them into a standardized list of article dictionaries.
 
73
  """
74
  enriched_articles = []
75
  seen_keys = set()
76
 
 
77
  items = []
78
+ # Handle .get() results (dict of lists)
79
  if isinstance(docs, dict) and 'metadatas' in docs:
80
+ items = zip(docs.get('documents', []), docs.get('metadatas', []))
81
+ # Handle similarity_search results (list of (Document, score) tuples)
82
  elif isinstance(docs, list):
83
  items = [(doc.page_content, doc.metadata) for doc, score in docs]
84
 
 
87
 
88
  title = meta.get("title", "No Title")
89
  link = meta.get("link", "")
 
90
  key = f"{title}|{link}"
91
 
92
  if key not in seen_keys:
93
  seen_keys.add(key)
94
 
 
95
  published_str = meta.get("published", "").strip()
96
  try:
97
+ # The format from your original `process_and_store_articles`
98
  published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
99
  except (ValueError, TypeError):
100
+ published_iso = datetime.utcnow().isoformat()
101
 
102
  enriched_articles.append({
103
+ "id": meta.get("id", link),
104
  "title": title,
105
  "link": link,
106
  "description": meta.get("original_description", "No Description"),
 
109
  "image": meta.get("image", "svg"),
110
  })
111
 
 
112
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
113
  return enriched_articles
114
 
115
  # --------------------------------------------------------------------------------
116
+ # --- Web Application Route (Server-Side Rendered) ---
117
  # --------------------------------------------------------------------------------
118
@app.route('/')
def index():
    """Server-side render of the main page.

    Ensures the Chroma DB exists locally (downloading it from the Hub on
    first run), kicks off a background feed refresh, then builds a
    per-category view of the newest articles and hands it to the template.
    """
    # First run: pull the persisted vector DB from the Hugging Face Hub.
    if not os.path.exists(LOCAL_DB_DIR):
        logger.info(f"No Chroma DB found at '{LOCAL_DB_DIR}', downloading from Hugging Face Hub...")
        download_from_hf_hub()

    # Fire off a refresh; load_feeds_in_background() skips itself if one is already running.
    threading.Thread(target=load_feeds_in_background, daemon=True).start()

    try:
        db = get_vector_db()
        if not db:
            raise ConnectionError("Database could not be loaded.")

        raw_docs = db.get(include=['documents', 'metadatas'])

        # Nothing stored yet — render the empty/loading state.
        if not raw_docs or not raw_docs['metadatas']:
            logger.info("No articles in the DB yet for initial render.")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

        articles = format_articles_from_db(raw_docs)

        # Bucket the enriched articles by their category.
        buckets = {}
        for article in articles:
            buckets.setdefault(article["category"], []).append(article)

        # Alphabetical category order, capped at 10 articles each for the front page.
        categorized_articles = {name: buckets[name][:10] for name in sorted(buckets)}

        return render_template(
            "index.html",
            categorized_articles=categorized_articles,
            has_articles=True,
            loading=not loading_complete
        )
    except Exception as e:
        logger.error(f"Error rendering index page: {e}", exc_info=True)
        # Fallback render so the page still loads on any failure.
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True, error="Could not load articles.")
169
+
170
+
171
+ # Your original search route, which was also server-side
172
+ # We can keep it or decide to use the API for search on the frontend
173
@app.route('/search', methods=['POST'])
def search():
    """Hybrid search endpoint: accepts a form POST, replies with JSON.

    Runs a semantic similarity search over the vector DB and returns the
    matches grouped by category for the frontend JavaScript to render.
    """
    term = request.form.get('search')
    if not term:
        # Empty query: reply with an empty, non-loading payload.
        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})

    db = get_vector_db()
    if not db:
        return jsonify({"error": "Database not available"}), 503

    hits = db.similarity_search_with_relevance_scores(term, k=50)
    articles = format_articles_from_db(hits)

    # Group the results by category for the client.
    grouped = {}
    for article in articles:
        grouped.setdefault(article["category"], []).append(article)

    return jsonify({
        "categorized_articles": grouped,
        "has_articles": bool(articles),
        "loading": False
    })
198
+
199
+
200
+ # --------------------------------------------------------------------------------
201
+ # --- NEW: Standalone API v1 Endpoints (Return only JSON) ---
202
  # --------------------------------------------------------------------------------
203
 
204
  @app.route('/api/v1/search', methods=['GET'])
 
215
  return jsonify({"error": "Database not available."}), 503
216
 
217
  try:
 
218
  results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
219
  formatted_articles = format_articles_from_db(results)
220
  return jsonify(formatted_articles)
 
233
  return jsonify({"error": "Database not available."}), 503
234
 
235
  try:
236
+ results = vector_db.get(where={"category": category_name}, include=['documents', 'metadatas'])
 
 
 
 
 
 
237
  formatted_articles = format_articles_from_db(results)
238
  paginated_results = formatted_articles[offset : offset + limit]
239
 
 
246
  logger.error(f"API Category fetch error: {e}", exc_info=True)
247
  return jsonify({"error": "An internal error occurred."}), 500
248
 
249
+ # Other routes like /card, /get_updates, etc. from your original file would go here.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  # --- Main Application Runner ---
252
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  app.run(host="0.0.0.0", port=7860, debug=False)