broadfield-dev committed
Commit 679afad · verified · 1 Parent(s): bab9790

Update app.py

Files changed (1):
  1. app.py +171 -153
app.py CHANGED
@@ -6,232 +6,250 @@ import logging
  import time
  import json
  from datetime import datetime
- import hashlib
  from langchain.vectorstores import Chroma
  from langchain.embeddings import HuggingFaceEmbeddings

+ # --- Basic Flask App Setup ---
  app = Flask(__name__)
-
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # --- Global State Management ---
  loading_complete = True
- last_update_time = time.time()
- last_data_hash = None
+ last_update_time = None

+ # --- Embedding and Vector DB Management ---
  def get_embedding_model():
+     """Initializes and returns a singleton HuggingFace embedding model."""
+     # Using a simple hasattr check for a singleton pattern
      if not hasattr(get_embedding_model, "model"):
          get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
      return get_embedding_model.model

  def get_vector_db():
+     """Initializes and returns a singleton Chroma DB client."""
      if not os.path.exists(LOCAL_DB_DIR):
+         logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
          return None
      try:
-         return Chroma(
-             persist_directory=LOCAL_DB_DIR,
-             embedding_function=get_embedding_model(),
-             collection_name="news_articles"
-         )
+         # Using a simple hasattr check for a singleton pattern
+         if not hasattr(get_vector_db, "db_instance"):
+             get_vector_db.db_instance = Chroma(
+                 persist_directory=LOCAL_DB_DIR,
+                 embedding_function=get_embedding_model(),
+                 collection_name="news_articles"
+             )
+         return get_vector_db.db_instance
      except Exception as e:
          logger.error(f"Failed to load vector DB: {e}")
+         # Invalidate instance on failure
+         if hasattr(get_vector_db, "db_instance"):
+             delattr(get_vector_db, "db_instance")
          return None

+ # --- Background Processing ---
  def load_feeds_in_background():
+     """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
      global loading_complete, last_update_time
+     # Ensure only one background process runs at a time
+     if not loading_complete:
+         logger.info("An update is already in progress. Skipping.")
+         return
+
+     loading_complete = False
      try:
-         logger.info("Starting background RSS feed fetch")
+         logger.info("Starting background RSS feed fetch and processing...")
          articles = fetch_rss_feeds()
-         logger.info(f"Fetched {len(articles)} articles")
-         process_and_store_articles(articles)
-         last_update_time = time.time()
-         logger.info("Background feed processing complete")
-         upload_to_hf_hub()
+         logger.info(f"Fetched {len(articles)} articles from RSS feeds.")
+         if articles:
+             process_and_store_articles(articles)
+             upload_to_hf_hub()
+         last_update_time = datetime.now().isoformat()
+         logger.info("Background feed processing complete.")
      except Exception as e:
          logger.error(f"Error in background feed loading: {e}")
      finally:
          loading_complete = True

- def get_all_docs_from_db():
-     vector_db = get_vector_db()
-     if not vector_db or vector_db._collection.count() == 0:
-         return {'documents': [], 'metadatas': []}
-     return vector_db.get(include=['documents', 'metadatas'])
-
- def compute_data_hash(categorized_articles):
-     if not categorized_articles: return ""
-     data_str = ""
-     for cat, articles in sorted(categorized_articles.items()):
-         for article in sorted(articles, key=lambda x: x["published"]):
-             data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
-     return hashlib.sha256(data_str.encode('utf-8')).hexdigest()
-
- def process_docs_into_articles(docs_data):
+ # --- Data Transformation Helper ---
+ def format_articles_from_db(docs):
+     """
+     Takes ChromaDB documents (with metadata) and formats them into a standardized list of article dictionaries.
+     Handles deduplication based on title and link.
+     """
      enriched_articles = []
      seen_keys = set()
-     for doc, meta in zip(docs_data['documents'], docs_data['metadatas']):
+
+     # The 'docs' can be a list of (Document, score) tuples or a dict from .get()
+     items = []
+     if isinstance(docs, dict) and 'metadatas' in docs:
+         items = zip(docs['documents'], docs['metadatas'])
+     elif isinstance(docs, list):
+         items = [(doc.page_content, doc.metadata) for doc, score in docs]
+
+     for doc_content, meta in items:
          if not meta: continue
+
          title = meta.get("title", "No Title")
          link = meta.get("link", "")
-         description = meta.get("original_description", "No Description")
-         published = meta.get("published", "Unknown Date").strip()
+         # Use a composite key to identify unique articles
+         key = f"{title}|{link}"

-         key = f"{title}|{link}|{published}"
          if key not in seen_keys:
              seen_keys.add(key)
+
+             # Safely parse the published date
+             published_str = meta.get("published", "").strip()
              try:
-                 published_iso = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat()
+                 published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
              except (ValueError, TypeError):
-                 published_iso = "1970-01-01T00:00:00"
+                 published_iso = datetime.utcnow().isoformat()  # Default to now if format is wrong

              enriched_articles.append({
+                 "id": meta.get("id", link),  # Provide a unique ID
                  "title": title,
                  "link": link,
-                 "description": description,
+                 "description": meta.get("original_description", "No Description"),
                  "category": meta.get("category", "Uncategorized"),
                  "published": published_iso,
                  "image": meta.get("image", "svg"),
              })
+
+     # Sort by date descending by default
+     enriched_articles.sort(key=lambda x: x["published"], reverse=True)
      return enriched_articles

- @app.route('/')
- def index():
-     global loading_complete, last_update_time, last_data_hash
+ # --------------------------------------------------------------------------------
+ # --- API v1 Endpoints ---
+ # --------------------------------------------------------------------------------
+ #
+ # API Usage Guide:
+ #
+ # GET /api/v1/search?q=<query>&limit=<n>
+ #   - Performs semantic search.
+ #   - `q`: The search term (required).
+ #   - `limit`: Max number of results to return (optional, default=20).
+ #
+ # GET /api/v1/articles/category/<name>?limit=<n>&offset=<o>
+ #   - Retrieves all articles for a given category.
+ #   - `name`: The category name (e.g., "Technology").
+ #   - `limit`: For pagination (optional, default=20).
+ #   - `offset`: For pagination (optional, default=0).
+ #
+ # GET /api/v1/categories
+ #   - Returns a list of all unique article categories.
+ #
+ # GET /api/v1/status
+ #   - Checks the status of the background data processing task.
+ #
+ # --------------------------------------------------------------------------------
+
+ @app.route('/api/v1/search', methods=['GET'])
+ def api_search():
+     """API endpoint for semantic search."""
+     query = request.args.get('q')
+     limit = request.args.get('limit', default=20, type=int)

-     if not os.path.exists(LOCAL_DB_DIR):
-         logger.info(f"No Chroma DB found at '{LOCAL_DB_DIR}', downloading from Hugging Face Hub...")
-         download_from_hf_hub()
+     if not query:
+         return jsonify({"error": "Query parameter 'q' is required."}), 400

-     loading_complete = False
-     threading.Thread(target=load_feeds_in_background, daemon=True).start()
+     vector_db = get_vector_db()
+     if not vector_db:
+         return jsonify({"error": "Database not available."}), 503

      try:
-         all_docs = get_all_docs_from_db()
-         if not all_docs['metadatas']:
-             logger.info("No articles in the DB yet")
-             return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
-
-         enriched_articles = process_docs_into_articles(all_docs)
-         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
-
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             categorized_articles.setdefault(cat, []).append(article)
-
-         categorized_articles = dict(sorted(categorized_articles.items()))
-
-         for cat in categorized_articles:
-             categorized_articles[cat] = categorized_articles[cat][:10]
-
-         last_data_hash = compute_data_hash(categorized_articles)
-
-         return render_template("index.html", categorized_articles=categorized_articles, has_articles=True, loading=True)
+         logger.info(f"API: Performing semantic search for: '{query}'")
+         results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
+         formatted_articles = format_articles_from_db(results)
+         return jsonify(formatted_articles)
      except Exception as e:
-         logger.error(f"Error retrieving articles at startup: {e}", exc_info=True)
-         return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+         logger.error(f"API Search error: {e}", exc_info=True)
+         return jsonify({"error": "An internal error occurred during search."}), 500

- @app.route('/search', methods=['POST'])
- def search():
-     query = request.form.get('search')
-     if not query:
-         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+ @app.route('/api/v1/articles/category/<string:category_name>', methods=['GET'])
+ def api_get_articles_by_category(category_name):
+     """API endpoint to get articles filtered by category with pagination."""
+     limit = request.args.get('limit', default=20, type=int)
+     offset = request.args.get('offset', default=0, type=int)
+
+     vector_db = get_vector_db()
+     if not vector_db:
+         return jsonify({"error": "Database not available."}), 503

      try:
-         logger.info(f"Performing semantic search for: '{query}'")
-         vector_db = get_vector_db()
-         if not vector_db:
-             return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+         logger.info(f"API: Fetching articles for category '{category_name}'")
+         # Use Chroma's metadata filtering for efficiency
+         results = vector_db.get(
+             where={"category": category_name},
+             include=['documents', 'metadatas']
+         )

-         results = vector_db.similarity_search_with_relevance_scores(query, k=50)
+         formatted_articles = format_articles_from_db(results)
+         paginated_results = formatted_articles[offset : offset + limit]

-         enriched_articles = []
-         seen_keys = set()
-         for doc, score in results:
-             meta = doc.metadata
-             title = meta.get("title", "No Title")
-             link = meta.get("link", "")
-             key = f"{title}|{link}|{meta.get('published', '')}"
-             if key not in seen_keys:
-                 seen_keys.add(key)
-                 enriched_articles.append({
-                     "title": title,
-                     "link": link,
-                     "description": meta.get("original_description", "No Description"),
-                     "category": meta.get("category", "Uncategorized"),
-                     "published": meta.get("published", "Unknown Date"),
-                     "image": meta.get("image", "svg"),
-                 })
-
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             categorized_articles.setdefault(cat, []).append(article)
-
          return jsonify({
-             "categorized_articles": categorized_articles,
-             "has_articles": bool(enriched_articles),
-             "loading": False
+             "category": category_name,
+             "total_articles": len(formatted_articles),
+             "articles": paginated_results
          })
      except Exception as e:
-         logger.error(f"Semantic search error: {e}", exc_info=True)
-         return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False}), 500
+         logger.error(f"API Category fetch error: {e}", exc_info=True)
+         return jsonify({"error": "An internal error occurred."}), 500

- @app.route('/get_all_articles/<category>')
- def get_all_articles(category):
-     try:
-         all_docs = get_all_docs_from_db()
-         enriched_articles = process_docs_into_articles(all_docs)
-
-         category_articles = [
-             article for article in enriched_articles if article["category"] == category
-         ]
+ @app.route('/api/v1/categories', methods=['GET'])
+ def api_get_categories():
+     """API endpoint to get a list of all unique categories."""
+     vector_db = get_vector_db()
+     if not vector_db:
+         return jsonify({"error": "Database not available."}), 503

-         category_articles.sort(key=lambda x: x["published"], reverse=True)
-         return jsonify({"articles": category_articles, "category": category})
+     try:
+         # Fetch only metadata to be efficient
+         all_metadata = vector_db.get(include=['metadatas'])['metadatas']
+         if not all_metadata:
+             return jsonify([])
+
+         unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
+         return jsonify(unique_categories)
      except Exception as e:
-         logger.error(f"Error fetching all articles for category {category}: {e}")
-         return jsonify({"articles": [], "category": category}), 500
+         logger.error(f"API Categories fetch error: {e}", exc_info=True)
+         return jsonify({"error": "An internal error occurred."}), 500

- @app.route('/check_loading')
- def check_loading():
-     global loading_complete, last_update_time
-     return jsonify({"status": "complete" if loading_complete else "loading", "last_update": last_update_time})
+ @app.route('/api/v1/status', methods=['GET'])
+ def api_get_status():
+     """API endpoint to check the data processing status."""
+     return jsonify({
+         "status": "complete" if loading_complete else "loading",
+         "last_update_time": last_update_time
+     })

- @app.route('/get_updates')
- def get_updates():
-     global last_update_time, last_data_hash
-     try:
-         all_docs = get_all_docs_from_db()
-         if not all_docs['metadatas']:
-             return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
-
-         enriched_articles = process_docs_into_articles(all_docs)
-         categorized_articles = {}
-         for article in enriched_articles:
-             cat = article["category"]
-             categorized_articles.setdefault(cat, []).append(article)
-
-         for cat in categorized_articles:
-             categorized_articles[cat].sort(key=lambda x: x["published"], reverse=True)
-             categorized_articles[cat] = categorized_articles[cat][:10]
+ # --------------------------------------------------------------------------------
+ # --- Web Application Routes ---
+ # --------------------------------------------------------------------------------

-         current_data_hash = compute_data_hash(categorized_articles)
-         has_updates = last_data_hash != current_data_hash
-
-         if has_updates:
-             logger.info("New RSS data detected, sending updates to frontend")
-             last_data_hash = current_data_hash
-             return jsonify({"articles": categorized_articles, "last_update": last_update_time, "has_updates": True})
-         else:
-             return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
-     except Exception as e:
-         logger.error(f"Error fetching updates: {e}")
-         return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False}), 500
+ @app.route('/')
+ def index():
+     """Renders the main web page. Data is fetched by frontend JavaScript."""
+     return render_template("index.html")

  @app.route('/card')
  def card_load():
+     """Renders a sample card component."""
      return render_template("card.html")

+ # --- Main Application Runner ---
  if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860)
+     # On startup, ensure the database exists or download it.
+     if not os.path.exists(LOCAL_DB_DIR):
+         logger.info(f"No local DB found at '{LOCAL_DB_DIR}'. Downloading from Hugging Face Hub...")
+         download_from_hf_hub()
+
+     # Initialize the vector DB instance
+     get_vector_db()
+
+     # Start the first background update immediately.
+     threading.Thread(target=load_feeds_in_background, daemon=True).start()
+
+     # Note: For a production environment, use a proper WSGI server like Gunicorn or uWSGI
+     # instead of Flask's built-in development server.
+     app.run(host="0.0.0.0", port=7860, debug=False)
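
For reference, a minimal client sketch against the new /api/v1 surface. This is not part of the commit: it assumes the server is running locally on the port configured in app.run() above and that the third-party `requests` package is installed; the base URL, example query, and category handling are purely illustrative.

# client_example.py — hypothetical usage sketch for the /api/v1 endpoints above
import requests

BASE = "http://localhost:7860"  # assumption: local dev server started via app.run()

# Semantic search: 'q' is required, 'limit' is optional (default 20).
resp = requests.get(f"{BASE}/api/v1/search", params={"q": "space exploration", "limit": 5})
resp.raise_for_status()
for article in resp.json():  # /api/v1/search returns a JSON list of article objects
    print(article["published"], article["title"], article["link"])

# List all unique categories, then page through the first one.
categories = requests.get(f"{BASE}/api/v1/categories").json()
if categories:
    offset = 0
    while True:
        page = requests.get(
            f"{BASE}/api/v1/articles/category/{categories[0]}",
            params={"limit": 20, "offset": offset},
        ).json()  # returns {"category": ..., "total_articles": ..., "articles": [...]}
        if not page["articles"]:
            break
        offset += len(page["articles"])
    print(f"{categories[0]}: saw {offset} of {page['total_articles']} articles")

# Poll the background-refresh status ("loading" or "complete").
print(requests.get(f"{BASE}/api/v1/status").json())

The pagination loop leans on the response contract visible in the diff: api_get_articles_by_category reports "total_articles" alongside each "articles" page, so an empty page (or offset >= total_articles) signals the end.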