Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,6 @@ from flask import Flask, render_template, request, jsonify
|
|
4 |
from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
|
5 |
import logging
|
6 |
import time
|
7 |
-
import json
|
8 |
from datetime import datetime
|
9 |
from langchain.vectorstores import Chroma
|
10 |
from langchain.embeddings import HuggingFaceEmbeddings
|
@@ -21,7 +20,6 @@ last_update_time = None
|
|
21 |
# --- Embedding and Vector DB Management ---
|
22 |
def get_embedding_model():
|
23 |
"""Initializes and returns a singleton HuggingFace embedding model."""
|
24 |
-
# Using a simple hasattr check for a singleton pattern
|
25 |
if not hasattr(get_embedding_model, "model"):
|
26 |
get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
27 |
return get_embedding_model.model
|
@@ -32,7 +30,6 @@ def get_vector_db():
|
|
32 |
logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
|
33 |
return None
|
34 |
try:
|
35 |
-
# Using a simple hasattr check for a singleton pattern
|
36 |
if not hasattr(get_vector_db, "db_instance"):
|
37 |
get_vector_db.db_instance = Chroma(
|
38 |
persist_directory=LOCAL_DB_DIR,
|
@@ -42,7 +39,6 @@ def get_vector_db():
|
|
42 |
return get_vector_db.db_instance
|
43 |
except Exception as e:
|
44 |
logger.error(f"Failed to load vector DB: {e}")
|
45 |
-
# Invalidate instance on failure
|
46 |
if hasattr(get_vector_db, "db_instance"):
|
47 |
delattr(get_vector_db, "db_instance")
|
48 |
return None
|
@@ -51,7 +47,6 @@ def get_vector_db():
|
|
51 |
def load_feeds_in_background():
|
52 |
"""Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
|
53 |
global loading_complete, last_update_time
|
54 |
-
# Ensure only one background process runs at a time
|
55 |
if not loading_complete:
|
56 |
logger.info("An update is already in progress. Skipping.")
|
57 |
return
|
@@ -71,19 +66,19 @@ def load_feeds_in_background():
|
|
71 |
finally:
|
72 |
loading_complete = True
|
73 |
|
74 |
-
# --- Data Transformation Helper ---
|
75 |
def format_articles_from_db(docs):
|
76 |
"""
|
77 |
-
Takes ChromaDB documents
|
78 |
-
Handles deduplication based on title and link.
|
79 |
"""
|
80 |
enriched_articles = []
|
81 |
seen_keys = set()
|
82 |
|
83 |
-
# The 'docs' can be a list of (Document, score) tuples or a dict from .get()
|
84 |
items = []
|
|
|
85 |
if isinstance(docs, dict) and 'metadatas' in docs:
|
86 |
-
items = zip(docs
|
|
|
87 |
elif isinstance(docs, list):
|
88 |
items = [(doc.page_content, doc.metadata) for doc, score in docs]
|
89 |
|
@@ -92,21 +87,20 @@ def format_articles_from_db(docs):
|
|
92 |
|
93 |
title = meta.get("title", "No Title")
|
94 |
link = meta.get("link", "")
|
95 |
-
# Use a composite key to identify unique articles
|
96 |
key = f"{title}|{link}"
|
97 |
|
98 |
if key not in seen_keys:
|
99 |
seen_keys.add(key)
|
100 |
|
101 |
-
# Safely parse the published date
|
102 |
published_str = meta.get("published", "").strip()
|
103 |
try:
|
|
|
104 |
published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
|
105 |
except (ValueError, TypeError):
|
106 |
-
published_iso = datetime.utcnow().isoformat()
|
107 |
|
108 |
enriched_articles.append({
|
109 |
-
"id": meta.get("id", link),
|
110 |
"title": title,
|
111 |
"link": link,
|
112 |
"description": meta.get("original_description", "No Description"),
|
@@ -115,33 +109,96 @@ def format_articles_from_db(docs):
|
|
115 |
"image": meta.get("image", "svg"),
|
116 |
})
|
117 |
|
118 |
-
# Sort by date descending by default
|
119 |
enriched_articles.sort(key=lambda x: x["published"], reverse=True)
|
120 |
return enriched_articles
|
121 |
|
122 |
# --------------------------------------------------------------------------------
|
123 |
-
# ---
|
124 |
# --------------------------------------------------------------------------------
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
#
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
#
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
#
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
# --------------------------------------------------------------------------------
|
146 |
|
147 |
@app.route('/api/v1/search', methods=['GET'])
|
@@ -158,7 +215,6 @@ def api_search():
|
|
158 |
return jsonify({"error": "Database not available."}), 503
|
159 |
|
160 |
try:
|
161 |
-
logger.info(f"API: Performing semantic search for: '{query}'")
|
162 |
results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
|
163 |
formatted_articles = format_articles_from_db(results)
|
164 |
return jsonify(formatted_articles)
|
@@ -177,13 +233,7 @@ def api_get_articles_by_category(category_name):
|
|
177 |
return jsonify({"error": "Database not available."}), 503
|
178 |
|
179 |
try:
|
180 |
-
|
181 |
-
# Use Chroma's metadata filtering for efficiency
|
182 |
-
results = vector_db.get(
|
183 |
-
where={"category": category_name},
|
184 |
-
include=['documents', 'metadatas']
|
185 |
-
)
|
186 |
-
|
187 |
formatted_articles = format_articles_from_db(results)
|
188 |
paginated_results = formatted_articles[offset : offset + limit]
|
189 |
|
@@ -196,60 +246,8 @@ def api_get_articles_by_category(category_name):
|
|
196 |
logger.error(f"API Category fetch error: {e}", exc_info=True)
|
197 |
return jsonify({"error": "An internal error occurred."}), 500
|
198 |
|
199 |
-
|
200 |
-
def api_get_categories():
|
201 |
-
"""API endpoint to get a list of all unique categories."""
|
202 |
-
vector_db = get_vector_db()
|
203 |
-
if not vector_db:
|
204 |
-
return jsonify({"error": "Database not available."}), 503
|
205 |
-
|
206 |
-
try:
|
207 |
-
# Fetch only metadata to be efficient
|
208 |
-
all_metadata = vector_db.get(include=['metadatas'])['metadatas']
|
209 |
-
if not all_metadata:
|
210 |
-
return jsonify([])
|
211 |
-
|
212 |
-
unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
|
213 |
-
return jsonify(unique_categories)
|
214 |
-
except Exception as e:
|
215 |
-
logger.error(f"API Categories fetch error: {e}", exc_info=True)
|
216 |
-
return jsonify({"error": "An internal error occurred."}), 500
|
217 |
-
|
218 |
-
@app.route('/api/v1/status', methods=['GET'])
|
219 |
-
def api_get_status():
|
220 |
-
"""API endpoint to check the data processing status."""
|
221 |
-
return jsonify({
|
222 |
-
"status": "complete" if loading_complete else "loading",
|
223 |
-
"last_update_time": last_update_time
|
224 |
-
})
|
225 |
-
|
226 |
-
# --------------------------------------------------------------------------------
|
227 |
-
# --- Web Application Routes ---
|
228 |
-
# --------------------------------------------------------------------------------
|
229 |
-
|
230 |
-
@app.route('/')
|
231 |
-
def index():
|
232 |
-
"""Renders the main web page. Data is fetched by frontend JavaScript."""
|
233 |
-
return render_template("index.html")
|
234 |
-
|
235 |
-
@app.route('/card')
|
236 |
-
def card_load():
|
237 |
-
"""Renders a sample card component."""
|
238 |
-
return render_template("card.html")
|
239 |
|
240 |
# --- Main Application Runner ---
|
241 |
if __name__ == "__main__":
|
242 |
-
# On startup, ensure the database exists or download it.
|
243 |
-
if not os.path.exists(LOCAL_DB_DIR):
|
244 |
-
logger.info(f"No local DB found at '{LOCAL_DB_DIR}'. Downloading from Hugging Face Hub...")
|
245 |
-
download_from_hf_hub()
|
246 |
-
|
247 |
-
# Initialize the vector DB instance
|
248 |
-
get_vector_db()
|
249 |
-
|
250 |
-
# Start the first background update immediately.
|
251 |
-
threading.Thread(target=load_feeds_in_background, daemon=True).start()
|
252 |
-
|
253 |
-
# Note: For a production environment, use a proper WSGI server like Gunicorn or uWSGI
|
254 |
-
# instead of Flask's built-in development server.
|
255 |
app.run(host="0.0.0.0", port=7860, debug=False)
|
|
|
4 |
from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
|
5 |
import logging
|
6 |
import time
|
|
|
7 |
from datetime import datetime
|
8 |
from langchain.vectorstores import Chroma
|
9 |
from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
20 |
# --- Embedding and Vector DB Management ---
|
21 |
def get_embedding_model():
|
22 |
"""Initializes and returns a singleton HuggingFace embedding model."""
|
|
|
23 |
if not hasattr(get_embedding_model, "model"):
|
24 |
get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
25 |
return get_embedding_model.model
|
|
|
30 |
logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
|
31 |
return None
|
32 |
try:
|
|
|
33 |
if not hasattr(get_vector_db, "db_instance"):
|
34 |
get_vector_db.db_instance = Chroma(
|
35 |
persist_directory=LOCAL_DB_DIR,
|
|
|
39 |
return get_vector_db.db_instance
|
40 |
except Exception as e:
|
41 |
logger.error(f"Failed to load vector DB: {e}")
|
|
|
42 |
if hasattr(get_vector_db, "db_instance"):
|
43 |
delattr(get_vector_db, "db_instance")
|
44 |
return None
|
|
|
47 |
def load_feeds_in_background():
|
48 |
"""Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
|
49 |
global loading_complete, last_update_time
|
|
|
50 |
if not loading_complete:
|
51 |
logger.info("An update is already in progress. Skipping.")
|
52 |
return
|
|
|
66 |
finally:
|
67 |
loading_complete = True
|
68 |
|
69 |
+
# --- Data Transformation Helper (Used by both SSR and API) ---
|
70 |
def format_articles_from_db(docs):
|
71 |
"""
|
72 |
+
Takes ChromaDB documents and formats them into a standardized list of article dictionaries.
|
|
|
73 |
"""
|
74 |
enriched_articles = []
|
75 |
seen_keys = set()
|
76 |
|
|
|
77 |
items = []
|
78 |
+
# Handle .get() results (dict of lists)
|
79 |
if isinstance(docs, dict) and 'metadatas' in docs:
|
80 |
+
items = zip(docs.get('documents', []), docs.get('metadatas', []))
|
81 |
+
# Handle similarity_search results (list of (Document, score) tuples)
|
82 |
elif isinstance(docs, list):
|
83 |
items = [(doc.page_content, doc.metadata) for doc, score in docs]
|
84 |
|
|
|
87 |
|
88 |
title = meta.get("title", "No Title")
|
89 |
link = meta.get("link", "")
|
|
|
90 |
key = f"{title}|{link}"
|
91 |
|
92 |
if key not in seen_keys:
|
93 |
seen_keys.add(key)
|
94 |
|
|
|
95 |
published_str = meta.get("published", "").strip()
|
96 |
try:
|
97 |
+
# The format from your original `process_and_store_articles`
|
98 |
published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
|
99 |
except (ValueError, TypeError):
|
100 |
+
published_iso = datetime.utcnow().isoformat()
|
101 |
|
102 |
enriched_articles.append({
|
103 |
+
"id": meta.get("id", link),
|
104 |
"title": title,
|
105 |
"link": link,
|
106 |
"description": meta.get("original_description", "No Description"),
|
|
|
109 |
"image": meta.get("image", "svg"),
|
110 |
})
|
111 |
|
|
|
112 |
enriched_articles.sort(key=lambda x: x["published"], reverse=True)
|
113 |
return enriched_articles
|
114 |
|
115 |
# --------------------------------------------------------------------------------
|
116 |
+
# --- Web Application Route (Server-Side Rendered) ---
|
117 |
# --------------------------------------------------------------------------------
|
118 |
+
@app.route('/')
|
119 |
+
def index():
|
120 |
+
"""
|
121 |
+
Renders the main web page by fetching, processing, and passing data
|
122 |
+
to the template on the server side. This preserves the original functionality.
|
123 |
+
"""
|
124 |
+
# Perform startup checks
|
125 |
+
if not os.path.exists(LOCAL_DB_DIR):
|
126 |
+
logger.info(f"No Chroma DB found at '{LOCAL_DB_DIR}', downloading from Hugging Face Hub...")
|
127 |
+
download_from_hf_hub()
|
128 |
+
|
129 |
+
# Trigger background update
|
130 |
+
threading.Thread(target=load_feeds_in_background, daemon=True).start()
|
131 |
+
|
132 |
+
try:
|
133 |
+
# Fetch all data from the DB for rendering
|
134 |
+
vector_db = get_vector_db()
|
135 |
+
if not vector_db:
|
136 |
+
raise ConnectionError("Database could not be loaded.")
|
137 |
+
|
138 |
+
all_docs = vector_db.get(include=['documents', 'metadatas'])
|
139 |
+
|
140 |
+
if not all_docs or not all_docs['metadatas']:
|
141 |
+
logger.info("No articles in the DB yet for initial render.")
|
142 |
+
return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
|
143 |
+
|
144 |
+
# Process and categorize articles for the template
|
145 |
+
enriched_articles = format_articles_from_db(all_docs)
|
146 |
+
|
147 |
+
categorized_articles = {}
|
148 |
+
for article in enriched_articles:
|
149 |
+
cat = article["category"]
|
150 |
+
categorized_articles.setdefault(cat, []).append(article)
|
151 |
+
|
152 |
+
categorized_articles = dict(sorted(categorized_articles.items()))
|
153 |
+
|
154 |
+
# Limit to 10 articles per category for the main page view
|
155 |
+
for cat in categorized_articles:
|
156 |
+
categorized_articles[cat] = categorized_articles[cat][:10]
|
157 |
+
|
158 |
+
return render_template(
|
159 |
+
"index.html",
|
160 |
+
categorized_articles=categorized_articles,
|
161 |
+
has_articles=True,
|
162 |
+
# The original code didn't pass loading, but it's good practice
|
163 |
+
loading=not loading_complete
|
164 |
+
)
|
165 |
+
except Exception as e:
|
166 |
+
logger.error(f"Error rendering index page: {e}", exc_info=True)
|
167 |
+
# Fallback render in case of error
|
168 |
+
return render_template("index.html", categorized_articles={}, has_articles=False, loading=True, error="Could not load articles.")
|
169 |
+
|
170 |
+
|
171 |
+
# Your original search route, which was also server-side
|
172 |
+
# We can keep it or decide to use the API for search on the frontend
|
173 |
+
@app.route('/search', methods=['POST'])
|
174 |
+
def search():
|
175 |
+
# This route returns a JSON payload to be handled by JavaScript.
|
176 |
+
# It functions like an API endpoint and is a good example of a hybrid approach.
|
177 |
+
query = request.form.get('search')
|
178 |
+
if not query:
|
179 |
+
return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
|
180 |
+
|
181 |
+
vector_db = get_vector_db()
|
182 |
+
if not vector_db:
|
183 |
+
return jsonify({"error": "Database not available"}), 503
|
184 |
+
|
185 |
+
results = vector_db.similarity_search_with_relevance_scores(query, k=50)
|
186 |
+
enriched_articles = format_articles_from_db(results)
|
187 |
+
|
188 |
+
categorized_articles = {}
|
189 |
+
for article in enriched_articles:
|
190 |
+
cat = article["category"]
|
191 |
+
categorized_articles.setdefault(cat, []).append(article)
|
192 |
+
|
193 |
+
return jsonify({
|
194 |
+
"categorized_articles": categorized_articles,
|
195 |
+
"has_articles": bool(enriched_articles),
|
196 |
+
"loading": False
|
197 |
+
})
|
198 |
+
|
199 |
+
|
200 |
+
# --------------------------------------------------------------------------------
|
201 |
+
# --- NEW: Standalone API v1 Endpoints (Return only JSON) ---
|
202 |
# --------------------------------------------------------------------------------
|
203 |
|
204 |
@app.route('/api/v1/search', methods=['GET'])
|
|
|
215 |
return jsonify({"error": "Database not available."}), 503
|
216 |
|
217 |
try:
|
|
|
218 |
results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
|
219 |
formatted_articles = format_articles_from_db(results)
|
220 |
return jsonify(formatted_articles)
|
|
|
233 |
return jsonify({"error": "Database not available."}), 503
|
234 |
|
235 |
try:
|
236 |
+
results = vector_db.get(where={"category": category_name}, include=['documents', 'metadatas'])
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
formatted_articles = format_articles_from_db(results)
|
238 |
paginated_results = formatted_articles[offset : offset + limit]
|
239 |
|
|
|
246 |
logger.error(f"API Category fetch error: {e}", exc_info=True)
|
247 |
return jsonify({"error": "An internal error occurred."}), 500
|
248 |
|
249 |
+
# Other routes like /card, /get_updates, etc. from your original file would go here.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
|
251 |
# --- Main Application Runner ---
|
252 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
app.run(host="0.0.0.0", port=7860, debug=False)
|