app.py CHANGED
@@ -6,232 +6,250 @@ import logging
Old version (lines 6-237; removed lines are marked '-'; several removed lines are only partially captured):

 import time
 import json
 from datetime import datetime
-import hashlib
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 
 app = Flask(__name__)
-
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 loading_complete = True
-last_update_time =
-last_data_hash = None
 
 def get_embedding_model():
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
 
 def get_vector_db():
     if not os.path.exists(LOCAL_DB_DIR):
         return None
     try:
-
-
-
-
-
     except Exception as e:
         logger.error(f"Failed to load vector DB: {e}")
         return None
 
 def load_feeds_in_background():
     global loading_complete, last_update_time
     try:
-        logger.info("Starting background RSS feed fetch")
         articles = fetch_rss_feeds()
-        logger.info(f"Fetched {len(articles)} articles")
-
-
-
-
     except Exception as e:
         logger.error(f"Error in background feed loading: {e}")
     finally:
         loading_complete = True
 
-
-
-
-
-
-
-def compute_data_hash(categorized_articles):
-    if not categorized_articles: return ""
-    data_str = ""
-    for cat, articles in sorted(categorized_articles.items()):
-        for article in sorted(articles, key=lambda x: x["published"]):
-            data_str += f"{cat}|{article['title']}|{article['link']}|{article['published']}|"
-    return hashlib.sha256(data_str.encode('utf-8')).hexdigest()
-
-def process_docs_into_articles(docs_data):
     enriched_articles = []
     seen_keys = set()
-
         if not meta: continue
         title = meta.get("title", "No Title")
         link = meta.get("link", "")
-
-
 
-        key = f"{title}|{link}|{published}"
         if key not in seen_keys:
             seen_keys.add(key)
             try:
-                published_iso = datetime.strptime(
             except (ValueError, TypeError):
-                published_iso =
 
             enriched_articles.append({
                 "title": title,
                 "link": link,
-                "description":
                 "category": meta.get("category", "Uncategorized"),
                 "published": published_iso,
                 "image": meta.get("image", "svg"),
             })
     return enriched_articles
 
-
-
-
 
-    if not
-
-        download_from_hf_hub()
 
-
-
 
     try:
-
-
-
-
-
-        enriched_articles = process_docs_into_articles(all_docs)
-        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
-
-        categorized_articles = {}
-        for article in enriched_articles:
-            cat = article["category"]
-            categorized_articles.setdefault(cat, []).append(article)
-
-        categorized_articles = dict(sorted(categorized_articles.items()))
-
-        for cat in categorized_articles:
-            categorized_articles[cat] = categorized_articles[cat][:10]
-
-        last_data_hash = compute_data_hash(categorized_articles)
-
-        return render_template("index.html", categorized_articles=categorized_articles, has_articles=True, loading=True)
     except Exception as e:
-        logger.error(f"
-        return
 
-@app.route('/
-def
-
-
-
 
     try:
-        logger.info(f"
-
-
-
 
-
 
-        enriched_articles = []
-        seen_keys = set()
-        for doc, score in results:
-            meta = doc.metadata
-            title = meta.get("title", "No Title")
-            link = meta.get("link", "")
-            key = f"{title}|{link}|{meta.get('published', '')}"
-            if key not in seen_keys:
-                seen_keys.add(key)
-                enriched_articles.append({
-                    "title": title,
-                    "link": link,
-                    "description": meta.get("original_description", "No Description"),
-                    "category": meta.get("category", "Uncategorized"),
-                    "published": meta.get("published", "Unknown Date"),
-                    "image": meta.get("image", "svg"),
-                })
-
-        categorized_articles = {}
-        for article in enriched_articles:
-            cat = article["category"]
-            categorized_articles.setdefault(cat, []).append(article)
-
         return jsonify({
-            "
-            "
-            "
         })
     except Exception as e:
-        logger.error(f"
-        return jsonify({"
 
-@app.route('/
-def
-
-
-
-
-        category_articles = [
-            article for article in enriched_articles if article["category"] == category
-        ]
 
-
-
     except Exception as e:
-        logger.error(f"
-        return jsonify({"
 
-@app.route('/
-def
-
-    return jsonify({
 
-
-
-
-    try:
-        all_docs = get_all_docs_from_db()
-        if not all_docs['metadatas']:
-            return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
-
-        enriched_articles = process_docs_into_articles(all_docs)
-        categorized_articles = {}
-        for article in enriched_articles:
-            cat = article["category"]
-            categorized_articles.setdefault(cat, []).append(article)
-
-        for cat in categorized_articles:
-            categorized_articles[cat].sort(key=lambda x: x["published"], reverse=True)
-            categorized_articles[cat] = categorized_articles[cat][:10]
 
-
-
-
-
-            logger.info("New RSS data detected, sending updates to frontend")
-            last_data_hash = current_data_hash
-            return jsonify({"articles": categorized_articles, "last_update": last_update_time, "has_updates": True})
-        else:
-            return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False})
-    except Exception as e:
-        logger.error(f"Error fetching updates: {e}")
-        return jsonify({"articles": {}, "last_update": last_update_time, "has_updates": False}), 500
 
 @app.route('/card')
 def card_load():
     return render_template("card.html")
 
 if __name__ == "__main__":
-
New version (lines 6-255; added lines are marked '+'):

 import time
 import json
 from datetime import datetime
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 
+# --- Basic Flask App Setup ---
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# --- Global State Management ---
 loading_complete = True
+last_update_time = None
 
+# --- Embedding and Vector DB Management ---
 def get_embedding_model():
+    """Initializes and returns a singleton HuggingFace embedding model."""
+    # Using a simple hasattr check for a singleton pattern
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
 
 def get_vector_db():
+    """Initializes and returns a singleton Chroma DB client."""
     if not os.path.exists(LOCAL_DB_DIR):
+        logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
         return None
     try:
+        # Using a simple hasattr check for a singleton pattern
+        if not hasattr(get_vector_db, "db_instance"):
+            get_vector_db.db_instance = Chroma(
+                persist_directory=LOCAL_DB_DIR,
+                embedding_function=get_embedding_model(),
+                collection_name="news_articles"
+            )
+        return get_vector_db.db_instance
     except Exception as e:
         logger.error(f"Failed to load vector DB: {e}")
+        # Invalidate instance on failure
+        if hasattr(get_vector_db, "db_instance"):
+            delattr(get_vector_db, "db_instance")
         return None
 
+# --- Background Processing ---
 def load_feeds_in_background():
+    """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
     global loading_complete, last_update_time
+    # Ensure only one background process runs at a time
+    if not loading_complete:
+        logger.info("An update is already in progress. Skipping.")
+        return
+
+    loading_complete = False
     try:
+        logger.info("Starting background RSS feed fetch and processing...")
         articles = fetch_rss_feeds()
+        logger.info(f"Fetched {len(articles)} articles from RSS feeds.")
+        if articles:
+            process_and_store_articles(articles)
+            upload_to_hf_hub()
+        last_update_time = datetime.now().isoformat()
+        logger.info("Background feed processing complete.")
     except Exception as e:
         logger.error(f"Error in background feed loading: {e}")
     finally:
         loading_complete = True
 
+# --- Data Transformation Helper ---
+def format_articles_from_db(docs):
+    """
+    Takes ChromaDB documents (with metadata) and formats them into a standardized list of article dictionaries.
+    Handles deduplication based on title and link.
+    """
     enriched_articles = []
     seen_keys = set()
+
+    # The 'docs' can be a list of (Document, score) tuples or a dict from .get()
+    items = []
+    if isinstance(docs, dict) and 'metadatas' in docs:
+        items = zip(docs['documents'], docs['metadatas'])
+    elif isinstance(docs, list):
+        items = [(doc.page_content, doc.metadata) for doc, score in docs]
+
+    for doc_content, meta in items:
         if not meta: continue
+
         title = meta.get("title", "No Title")
         link = meta.get("link", "")
+        # Use a composite key to identify unique articles
+        key = f"{title}|{link}"
 
         if key not in seen_keys:
             seen_keys.add(key)
+
+            # Safely parse the published date
+            published_str = meta.get("published", "").strip()
             try:
+                published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
             except (ValueError, TypeError):
+                published_iso = datetime.utcnow().isoformat()  # Default to now if format is wrong
 
             enriched_articles.append({
+                "id": meta.get("id", link),  # Provide a unique ID
                 "title": title,
                 "link": link,
+                "description": meta.get("original_description", "No Description"),
                 "category": meta.get("category", "Uncategorized"),
                 "published": published_iso,
                 "image": meta.get("image", "svg"),
             })
+
+    # Sort by date descending by default
+    enriched_articles.sort(key=lambda x: x["published"], reverse=True)
     return enriched_articles
 
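For reference, a minimal sketch of what this helper produces for a single Chroma record; the sample values below are purely illustrative (not taken from the app's data) and assume format_articles_from_db is importable from app.py:

    # Hypothetical sample record; the field names match the metadata keys used above.
    sample = {
        "documents": ["full article text ..."],
        "metadatas": [{
            "title": "Example headline",
            "link": "https://example.com/story",
            "original_description": "Short summary",
            "category": "Technology",
            "published": "2024-01-01 12:00:00",
            "image": "svg",
        }],
    }
    articles = format_articles_from_db(sample)
    # articles[0]["published"] -> "2024-01-01T12:00:00"
    # articles[0]["id"] falls back to the link because no "id" key is present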
+# --------------------------------------------------------------------------------
+# --- API v1 Endpoints ---
+# --------------------------------------------------------------------------------
+#
+# API Usage Guide:
+#
+# GET /api/v1/search?q=<query>&limit=<n>
+#   - Performs semantic search.
+#   - `q`: The search term (required).
+#   - `limit`: Max number of results to return (optional, default=20).
+#
+# GET /api/v1/articles/category/<name>?limit=<n>&offset=<o>
+#   - Retrieves all articles for a given category.
+#   - `name`: The category name (e.g., "Technology").
+#   - `limit`: For pagination (optional, default=20).
+#   - `offset`: For pagination (optional, default=0).
+#
+# GET /api/v1/categories
+#   - Returns a list of all unique article categories.
+#
+# GET /api/v1/status
+#   - Checks the status of the background data processing task.
+#
+# --------------------------------------------------------------------------------
+
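A minimal client sketch for the endpoints described in the guide above, assuming the app is reachable at http://localhost:7860 (the port used in the runner below); the requests calls are illustrative and not part of the diff:

    import requests

    BASE = "http://localhost:7860"  # assumed local deployment

    # Semantic search, capped at 5 results
    hits = requests.get(f"{BASE}/api/v1/search", params={"q": "ai regulation", "limit": 5}).json()

    # Second page of 20 articles in one category
    page = requests.get(
        f"{BASE}/api/v1/articles/category/Technology",
        params={"limit": 20, "offset": 20},
    ).json()

    # All known categories
    categories = requests.get(f"{BASE}/api/v1/categories").json()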
+@app.route('/api/v1/search', methods=['GET'])
+def api_search():
+    """API endpoint for semantic search."""
+    query = request.args.get('q')
+    limit = request.args.get('limit', default=20, type=int)
 
+    if not query:
+        return jsonify({"error": "Query parameter 'q' is required."}), 400
 
+    vector_db = get_vector_db()
+    if not vector_db:
+        return jsonify({"error": "Database not available."}), 503
 
     try:
+        logger.info(f"API: Performing semantic search for: '{query}'")
+        results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
+        formatted_articles = format_articles_from_db(results)
+        return jsonify(formatted_articles)
     except Exception as e:
+        logger.error(f"API Search error: {e}", exc_info=True)
+        return jsonify({"error": "An internal error occurred during search."}), 500
 
+@app.route('/api/v1/articles/category/<string:category_name>', methods=['GET'])
+def api_get_articles_by_category(category_name):
+    """API endpoint to get articles filtered by category with pagination."""
+    limit = request.args.get('limit', default=20, type=int)
+    offset = request.args.get('offset', default=0, type=int)
+
+    vector_db = get_vector_db()
+    if not vector_db:
+        return jsonify({"error": "Database not available."}), 503
 
     try:
+        logger.info(f"API: Fetching articles for category '{category_name}'")
+        # Use Chroma's metadata filtering for efficiency
+        results = vector_db.get(
+            where={"category": category_name},
+            include=['documents', 'metadatas']
+        )
 
+        formatted_articles = format_articles_from_db(results)
+        paginated_results = formatted_articles[offset : offset + limit]
 
         return jsonify({
+            "category": category_name,
+            "total_articles": len(formatted_articles),
+            "articles": paginated_results
         })
     except Exception as e:
+        logger.error(f"API Category fetch error: {e}", exc_info=True)
+        return jsonify({"error": "An internal error occurred."}), 500
 
+@app.route('/api/v1/categories', methods=['GET'])
+def api_get_categories():
+    """API endpoint to get a list of all unique categories."""
+    vector_db = get_vector_db()
+    if not vector_db:
+        return jsonify({"error": "Database not available."}), 503
 
+    try:
+        # Fetch only metadata to be efficient
+        all_metadata = vector_db.get(include=['metadatas'])['metadatas']
+        if not all_metadata:
+            return jsonify([])
+
+        unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
+        return jsonify(unique_categories)
     except Exception as e:
+        logger.error(f"API Categories fetch error: {e}", exc_info=True)
+        return jsonify({"error": "An internal error occurred."}), 500
 
+@app.route('/api/v1/status', methods=['GET'])
+def api_get_status():
+    """API endpoint to check the data processing status."""
+    return jsonify({
+        "status": "complete" if loading_complete else "loading",
+        "last_update_time": last_update_time
+    })
 
+# --------------------------------------------------------------------------------
+# --- Web Application Routes ---
+# --------------------------------------------------------------------------------
 
+@app.route('/')
+def index():
+    """Renders the main web page. Data is fetched by frontend JavaScript."""
+    return render_template("index.html")
 
 @app.route('/card')
 def card_load():
+    """Renders a sample card component."""
     return render_template("card.html")
 
+# --- Main Application Runner ---
 if __name__ == "__main__":
+    # On startup, ensure the database exists or download it.
+    if not os.path.exists(LOCAL_DB_DIR):
+        logger.info(f"No local DB found at '{LOCAL_DB_DIR}'. Downloading from Hugging Face Hub...")
+        download_from_hf_hub()
+
+    # Initialize the vector DB instance
+    get_vector_db()
+
+    # Start the first background update immediately.
+    threading.Thread(target=load_feeds_in_background, daemon=True).start()
+
+    # Note: For a production environment, use a proper WSGI server like Gunicorn or uWSGI
+    # instead of Flask's built-in development server.
+    app.run(host="0.0.0.0", port=7860, debug=False)
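Because the first feed refresh runs in a daemon thread at startup, a client can poll /api/v1/status before requesting data; a rough sketch under the same localhost assumption (the poll interval is arbitrary):

    import time
    import requests

    BASE = "http://localhost:7860"  # assumed local deployment

    while True:
        status = requests.get(f"{BASE}/api/v1/status").json()
        if status["status"] == "complete":
            print("Last update:", status["last_update_time"])
            break
        time.sleep(5)  # arbitrary poll interval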