RSS_News_1

Sleeping

App Files Files Community

broadfield-dev committed on Jun 22

Commit

1252efa

verified ·

1 Parent(s): 679afad

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -97

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ from flask import Flask, render_template, request, jsonify
 from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
 import logging
 import time
-import json
 from datetime import datetime
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -21,7 +20,6 @@ last_update_time = None
 # --- Embedding and Vector DB Management ---
 def get_embedding_model():
     """Initializes and returns a singleton HuggingFace embedding model."""
-    # Using a simple hasattr check for a singleton pattern
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
@@ -32,7 +30,6 @@ def get_vector_db():
         logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
         return None
     try:
-        # Using a simple hasattr check for a singleton pattern
         if not hasattr(get_vector_db, "db_instance"):
             get_vector_db.db_instance = Chroma(
                 persist_directory=LOCAL_DB_DIR,
@@ -42,7 +39,6 @@ def get_vector_db():
         return get_vector_db.db_instance
     except Exception as e:
         logger.error(f"Failed to load vector DB: {e}")
-        # Invalidate instance on failure
         if hasattr(get_vector_db, "db_instance"):
              delattr(get_vector_db, "db_instance")
         return None
@@ -51,7 +47,6 @@ def get_vector_db():
 def load_feeds_in_background():
     """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
     global loading_complete, last_update_time
-    # Ensure only one background process runs at a time
     if not loading_complete:
         logger.info("An update is already in progress. Skipping.")
         return
@@ -71,19 +66,19 @@ def load_feeds_in_background():
     finally:
         loading_complete = True
-# --- Data Transformation Helper ---
 def format_articles_from_db(docs):
     """
-    Takes ChromaDB documents (with metadata) and formats them into a standardized list of article dictionaries.
-    Handles deduplication based on title and link.
     """
     enriched_articles = []
     seen_keys = set()
-    # The 'docs' can be a list of (Document, score) tuples or a dict from .get()
     items = []
     if isinstance(docs, dict) and 'metadatas' in docs:
-        items = zip(docs['documents'], docs['metadatas'])
     elif isinstance(docs, list):
         items = [(doc.page_content, doc.metadata) for doc, score in docs]
@@ -92,21 +87,20 @@ def format_articles_from_db(docs):
         title = meta.get("title", "No Title")
         link = meta.get("link", "")
-        # Use a composite key to identify unique articles
         key = f"{title}|{link}"
         if key not in seen_keys:
             seen_keys.add(key)
-            # Safely parse the published date
             published_str = meta.get("published", "").strip()
             try:
                 published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
             except (ValueError, TypeError):
-                published_iso = datetime.utcnow().isoformat() # Default to now if format is wrong
             enriched_articles.append({
-                "id": meta.get("id", link), # Provide a unique ID
                 "title": title,
                 "link": link,
                 "description": meta.get("original_description", "No Description"),
@@ -115,33 +109,96 @@ def format_articles_from_db(docs):
                 "image": meta.get("image", "svg"),
             })
-    # Sort by date descending by default
     enriched_articles.sort(key=lambda x: x["published"], reverse=True)
     return enriched_articles
 # --------------------------------------------------------------------------------
-# --- API v1 Endpoints ---
 # --------------------------------------------------------------------------------
-#
-# API Usage Guide:
-#
-# GET /api/v1/search?q=<query>&limit=<n>
-#   - Performs semantic search.
-#   - `q`: The search term (required).
-#   - `limit`: Max number of results to return (optional, default=20).
-#
-# GET /api/v1/articles/category/<name>?limit=<n>&offset=<o>
-#   - Retrieves all articles for a given category.
-#   - `name`: The category name (e.g., "Technology").
-#   - `limit`: For pagination (optional, default=20).
-#   - `offset`: For pagination (optional, default=0).
-#
-# GET /api/v1/categories
-#   - Returns a list of all unique article categories.
-#
-# GET /api/v1/status
-#   - Checks the status of the background data processing task.
-#
 # --------------------------------------------------------------------------------
 @app.route('/api/v1/search', methods=['GET'])
@@ -158,7 +215,6 @@ def api_search():
         return jsonify({"error": "Database not available."}), 503
     try:
-        logger.info(f"API: Performing semantic search for: '{query}'")
         results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
         formatted_articles = format_articles_from_db(results)
         return jsonify(formatted_articles)
@@ -177,13 +233,7 @@ def api_get_articles_by_category(category_name):
         return jsonify({"error": "Database not available."}), 503
     try:
-        logger.info(f"API: Fetching articles for category '{category_name}'")
-        # Use Chroma's metadata filtering for efficiency
-        results = vector_db.get(
-            where={"category": category_name},
-            include=['documents', 'metadatas']
-        )
         formatted_articles = format_articles_from_db(results)
         paginated_results = formatted_articles[offset : offset + limit]
@@ -196,60 +246,8 @@ def api_get_articles_by_category(category_name):
         logger.error(f"API Category fetch error: {e}", exc_info=True)
         return jsonify({"error": "An internal error occurred."}), 500
-@app.route('/api/v1/categories', methods=['GET'])
-def api_get_categories():
-    """API endpoint to get a list of all unique categories."""
-    vector_db = get_vector_db()
-    if not vector_db:
-        return jsonify({"error": "Database not available."}), 503
-    try:
-        # Fetch only metadata to be efficient
-        all_metadata = vector_db.get(include=['metadatas'])['metadatas']
-        if not all_metadata:
-            return jsonify([])
-        unique_categories = sorted(list({meta['category'] for meta in all_metadata if 'category' in meta}))
-        return jsonify(unique_categories)
-    except Exception as e:
-        logger.error(f"API Categories fetch error: {e}", exc_info=True)
-        return jsonify({"error": "An internal error occurred."}), 500
-@app.route('/api/v1/status', methods=['GET'])
-def api_get_status():
-    """API endpoint to check the data processing status."""
-    return jsonify({
-        "status": "complete" if loading_complete else "loading",
-        "last_update_time": last_update_time
-    })
-# --------------------------------------------------------------------------------
-# --- Web Application Routes ---
-# --------------------------------------------------------------------------------
-@app.route('/')
-def index():
-    """Renders the main web page. Data is fetched by frontend JavaScript."""
-    return render_template("index.html")
-@app.route('/card')
-def card_load():
-    """Renders a sample card component."""
-    return render_template("card.html")
 # --- Main Application Runner ---
 if __name__ == "__main__":
-    # On startup, ensure the database exists or download it.
-    if not os.path.exists(LOCAL_DB_DIR):
-        logger.info(f"No local DB found at '{LOCAL_DB_DIR}'. Downloading from Hugging Face Hub...")
-        download_from_hf_hub()
-    # Initialize the vector DB instance
-    get_vector_db()
-    # Start the first background update immediately.
-    threading.Thread(target=load_feeds_in_background, daemon=True).start()
-    # Note: For a production environment, use a proper WSGI server like Gunicorn or uWSGI
-    # instead of Flask's built-in development server.
     app.run(host="0.0.0.0", port=7860, debug=False)

 from rss_processor import fetch_rss_feeds, process_and_store_articles, download_from_hf_hub, upload_to_hf_hub, clean_text, LOCAL_DB_DIR
 import logging
 import time
 from datetime import datetime
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 # --- Embedding and Vector DB Management ---
 def get_embedding_model():
     """Return the shared HuggingFace embedding model, building it on the first call.
     The instance is cached as an attribute on this function and reused afterwards.
     """
     try:
         # Fast path: a previous call already stored the model on the function.
         return get_embedding_model.model
     except AttributeError:
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
         return get_embedding_model.model
         logger.warning(f"Vector DB not found at {LOCAL_DB_DIR}. It may need to be downloaded or created.")
         return None
     try:
         if not hasattr(get_vector_db, "db_instance"):
             get_vector_db.db_instance = Chroma(
                 persist_directory=LOCAL_DB_DIR,
         return get_vector_db.db_instance
     except Exception as e:
         logger.error(f"Failed to load vector DB: {e}")
         if hasattr(get_vector_db, "db_instance"):
              delattr(get_vector_db, "db_instance")
         return None
 def load_feeds_in_background():
     """Fetches RSS feeds, processes articles, and uploads to Hub in a background thread."""
     global loading_complete, last_update_time
     if not loading_complete:
         logger.info("An update is already in progress. Skipping.")
         return
     finally:
         loading_complete = True
+# --- Data Transformation Helper (Used by both SSR and API) ---
 def format_articles_from_db(docs):
     """
+    Takes ChromaDB documents and formats them into a standardized list of article dictionaries.
     """
     enriched_articles = []
     seen_keys = set()
     items = []
+    # Handle .get() results (dict of lists)
     if isinstance(docs, dict) and 'metadatas' in docs:
+        items = zip(docs.get('documents', []), docs.get('metadatas', []))
+    # Handle similarity_search results (list of (Document, score) tuples)
     elif isinstance(docs, list):
         items = [(doc.page_content, doc.metadata) for doc, score in docs]
         title = meta.get("title", "No Title")
         link = meta.get("link", "")
         key = f"{title}|{link}"
         if key not in seen_keys:
             seen_keys.add(key)
             published_str = meta.get("published", "").strip()
             try:
+                # The format from your original `process_and_store_articles`
                 published_iso = datetime.strptime(published_str, "%Y-%m-%d %H:%M:%S").isoformat()
             except (ValueError, TypeError):
+                published_iso = datetime.utcnow().isoformat()
             enriched_articles.append({
+                "id": meta.get("id", link),
                 "title": title,
                 "link": link,
                 "description": meta.get("original_description", "No Description"),
                 "image": meta.get("image", "svg"),
             })
     enriched_articles.sort(key=lambda x: x["published"], reverse=True)
     return enriched_articles
 # --------------------------------------------------------------------------------
+# --- Web Application Route (Server-Side Rendered) ---
 # --------------------------------------------------------------------------------
+@app.route('/')
+def index():
+    """
+    Renders the main web page by fetching, processing, and passing data
+    to the template on the server side. This preserves the original functionality.
+    """
+    # Perform startup checks
+    if not os.path.exists(LOCAL_DB_DIR):
+        logger.info(f"No Chroma DB found at '{LOCAL_DB_DIR}', downloading from Hugging Face Hub...")
+        download_from_hf_hub()
+    # Trigger background update
+    threading.Thread(target=load_feeds_in_background, daemon=True).start()
+    try:
+        # Fetch all data from the DB for rendering
+        vector_db = get_vector_db()
+        if not vector_db:
+             raise ConnectionError("Database could not be loaded.")
+        all_docs = vector_db.get(include=['documents', 'metadatas'])
+        if not all_docs or not all_docs['metadatas']:
+            logger.info("No articles in the DB yet for initial render.")
+            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+        # Process and categorize articles for the template
+        enriched_articles = format_articles_from_db(all_docs)
+        categorized_articles = {}
+        for article in enriched_articles:
+            cat = article["category"]
+            categorized_articles.setdefault(cat, []).append(article)
+        categorized_articles = dict(sorted(categorized_articles.items()))
+        # Limit to 10 articles per category for the main page view
+        for cat in categorized_articles:
+            categorized_articles[cat] = categorized_articles[cat][:10]
+        return render_template(
+            "index.html",
+            categorized_articles=categorized_articles,
+            has_articles=True,
+            # The original code didn't pass loading, but it's good practice
+            loading=not loading_complete
+        )
+    except Exception as e:
+        logger.error(f"Error rendering index page: {e}", exc_info=True)
+        # Fallback render in case of error
+        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True, error="Could not load articles.")
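+# Form-based search route (POST). It returns JSON rather than rendered HTML,
+# so the page's JavaScript is expected to consume the response. The frontend
+# could alternatively call the /api/v1/search endpoint instead of this route.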
+@app.route('/search', methods=['POST'])
+def search():
+    # This route returns a JSON payload to be handled by JavaScript.
+    # It functions like an API endpoint and is a good example of a hybrid approach.
+    query = request.form.get('search')
+    if not query:
+        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+    vector_db = get_vector_db()
+    if not vector_db:
+        return jsonify({"error": "Database not available"}), 503
+    results = vector_db.similarity_search_with_relevance_scores(query, k=50)
+    enriched_articles = format_articles_from_db(results)
+    categorized_articles = {}
+    for article in enriched_articles:
+        cat = article["category"]
+        categorized_articles.setdefault(cat, []).append(article)
+    return jsonify({
+        "categorized_articles": categorized_articles,
+        "has_articles": bool(enriched_articles),
+        "loading": False
+    })
+# --------------------------------------------------------------------------------
+# --- NEW: Standalone API v1 Endpoints (Return only JSON) ---
 # --------------------------------------------------------------------------------
 @app.route('/api/v1/search', methods=['GET'])
         return jsonify({"error": "Database not available."}), 503
     try:
         results = vector_db.similarity_search_with_relevance_scores(query, k=limit)
         formatted_articles = format_articles_from_db(results)
         return jsonify(formatted_articles)
         return jsonify({"error": "Database not available."}), 503
     try:
+        results = vector_db.get(where={"category": category_name}, include=['documents', 'metadatas'])
         formatted_articles = format_articles_from_db(results)
         paginated_results = formatted_articles[offset : offset + limit]
         logger.error(f"API Category fetch error: {e}", exc_info=True)
         return jsonify({"error": "An internal error occurred."}), 500
+# TODO: restore the remaining routes from the previous version of this file here (e.g. /card, /get_updates).
 # --- Main Application Runner ---
 if __name__ == "__main__":
     # Listen on every interface at port 7860 with debug mode off.
     app.run(debug=False, port=7860, host="0.0.0.0")