broadfield-dev committed
Commit 3aa40bc · verified · 1 Parent(s): a9254a4

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +66 -49
rss_processor.py CHANGED
@@ -1,14 +1,13 @@
-# rss_processor.py
 import os
 import feedparser
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 import logging
-from huggingface_hub import HfApi, login
+from huggingface_hub import HfApi, login, snapshot_download
 import shutil
 import rss_feeds
-from datetime import datetime
+from datetime import datetime, date
 import dateutil.parser
 import hashlib
 import re
@@ -19,7 +18,6 @@ logger = logging.getLogger(__name__)
 
 # Constants
 MAX_ARTICLES_PER_FEED = 10
-LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
 COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
@@ -29,15 +27,15 @@ REPO_ID = "broadfield-dev/news-rag-db"
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
 
-# Initialize embedding model (global, reusable)
-embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+def get_embedding_model():
+    """Returns a singleton instance of the embedding model to avoid reloading."""
+    if not hasattr(get_embedding_model, "model"):
+        get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    return get_embedding_model.model
 
-# Initialize vector DB with a specific collection name
-vector_db = Chroma(
-    persist_directory=LOCAL_DB_DIR,
-    embedding_function=embedding_model,
-    collection_name=COLLECTION_NAME
-)
+def get_daily_db_dir():
+    """Returns the path for today's Chroma DB."""
+    return f"chroma_db_{date.today().isoformat()}"
 
 def clean_text(text):
     """Clean text by removing HTML tags and extra whitespace."""
@@ -119,9 +117,9 @@ def categorize_feed(url):
         logger.warning(f"Invalid URL provided for categorization: {url}")
         return "Uncategorized"
 
-    url = url.lower().strip() # Normalize the URL
+    url = url.lower().strip()
 
-    logger.debug(f"Categorizing URL: {url}") # Add debugging for visibility
+    logger.debug(f"Categorizing URL: {url}")
 
     if any(keyword in url for keyword in ["nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org", "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org", "lancet.com"]):
         return "Academic Papers"
@@ -156,8 +154,21 @@ def categorize_feed(url):
     return "Uncategorized"
 
 def process_and_store_articles(articles):
-    documents = []
-    existing_ids = set(vector_db.get()["ids"]) # Load existing IDs once
+    db_path = get_daily_db_dir()
+    vector_db = Chroma(
+        persist_directory=db_path,
+        embedding_function=get_embedding_model(),
+        collection_name=COLLECTION_NAME
+    )
+
+    try:
+        existing_ids = set(vector_db.get(include=[])["ids"])
+    except Exception:
+        existing_ids = set()
+
+    docs_to_add = []
+    ids_to_add = []
+
     for article in articles:
         try:
             title = clean_text(article["title"])
@@ -165,10 +176,13 @@ def process_and_store_articles(articles):
             description = clean_text(article["description"])
             published = article["published"]
             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+
             doc_id = f"{title}|{link}|{published}|{description_hash}"
+
             if doc_id in existing_ids:
-                logger.debug(f"Skipping duplicate in DB: {doc_id}")
+                logger.debug(f"Skipping duplicate in DB {db_path}: {doc_id}")
                 continue
+
             metadata = {
                 "title": article["title"],
                 "link": article["link"],
@@ -177,52 +191,55 @@ def process_and_store_articles(articles):
                 "category": article["category"],
                 "image": article["image"],
             }
-            doc = Document(page_content=description, metadata=metadata, id=doc_id)
-            documents.append(doc)
-            existing_ids.add(doc_id) # Update in-memory set to avoid duplicates within this batch
+            doc = Document(page_content=description, metadata=metadata)
+            docs_to_add.append(doc)
+            ids_to_add.append(doc_id)
+            existing_ids.add(doc_id)
         except Exception as e:
-            logger.error(f"Error processing article {article['title']}: {e}")
+            logger.error(f"Error processing article {article.get('title', 'N/A')}: {e}")
 
-    if documents:
+    if docs_to_add:
         try:
-            vector_db.add_documents(documents)
+            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
             vector_db.persist()
-            logger.info(f"Added {len(documents)} new articles to DB. Total documents: {len(vector_db.get()['ids'])}")
+            logger.info(f"Added {len(docs_to_add)} new articles to DB {db_path}. Total in DB: {vector_db._collection.count()}")
         except Exception as e:
-            logger.error(f"Error storing articles: {e}")
+            logger.error(f"Error storing articles in {db_path}: {e}")
 
 def download_from_hf_hub():
-    if not os.path.exists(LOCAL_DB_DIR):
-        try:
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Downloading Chroma DB from {REPO_ID}...")
-            hf_api.hf_hub_download(repo_id=REPO_ID, filename="chroma_db", local_dir=LOCAL_DB_DIR, repo_type="dataset", token=HF_API_TOKEN)
-        except Exception as e:
-            logger.error(f"Error downloading from Hugging Face Hub: {e}")
-    else:
-        logger.info("Local Chroma DB exists, loading existing data.")
+    try:
+        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
+        logger.info(f"Downloading all DBs from {REPO_ID}...")
+        snapshot_download(
+            repo_id=REPO_ID,
+            repo_type="dataset",
+            local_dir=".",
+            local_dir_use_symlinks=False,
+            allow_patterns="chroma_db_*/**",
+            token=HF_API_TOKEN
+        )
+        logger.info("Finished downloading DBs.")
+    except Exception as e:
+        logger.error(f"Error downloading from Hugging Face Hub: {e}")
 
 def upload_to_hf_hub():
-    if os.path.exists(LOCAL_DB_DIR):
+    db_path = get_daily_db_dir()
+    if os.path.exists(db_path):
         try:
-            logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
-            for root, _, files in os.walk(LOCAL_DB_DIR):
-                for file in files:
-                    local_path = os.path.join(root, file)
-                    remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
-                    hf_api.upload_file(
-                        path_or_fileobj=local_path,
-                        path_in_repo=remote_path,
-                        repo_id=REPO_ID,
-                        repo_type="dataset",
-                        token=HF_API_TOKEN
-                    )
-            logger.info(f"Database uploaded to: {REPO_ID}")
+            logger.info(f"Uploading updated Chroma DB '{db_path}' to {REPO_ID}...")
+            hf_api.upload_folder(
+                folder_path=db_path,
+                path_in_repo=db_path,
+                repo_id=REPO_ID,
+                repo_type="dataset",
+                token=HF_API_TOKEN
+            )
+            logger.info(f"Database folder '{db_path}' uploaded to: {REPO_ID}")
        except Exception as e:
            logger.error(f"Error uploading to Hugging Face Hub: {e}")
 
 if __name__ == "__main__":
-    download_from_hf_hub() # Ensure DB is initialized
+    download_from_hf_hub()
    articles = fetch_rss_feeds()
    process_and_store_articles(articles)
    upload_to_hf_hub()
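
Taken together, the commit shards the vector store by day: process_and_store_articles writes into a chroma_db_<YYYY-MM-DD> directory returned by get_daily_db_dir(), upload_to_hf_hub pushes only today's folder via upload_folder, and download_from_hf_hub pulls every chroma_db_* folder from the dataset repo with snapshot_download. The minimal sketch below shows one way a downloaded daily DB could be opened and queried after the script has run; the date in the directory name and the query string are illustrative assumptions, not part of this commit.

# Minimal sketch (not part of the commit): open one downloaded daily DB and query it.
# Assumes a folder such as "chroma_db_2025-01-01" exists locally after download_from_hf_hub().
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
daily_db = Chroma(
    persist_directory="chroma_db_2025-01-01",  # hypothetical date; real names come from get_daily_db_dir()
    embedding_function=embedding_model,
    collection_name="news_articles",
)
for doc in daily_db.similarity_search("space telescope launch", k=3):  # example query
    print(doc.metadata.get("title"), doc.metadata.get("link"))

One trade-off of the per-day layout: each run uploads a small, self-contained folder, but searching across several days means opening each chroma_db_* directory as its own Chroma instance.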