RSS_News_1

Sleeping

App Files Files Community

broadfield-dev committed on Jun 17

Commit

ebf6a83

verified ·

1 Parent(s): 24c5dcb

Update rss_processor.py

Browse files

Files changed (1) hide show

rss_processor.py +37 -80

rss_processor.py CHANGED Viewed

@@ -21,28 +21,14 @@ HF_API_TOKEN = os.getenv("HF_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 FEEDS_FILE = "rss_feeds.json"
-# Initialize Hugging Face API and login
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
-# Initialize embedding model
 def get_embedding_model():
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
-# Initialize Chroma database globally
-try:
-    vector_db = Chroma(
-        persist_directory=LOCAL_DB_DIR,
-        embedding_function=get_embedding_model(),
-        collection_name=COLLECTION_NAME
-    )
-    logger.info("Chroma database initialized successfully")
-except Exception as e:
-    logger.error(f"Failed to initialize Chroma database: {e}")
-    exit(1)
 def clean_text(text):
     if not text or not isinstance(text, str):
         return ""
@@ -69,13 +55,14 @@ def fetch_rss_feeds():
             try:
                 logger.info(f"Fetching '{feed_info.get('name', feed_url)}' from category '{category}'")
                 feed = feedparser.parse(feed_url, agent="RSSNewsBot/1.0 (+http://huggingface.co/spaces/broadfield-dev/RSS_News)")
                 if feed.bozo:
                     logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                     continue
-                #for entry in feed.entries[:10]:
-                for entry in feed.entries:
                     title = entry.get("title", "No Title")
                     link = entry.get("link", "")
                     description = entry.get("summary", entry.get("description", ""))
@@ -107,7 +94,7 @@ def fetch_rss_feeds():
                             "link": link,
                             "description": description,
                             "published": published,
-                            "category": category,
                             "image": image,
                         })
             except Exception as e:
@@ -117,67 +104,49 @@ def fetch_rss_feeds():
     return articles
 def process_and_store_articles(articles):
-    if not articles:
-        logger.warning("No articles to process")
-        return
     try:
         existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Existing documents in DB: {len(existing_ids)}")
-    except Exception as e:
-        logger.error(f"Error retrieving existing IDs: {e}")
         existing_ids = set()
     docs_to_add = []
     ids_to_add = []
     for article in articles:
-        try:
-            cleaned_title = clean_text(article["title"])
-            cleaned_link = clean_text(article["link"])
-            cleaned_description = clean_text(article["description"])
-            if not cleaned_description:
-                logger.warning(f"Skipping article with empty description: {cleaned_title}")
-                continue
-            doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
-            if doc_id in existing_ids:
-                logger.debug(f"Skipping duplicate article: {doc_id}")
-                continue
-            metadata = {
-                "title": article["title"],
-                "link": article["link"],
-                "original_description": article["description"],
-                "published": article["published"],
-                "category": article["category"],
-                "image": article["image"],
-            }
-            doc = Document(page_content=cleaned_description, metadata=metadata)
-            docs_to_add.append(doc)
-            ids_to_add.append(doc_id)
-            existing_ids.add(doc_id)
-            logger.debug(f"Prepared document for article: {cleaned_title}")
-        except Exception as e:
-            logger.error(f"Error processing article {article.get('title', 'Unknown')}: {e}")
     if docs_to_add:
-        try:
-            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
-            vector_db.persist()
-            logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
-        except Exception as e:
-            logger.error(f"Error adding documents to Chroma: {e}")
-    else:
-        logger.warning("No new documents to add to the database")
 def download_from_hf_hub():
     if not os.path.exists(LOCAL_DB_DIR):
         try:
-            # Create repo if it doesn't exist
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Downloading Chroma DB from {REPO_ID}...")
             snapshot_download(
                 repo_id=REPO_ID,
                 repo_type="dataset",
@@ -186,19 +155,12 @@ def download_from_hf_hub():
                 allow_patterns=f"{LOCAL_DB_DIR}/**",
                 token=HF_API_TOKEN
             )
-            logger.info(f"Successfully downloaded database from {REPO_ID}")
         except Exception as e:
-            logger.warning(f"Could not download DB from Hub (normal on first run): {e}")
-            # Ensure database is initialized even if download fails
-            vector_db.persist()
-            logger.info(f"Initialized empty Chroma database at {LOCAL_DB_DIR}")
-    else:
-        logger.info("Local Chroma DB exists, loading existing data.")
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
         try:
-            logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
             hf_api.upload_folder(
                 folder_path=LOCAL_DB_DIR,
                 path_in_repo=LOCAL_DB_DIR,
@@ -207,17 +169,12 @@ def upload_to_hf_hub():
                 token=HF_API_TOKEN,
                 commit_message="Update RSS news database"
             )
-            logger.info(f"Database uploaded to: {REPO_ID}")
         except Exception as e:
             logger.error(f"Error uploading to Hugging Face Hub: {e}")
 if __name__ == "__main__":
-    logger.info("Starting script execution")
     download_from_hf_hub()
-    if not os.path.exists(FEEDS_FILE):
-        logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
-        exit(1)
     articles = fetch_rss_feeds()
-    process_and_store_articles(articles)
-    upload_to_hf_hub()
-    logger.info("Script execution completed")

 REPO_ID = "broadfield-dev/news-rag-db"
 FEEDS_FILE = "rss_feeds.json"
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
 def get_embedding_model():
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
 def clean_text(text):
     if not text or not isinstance(text, str):
         return ""
             try:
                 logger.info(f"Fetching '{feed_info.get('name', feed_url)}' from category '{category}'")
+                # Add a User-Agent to prevent getting blocked
                 feed = feedparser.parse(feed_url, agent="RSSNewsBot/1.0 (+http://huggingface.co/spaces/broadfield-dev/RSS_News)")
                 if feed.bozo:
                     logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                     continue
+                for entry in feed.entries[:10]: # Process max 10 entries per feed
                     title = entry.get("title", "No Title")
                     link = entry.get("link", "")
                     description = entry.get("summary", entry.get("description", ""))
                             "link": link,
                             "description": description,
                             "published": published,
+                            "category": category, # Directly use category from JSON
                             "image": image,
                         })
             except Exception as e:
     return articles
 def process_and_store_articles(articles):
+    vector_db = Chroma(
+        persist_directory=LOCAL_DB_DIR,
+        embedding_function=get_embedding_model(),
+        collection_name=COLLECTION_NAME
+    )
     try:
         existing_ids = set(vector_db.get(include=[])["ids"])
+    except Exception:
         existing_ids = set()
     docs_to_add = []
     ids_to_add = []
     for article in articles:
+        cleaned_title = clean_text(article["title"])
+        cleaned_link = clean_text(article["link"])
+        doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
+        if doc_id in existing_ids:
+            continue
+        metadata = {
+            "title": article["title"],
+            "link": article["link"],
+            "original_description": article["description"],
+            "published": article["published"],
+            "category": article["category"],
+            "image": article["image"],
+        }
+        doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
+        docs_to_add.append(doc)
+        ids_to_add.append(doc_id)
+        existing_ids.add(doc_id)
     if docs_to_add:
+        vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
+        vector_db.persist()
+        logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
 def download_from_hf_hub():
     if not os.path.exists(LOCAL_DB_DIR):
         try:
             snapshot_download(
                 repo_id=REPO_ID,
                 repo_type="dataset",
                 allow_patterns=f"{LOCAL_DB_DIR}/**",
                 token=HF_API_TOKEN
             )
         except Exception as e:
+            logger.warning(f"Could not download DB from Hub (this is normal on first run): {e}")
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
         try:
             hf_api.upload_folder(
                 folder_path=LOCAL_DB_DIR,
                 path_in_repo=LOCAL_DB_DIR,
                 token=HF_API_TOKEN,
                 commit_message="Update RSS news database"
             )
         except Exception as e:
             logger.error(f"Error uploading to Hugging Face Hub: {e}")
 if __name__ == "__main__":
     download_from_hf_hub()
     articles = fetch_rss_feeds()
+    if articles:
+        process_and_store_articles(articles)
+        upload_to_hf_hub()