broadfield-dev committed on
Commit
2d5f1d5
·
verified ·
1 Parent(s): 30a7f9f

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +59 -34
rss_processor.py CHANGED
@@ -32,11 +32,16 @@ def get_embedding_model():
32
  return get_embedding_model.model
33
 
34
  # Initialize Chroma database globally
35
- vector_db = Chroma(
36
- persist_directory=LOCAL_DB_DIR,
37
- embedding_function=get_embedding_model(),
38
- collection_name=COLLECTION_NAME
39
- )
 
 
 
 
 
40
 
41
  def clean_text(text):
42
  if not text or not isinstance(text, str):
@@ -111,39 +116,60 @@ def fetch_rss_feeds():
111
  return articles
112
 
113
  def process_and_store_articles(articles):
 
 
 
 
114
  try:
115
  existing_ids = set(vector_db.get(include=[])["ids"])
116
- except Exception:
 
 
117
  existing_ids = set()
118
 
119
  docs_to_add = []
120
  ids_to_add = []
121
 
122
  for article in articles:
123
- cleaned_title = clean_text(article["title"])
124
- cleaned_link = clean_text(article["link"])
125
- doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
126
-
127
- if doc_id in existing_ids:
128
- continue
129
-
130
- metadata = {
131
- "title": article["title"],
132
- "link": article["link"],
133
- "original_description": article["description"],
134
- "published": article["published"],
135
- "category": article["category"],
136
- "image": article["image"],
137
- }
138
- doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
139
- docs_to_add.append(doc)
140
- ids_to_add.append(doc_id)
141
- existing_ids.add(doc_id)
 
 
 
 
 
 
 
 
 
 
142
 
143
  if docs_to_add:
144
- vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
145
- vector_db.persist()
146
- logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
 
 
 
 
 
147
 
148
  def download_from_hf_hub():
149
  if not os.path.exists(LOCAL_DB_DIR):
@@ -185,13 +211,12 @@ def upload_to_hf_hub():
185
  logger.error(f"Error uploading to Hugging Face Hub: {e}")
186
 
187
  if __name__ == "__main__":
188
- download_from_hf_hub() # Ensure DB is initialized
 
189
  if not os.path.exists(FEEDS_FILE):
190
  logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
191
  exit(1)
192
  articles = fetch_rss_feeds()
193
- if not articles:
194
- logger.warning("No articles fetched. Database remains empty.")
195
- else:
196
- process_and_store_articles(articles)
197
- upload_to_hf_hub()
 
32
  return get_embedding_model.model
33
 
34
# Initialize Chroma database globally so every function shares one client.
# If the store cannot be opened there is nothing useful to do, so abort.
try:
    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )
    logger.info("Chroma database initialized successfully")
except Exception as e:
    logger.error("Failed to initialize Chroma database: %s", e)
    # raise SystemExit rather than calling the site-module exit() builtin,
    # which is intended for interactive sessions and may not exist.
    raise SystemExit(1)
 
46
  def clean_text(text):
47
  if not text or not isinstance(text, str):
 
116
  return articles
117
 
118
def process_and_store_articles(articles):
    """Embed and store new RSS articles in the global Chroma vector DB.

    Deduplicates against documents already in the collection (and within the
    incoming batch) using a ``title|link|published`` composite ID.  Articles
    whose cleaned description is empty are skipped.  Per-article failures are
    logged and swallowed so one malformed entry cannot abort the whole run.

    Args:
        articles: list of dicts with keys ``title``, ``link``, ``description``,
            ``published``, ``category`` and ``image`` (as built by
            fetch_rss_feeds — TODO confirm against that function).
    """
    if not articles:
        logger.warning("No articles to process")
        return

    try:
        # include=[] asks Chroma for IDs only — no embeddings or documents.
        existing_ids = set(vector_db.get(include=[])["ids"])
        logger.info("Existing documents in DB: %d", len(existing_ids))
    except Exception as e:
        # A brand-new/empty store may raise here; start from an empty ID set.
        logger.error("Error retrieving existing IDs: %s", e)
        existing_ids = set()

    docs_to_add = []
    ids_to_add = []

    for article in articles:
        try:
            cleaned_title = clean_text(article["title"])
            cleaned_link = clean_text(article["link"])
            cleaned_description = clean_text(article["description"])
            if not cleaned_description:
                logger.warning("Skipping article with empty description: %s", cleaned_title)
                continue

            # Composite ID makes re-runs idempotent for unchanged feed items.
            doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
            if doc_id in existing_ids:
                logger.debug("Skipping duplicate article: %s", doc_id)
                continue

            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            docs_to_add.append(Document(page_content=cleaned_description, metadata=metadata))
            ids_to_add.append(doc_id)
            # Track in-batch too, so the same story from two feeds is added once.
            existing_ids.add(doc_id)
            logger.debug("Prepared document for article: %s", cleaned_title)
        except Exception as e:
            logger.error("Error processing article %s: %s", article.get("title", "Unknown"), e)

    if docs_to_add:
        try:
            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
            vector_db.persist()
            # NOTE(review): _collection is a private Chroma attribute; it is
            # only used here for a log-time count.
            logger.info(
                "Added %d new articles to DB. Total in DB: %d",
                len(docs_to_add),
                vector_db._collection.count(),
            )
        except Exception as e:
            logger.error("Error adding documents to Chroma: %s", e)
    else:
        logger.warning("No new documents to add to the database")
173
 
174
  def download_from_hf_hub():
175
  if not os.path.exists(LOCAL_DB_DIR):
 
211
  logger.error(f"Error uploading to Hugging Face Hub: {e}")
212
 
213
if __name__ == "__main__":
    logger.info("Starting script execution")
    # Pull any existing DB snapshot from the Hub before processing so we
    # dedupe against previously stored articles.
    download_from_hf_hub()
    if not os.path.exists(FEEDS_FILE):
        logger.error("Missing %s. Please create it with RSS feed URLs.", FEEDS_FILE)
        # SystemExit instead of the interactive-only exit() builtin.
        raise SystemExit(1)
    articles = fetch_rss_feeds()
    # process_and_store_articles no-ops (with a warning) on an empty list.
    process_and_store_articles(articles)
    upload_to_hf_hub()
    logger.info("Script execution completed")