Update rss_processor.py
rss_processor.py: +4 −18
@@ -17,7 +17,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 LOCAL_DB_DIR = "chroma_db"
-#RSS_FEEDS = rss_feeds.RSS_FEEDS
 FEEDS_FILE = "rss_feeds.json"
 COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
@@ -161,20 +160,15 @@ def categorize_feed(url):
     return "Uncategorized"

 def process_and_store_articles(articles):
+    if os.path.exists(LOCAL_DB_DIR):
+        shutil.rmtree(LOCAL_DB_DIR)
+
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
         embedding_function=get_embedding_model(),
-        #embedding_function=embedding_model,
         collection_name=COLLECTION_NAME
     )

-    try:
-        existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
-    except Exception:
-        existing_ids = set()
-        logger.info("No existing DB found or it is empty. Starting fresh.")
-
     docs_to_add = []
     ids_to_add = []

@@ -183,9 +177,6 @@ def process_and_store_articles(articles):
         cleaned_link = clean_text(article["link"])
         doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"

-        if doc_id in existing_ids:
-            continue
-
         metadata = {
             "title": article["title"],
             "link": article["link"],
@@ -197,7 +188,6 @@ def process_and_store_articles(articles):
         doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
         docs_to_add.append(doc)
         ids_to_add.append(doc_id)
-        existing_ids.add(doc_id)

     if docs_to_add:
         try:
@@ -239,8 +229,4 @@ def upload_to_hf_hub():
         )
         logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
     except Exception as e:
-        logger.error(f"Error uploading to Hugging Face Hub: {e}")
-
-
-
-
+        logger.error(f"Error uploading to Hugging Face Hub: {e}")
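In effect, this commit drops the incremental de-duplication path: process_and_store_articles no longer loads the IDs already stored in the Chroma collection and skips matching articles, but instead deletes the chroma_db directory and rebuilds it from scratch on every run. Below is a minimal sketch of the resulting function, assuming shutil is already imported at the top of rss_processor.py, that the per-article lines sit inside a `for article in articles:` loop, and that the final try block (cut off in the diff) upserts via vector_db.add_documents; those pieces are not shown in the hunks above.

```python
import os
import shutil

# Chroma, Document, get_embedding_model, clean_text, and logger are all
# defined or imported elsewhere in rss_processor.py; they are assumed here.
LOCAL_DB_DIR = "chroma_db"
COLLECTION_NAME = "news_articles"

def process_and_store_articles(articles):
    # New behavior: wipe any previous on-disk DB and rebuild from scratch,
    # instead of loading existing IDs and skipping duplicates.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)

    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )

    docs_to_add, ids_to_add = [], []
    for article in articles:  # loop header assumed; the diff shows only its body
        cleaned_title = clean_text(article["title"])
        cleaned_link = clean_text(article["link"])
        doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"

        metadata = {
            "title": article["title"],
            "link": article["link"],
            # ...remaining metadata fields omitted from the diff...
        }
        doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
        docs_to_add.append(doc)
        ids_to_add.append(doc_id)

    if docs_to_add:
        try:
            # Assumed upsert call; the diff cuts off inside this try block.
            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
        except Exception as e:
            logger.error(f"Error storing articles: {e}")
```

The trade-off is simplicity over incrementality: every run re-embeds the full article set, but the store can no longer accumulate stale or duplicate entries between runs, which is why the existing_ids bookkeeping could be deleted wholesale.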