broadfield-dev committed
Commit 4f6bd49 · verified · 1 Parent(s): 24dab84

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +65 -83
rss_processor.py CHANGED
@@ -159,104 +159,86 @@ def categorize_feed(url):
     logger.warning(f"No matching category found for URL: {url}")
     return "Uncategorized"
 
-def process_and_store_articles(articles, vector_db):
-    documents = []
-    doc_ids = []
+def process_and_store_articles(articles):
+    vector_db = Chroma(
+        persist_directory=LOCAL_DB_DIR,
+        embedding_function=get_embedding_model(),
+        collection_name=COLLECTION_NAME
+    )
 
     try:
         existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Found {len(existing_ids)} existing document IDs in the database.")
+        logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
     except Exception:
         existing_ids = set()
-        logger.info("No existing documents found or error retrieving them. Starting fresh.")
+        logger.info("No existing DB found or it is empty. Starting fresh.")
+
+    docs_to_add = []
+    ids_to_add = []
 
     for article in articles:
-        try:
-            title = clean_text(article["title"])
-            link = article["link"]
-            description = article["description"]
-            published = article["published"]
-            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-            doc_id = f"{title}|{link}|{published}|{description_hash}"
-
-            if doc_id in existing_ids:
-                continue
-
-            metadata = {
-                "title": article["title"],
-                "link": article["link"],
-                "published": article["published"],
-                "category": article["category"],
-                "image": article["image"],
-            }
-            doc = Document(page_content=description, metadata=metadata)
-            documents.append(doc)
-            doc_ids.append(doc_id)
-            existing_ids.add(doc_id)
-        except Exception as e:
-            logger.error(f"Error processing article {article['title']}: {e}")
+        cleaned_title = clean_text(article["title"])
+        cleaned_link = clean_text(article["link"])
+        doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
+
+        if doc_id in existing_ids:
+            continue
+
+        metadata = {
+            "title": article["title"],
+            "link": article["link"],
+            "original_description": article["description"],
+            "published": article["published"],
+            "category": article["category"],
+            "image": article["image"],
+        }
+        doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
+        docs_to_add.append(doc)
+        ids_to_add.append(doc_id)
+        existing_ids.add(doc_id)
 
-    if documents:
+    if docs_to_add:
         try:
-            vector_db.add_documents(documents=documents, ids=doc_ids)
+            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
             vector_db.persist()
-            logger.info(f"Added {len(documents)} new articles to DB. Total documents now: {len(vector_db.get()['ids'])}")
+            logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
         except Exception as e:
             logger.error(f"Error storing articles: {e}")
-    else:
-        logger.info("No new articles to add.")
 
 def download_from_hf_hub():
-    if os.path.exists(LOCAL_DB_DIR):
-        logger.info(f"Local database directory '{LOCAL_DB_DIR}' already exists. Skipping download.")
-        return
-
-    logger.info(f"Attempting to download database from Hugging Face Hub repo: {REPO_ID}")
-    try:
-        snapshot_download(
-            repo_id=REPO_ID,
-            repo_type="dataset",
-            local_dir=LOCAL_DB_DIR,
-            token=HF_API_TOKEN,
-        )
-        logger.info(f"Database successfully downloaded to '{LOCAL_DB_DIR}'.")
-    except HfHubHTTPError as e:
-        logger.warning(f"Failed to download from Hub (repo may be new or empty): {e}. Building new dataset locally.")
-        os.makedirs(LOCAL_DB_DIR, exist_ok=True)
-    except Exception as e:
-        logger.error(f"An unexpected error occurred during download: {e}. Creating new local directory.")
-        os.makedirs(LOCAL_DB_DIR, exist_ok=True)
+    if not os.path.exists(LOCAL_DB_DIR):
+        try:
+            logger.info(f"Downloading Chroma DB from {REPO_ID} to {LOCAL_DB_DIR}...")
+            snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="dataset",
+                local_dir=".",
+                local_dir_use_symlinks=False,
+                allow_patterns=f"{LOCAL_DB_DIR}/**",
+                token=HF_API_TOKEN
+            )
+            logger.info("Finished downloading DB.")
+        except Exception as e:
+            logger.warning(f"Could not download from Hugging Face Hub (this is normal on first run): {e}")
+    else:
+        logger.info("Local Chroma DB exists, loading existing data.")
 
 def upload_to_hf_hub():
-    if not os.path.exists(LOCAL_DB_DIR):
-        logger.warning(f"Local database directory '{LOCAL_DB_DIR}' not found. Nothing to upload.")
-        return
-
-    try:
-        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
-        logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
-        hf_api.upload_folder(
-            folder_path=LOCAL_DB_DIR,
-            repo_id=REPO_ID,
-            repo_type="dataset",
-            commit_message=f"Update database - {datetime.now().isoformat()}"
-        )
-        logger.info(f"Database uploaded successfully to Hugging Face Hub.")
-    except Exception as e:
-        logger.error(f"Error uploading to Hugging Face Hub: {e}")
+    if os.path.exists(LOCAL_DB_DIR):
+        try:
+            logger.info(f"Uploading updated Chroma DB '{LOCAL_DB_DIR}' to {REPO_ID}...")
+            hf_api.upload_folder(
+                folder_path=LOCAL_DB_DIR,
+                path_in_repo=LOCAL_DB_DIR,
+                repo_id=REPO_ID,
+                repo_type="dataset",
+                token=HF_API_TOKEN,
+                commit_message="Update RSS news database"
+            )
+            logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
+        except Exception as e:
+            logger.error(f"Error uploading to Hugging Face Hub: {e}")
 
-def run_update_pipeline():
-    download_from_hf_hub()
-
-    vector_db = Chroma(
-        persist_directory=LOCAL_DB_DIR,
-        embedding_function=embedding_model,
-        collection_name=COLLECTION_NAME
-    )
-
-    articles = fetch_rss_feeds()
-    if articles:
-        process_and_store_articles(articles, vector_db)
-
-    upload_to_hf_hub()
-    logger.info("Update pipeline finished.")
 
 
 
 
 
 
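Note on the removed driver: this hunk deletes run_update_pipeline() without a replacement, and process_and_store_articles() no longer takes a vector_db argument. A caller adapted to the new signatures might look like this hypothetical sketch (fetch_rss_feeds and logger are assumed to be defined elsewhere in rss_processor.py):

    def run_update_pipeline():
        download_from_hf_hub()                    # pull the existing DB if absent
        articles = fetch_rss_feeds()
        if articles:
            process_and_store_articles(articles)  # builds its own Chroma handle now
        upload_to_hf_hub()                        # push the updated DB back to the Hub
        logger.info("Update pipeline finished.")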