Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files
- rss_processor.py +14 -10
rss_processor.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
import feedparser
|
3 |
-
from
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
@@ -163,14 +163,11 @@ def process_and_store_articles(articles):
|
|
163 |
if not os.path.exists(LOCAL_DB_DIR):
|
164 |
os.makedirs(LOCAL_DB_DIR)
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
embedding_function=get_embedding_model(),
|
169 |
-
collection_name=COLLECTION_NAME
|
170 |
-
)
|
171 |
|
172 |
try:
|
173 |
-
existing_ids = set(
|
174 |
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
175 |
except Exception as e:
|
176 |
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
@@ -201,9 +198,16 @@ def process_and_store_articles(articles):
|
|
201 |
|
202 |
if docs_to_add:
|
203 |
try:
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
except Exception as e:
|
208 |
logger.error(f"Error storing articles: {e}")
|
209 |
|
|
|
1 |
import os
|
2 |
import feedparser
|
3 |
+
from chromadb import Client, Documents
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
|
|
163 |
if not os.path.exists(LOCAL_DB_DIR):
|
164 |
os.makedirs(LOCAL_DB_DIR)
|
165 |
|
166 |
+
client = Client(persist_directory=LOCAL_DB_DIR)
|
167 |
+
collection = client.get_or_create_collection(name=COLLECTION_NAME)
|
|
|
|
|
|
|
168 |
|
169 |
try:
|
170 |
+
existing_ids = set(collection.get(include=[])["ids"])
|
171 |
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
172 |
except Exception as e:
|
173 |
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
|
|
198 |
|
199 |
if docs_to_add:
|
200 |
try:
|
201 |
+
embeddings = get_embedding_model()
|
202 |
+
for doc, doc_id in zip(docs_to_add, ids_to_add):
|
203 |
+
collection.add(
|
204 |
+
documents=[doc.page_content],
|
205 |
+
metadatas=[doc.metadata],
|
206 |
+
ids=[doc_id],
|
207 |
+
embeddings=[embeddings.embed_query(doc.page_content)]
|
208 |
+
)
|
209 |
+
client.persist()
|
210 |
+
logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {collection.count()}")
|
211 |
except Exception as e:
|
212 |
logger.error(f"Error storing articles: {e}")
|
213 |
|