broadfield-dev committed on
Commit
522b0df
·
verified ·
1 Parent(s): 46fd3e1

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +14 -10
rss_processor.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import feedparser
3
- from langchain.vectorstores import Chroma
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.docstore.document import Document
6
  import logging
@@ -163,14 +163,11 @@ def process_and_store_articles(articles):
163
  if not os.path.exists(LOCAL_DB_DIR):
164
  os.makedirs(LOCAL_DB_DIR)
165
 
166
- vector_db = Chroma(
167
- persist_directory=LOCAL_DB_DIR,
168
- embedding_function=get_embedding_model(),
169
- collection_name=COLLECTION_NAME
170
- )
171
 
172
  try:
173
- existing_ids = set(vector_db.get(include=[])["ids"])
174
  logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
175
  except Exception as e:
176
  logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
@@ -201,9 +198,16 @@ def process_and_store_articles(articles):
201
 
202
  if docs_to_add:
203
  try:
204
- vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
205
- vector_db._client.persist()
206
- logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
 
 
 
 
 
 
 
207
  except Exception as e:
208
  logger.error(f"Error storing articles: {e}")
209
 
 
1
  import os
2
  import feedparser
3
+ from chromadb import Client, Documents
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.docstore.document import Document
6
  import logging
 
163
  if not os.path.exists(LOCAL_DB_DIR):
164
  os.makedirs(LOCAL_DB_DIR)
165
 
166
+ client = Client(persist_directory=LOCAL_DB_DIR)
167
+ collection = client.get_or_create_collection(name=COLLECTION_NAME)
 
 
 
168
 
169
  try:
170
+ existing_ids = set(collection.get(include=[])["ids"])
171
  logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
172
  except Exception as e:
173
  logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
 
198
 
199
  if docs_to_add:
200
  try:
201
+ embeddings = get_embedding_model()
202
+ for doc, doc_id in zip(docs_to_add, ids_to_add):
203
+ collection.add(
204
+ documents=[doc.page_content],
205
+ metadatas=[doc.metadata],
206
+ ids=[doc_id],
207
+ embeddings=[embeddings.embed_query(doc.page_content)]
208
+ )
209
+ client.persist()
210
+ logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {collection.count()}")
211
  except Exception as e:
212
  logger.error(f"Error storing articles: {e}")
213