broadfield-dev committed on
Commit
8c48251
·
verified ·
1 Parent(s): 104836a

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +11 -3
rss_processor.py CHANGED
@@ -160,15 +160,19 @@ def categorize_feed(url):
160
  return "Uncategorized"
161
 
162
  def process_and_store_articles(articles):
163
- if os.path.exists(LOCAL_DB_DIR):
164
- shutil.rmtree(LOCAL_DB_DIR)
165
-
166
  vector_db = Chroma(
167
  persist_directory=LOCAL_DB_DIR,
168
  embedding_function=get_embedding_model(),
169
  collection_name=COLLECTION_NAME
170
  )
171
 
 
 
 
 
 
 
 
172
  docs_to_add = []
173
  ids_to_add = []
174
 
@@ -177,6 +181,9 @@ def process_and_store_articles(articles):
177
  cleaned_link = clean_text(article["link"])
178
  doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
179
 
 
 
 
180
  metadata = {
181
  "title": article["title"],
182
  "link": article["link"],
@@ -188,6 +195,7 @@ def process_and_store_articles(articles):
188
  doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
189
  docs_to_add.append(doc)
190
  ids_to_add.append(doc_id)
 
191
 
192
  if docs_to_add:
193
  try:
 
160
  return "Uncategorized"
161
 
162
  def process_and_store_articles(articles):
 
 
 
163
  vector_db = Chroma(
164
  persist_directory=LOCAL_DB_DIR,
165
  embedding_function=get_embedding_model(),
166
  collection_name=COLLECTION_NAME
167
  )
168
 
169
+ try:
170
+ existing_ids = set(vector_db.get(include=[])["ids"])
171
+ logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
172
+ except Exception as e:
173
+ logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
174
+ existing_ids = set()
175
+
176
  docs_to_add = []
177
  ids_to_add = []
178
 
 
181
  cleaned_link = clean_text(article["link"])
182
  doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
183
 
184
+ if doc_id in existing_ids:
185
+ continue
186
+
187
  metadata = {
188
  "title": article["title"],
189
  "link": article["link"],
 
195
  doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
196
  docs_to_add.append(doc)
197
  ids_to_add.append(doc_id)
198
+ existing_ids.add(doc_id)
199
 
200
  if docs_to_add:
201
  try: