broadfield-dev committed
Commit 104836a · verified · 1 Parent(s): 62de7e5

Update rss_processor.py

Files changed (1)
  1. rss_processor.py  +4 -18
rss_processor.py CHANGED
@@ -17,7 +17,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 LOCAL_DB_DIR = "chroma_db"
-#RSS_FEEDS = rss_feeds.RSS_FEEDS
 FEEDS_FILE = "rss_feeds.json"
 COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
@@ -161,20 +160,15 @@ def categorize_feed(url):
     return "Uncategorized"
 
 def process_and_store_articles(articles):
+    if os.path.exists(LOCAL_DB_DIR):
+        shutil.rmtree(LOCAL_DB_DIR)
+
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
         embedding_function=get_embedding_model(),
-        #embedding_function=embedding_model,
         collection_name=COLLECTION_NAME
     )
 
-    try:
-        existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
-    except Exception:
-        existing_ids = set()
-        logger.info("No existing DB found or it is empty. Starting fresh.")
-
     docs_to_add = []
     ids_to_add = []
 
@@ -183,9 +177,6 @@ def process_and_store_articles(articles):
         cleaned_link = clean_text(article["link"])
         doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
 
-        if doc_id in existing_ids:
-            continue
-
         metadata = {
             "title": article["title"],
             "link": article["link"],
@@ -197,7 +188,6 @@ def process_and_store_articles(articles):
         doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
         docs_to_add.append(doc)
         ids_to_add.append(doc_id)
-        existing_ids.add(doc_id)
 
     if docs_to_add:
         try:
@@ -239,8 +229,4 @@ def upload_to_hf_hub():
         )
         logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
     except Exception as e:
-        logger.error(f"Error uploading to Hugging Face Hub: {e}")
-
-
-
-
+        logger.error(f"Error uploading to Hugging Face Hub: {e}")