broadfield-dev committed on
Commit
2d5f1d5
·
verified ·
1 Parent(s): 30a7f9f

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +59 -34
rss_processor.py CHANGED
@@ -32,11 +32,16 @@ def get_embedding_model():
32
  return get_embedding_model.model
33
 
34
  # Initialize Chroma database globally
35
- vector_db = Chroma(
36
- persist_directory=LOCAL_DB_DIR,
37
- embedding_function=get_embedding_model(),
38
- collection_name=COLLECTION_NAME
39
- )
 
 
 
 
 
40
 
41
  def clean_text(text):
42
  if not text or not isinstance(text, str):
@@ -111,39 +116,60 @@ def fetch_rss_feeds():
111
  return articles
112
 
113
  def process_and_store_articles(articles):
 
 
 
 
114
  try:
115
  existing_ids = set(vector_db.get(include=[])["ids"])
116
- except Exception:
 
 
117
  existing_ids = set()
118
 
119
  docs_to_add = []
120
  ids_to_add = []
121
 
122
  for article in articles:
123
- cleaned_title = clean_text(article["title"])
124
- cleaned_link = clean_text(article["link"])
125
- doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
126
-
127
- if doc_id in existing_ids:
128
- continue
129
-
130
- metadata = {
131
- "title": article["title"],
132
- "link": article["link"],
133
- "original_description": article["description"],
134
- "published": article["published"],
135
- "category": article["category"],
136
- "image": article["image"],
137
- }
138
- doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
139
- docs_to_add.append(doc)
140
- ids_to_add.append(doc_id)
141
- existing_ids.add(doc_id)
 
 
 
 
 
 
 
 
 
 
142
 
143
  if docs_to_add:
144
- vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
145
- vector_db.persist()
146
- logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
 
 
 
 
 
147
 
148
  def download_from_hf_hub():
149
  if not os.path.exists(LOCAL_DB_DIR):
@@ -185,13 +211,12 @@ def upload_to_hf_hub():
185
  logger.error(f"Error uploading to Hugging Face Hub: {e}")
186
 
187
  if __name__ == "__main__":
188
- download_from_hf_hub() # Ensure DB is initialized
 
189
  if not os.path.exists(FEEDS_FILE):
190
  logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
191
  exit(1)
192
  articles = fetch_rss_feeds()
193
- if not articles:
194
- logger.warning("No articles fetched. Database remains empty.")
195
- else:
196
- process_and_store_articles(articles)
197
- upload_to_hf_hub()
 
32
  return get_embedding_model.model
33
 
34
# Initialize Chroma database globally so every function shares one client.
# If the store cannot be opened there is nothing useful to do, so abort.
try:
    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )
    logger.info("Chroma database initialized successfully")
except Exception as e:
    logger.error("Failed to initialize Chroma database: %s", e)
    # raise SystemExit rather than calling the site-module exit() builtin,
    # which is intended for interactive sessions and may not exist.
    raise SystemExit(1)
 
46
  def clean_text(text):
47
  if not text or not isinstance(text, str):
 
116
  return articles
117
 
118
def process_and_store_articles(articles):
    """Embed and store new RSS articles in the global Chroma vector DB.

    Deduplicates against documents already in the collection (and within the
    incoming batch) using a ``title|link|published`` composite ID.  Articles
    whose cleaned description is empty are skipped.  Per-article failures are
    logged and swallowed so one malformed entry cannot abort the whole run.

    Args:
        articles: list of dicts with keys ``title``, ``link``, ``description``,
            ``published``, ``category`` and ``image`` (as built by
            fetch_rss_feeds — TODO confirm against that function).
    """
    if not articles:
        logger.warning("No articles to process")
        return

    try:
        # include=[] asks Chroma for IDs only — no embeddings or documents.
        existing_ids = set(vector_db.get(include=[])["ids"])
        logger.info("Existing documents in DB: %d", len(existing_ids))
    except Exception as e:
        # A brand-new/empty store may raise here; start from an empty ID set.
        logger.error("Error retrieving existing IDs: %s", e)
        existing_ids = set()

    docs_to_add = []
    ids_to_add = []

    for article in articles:
        try:
            cleaned_title = clean_text(article["title"])
            cleaned_link = clean_text(article["link"])
            cleaned_description = clean_text(article["description"])
            if not cleaned_description:
                logger.warning("Skipping article with empty description: %s", cleaned_title)
                continue

            # Composite ID makes re-runs idempotent for unchanged feed items.
            doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
            if doc_id in existing_ids:
                logger.debug("Skipping duplicate article: %s", doc_id)
                continue

            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            docs_to_add.append(Document(page_content=cleaned_description, metadata=metadata))
            ids_to_add.append(doc_id)
            # Track in-batch too, so the same story from two feeds is added once.
            existing_ids.add(doc_id)
            logger.debug("Prepared document for article: %s", cleaned_title)
        except Exception as e:
            logger.error("Error processing article %s: %s", article.get("title", "Unknown"), e)

    if docs_to_add:
        try:
            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
            vector_db.persist()
            # NOTE(review): _collection is a private Chroma attribute; it is
            # only used here for a log-time count.
            logger.info(
                "Added %d new articles to DB. Total in DB: %d",
                len(docs_to_add),
                vector_db._collection.count(),
            )
        except Exception as e:
            logger.error("Error adding documents to Chroma: %s", e)
    else:
        logger.warning("No new documents to add to the database")
173
 
174
  def download_from_hf_hub():
175
  if not os.path.exists(LOCAL_DB_DIR):
 
211
  logger.error(f"Error uploading to Hugging Face Hub: {e}")
212
 
213
if __name__ == "__main__":
    logger.info("Starting script execution")
    # Pull any existing DB snapshot from the Hub before processing so we
    # dedupe against previously stored articles.
    download_from_hf_hub()
    if not os.path.exists(FEEDS_FILE):
        logger.error("Missing %s. Please create it with RSS feed URLs.", FEEDS_FILE)
        # SystemExit instead of the interactive-only exit() builtin.
        raise SystemExit(1)
    articles = fetch_rss_feeds()
    # process_and_store_articles no-ops (with a warning) on an empty list.
    process_and_store_articles(articles)
    upload_to_hf_hub()
    logger.info("Script execution completed")