broadfield-dev committed
Commit ebf6a83 · verified · Parent: 24c5dcb

Update rss_processor.py

Files changed (1):
  1. rss_processor.py +37 -80
rss_processor.py CHANGED
@@ -21,28 +21,14 @@ HF_API_TOKEN = os.getenv("HF_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 FEEDS_FILE = "rss_feeds.json"
 
-# Initialize Hugging Face API and login
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
 
-# Initialize embedding model
 def get_embedding_model():
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return get_embedding_model.model
 
-# Initialize Chroma database globally
-try:
-    vector_db = Chroma(
-        persist_directory=LOCAL_DB_DIR,
-        embedding_function=get_embedding_model(),
-        collection_name=COLLECTION_NAME
-    )
-    logger.info("Chroma database initialized successfully")
-except Exception as e:
-    logger.error(f"Failed to initialize Chroma database: {e}")
-    exit(1)
-
 def clean_text(text):
     if not text or not isinstance(text, str):
         return ""
@@ -69,13 +55,14 @@ def fetch_rss_feeds():
 
         try:
             logger.info(f"Fetching '{feed_info.get('name', feed_url)}' from category '{category}'")
+            # Add a User-Agent to prevent getting blocked
             feed = feedparser.parse(feed_url, agent="RSSNewsBot/1.0 (+http://huggingface.co/spaces/broadfield-dev/RSS_News)")
+
             if feed.bozo:
                 logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                 continue
 
-            #for entry in feed.entries[:10]:
-            for entry in feed.entries:
+            for entry in feed.entries[:10]:  # Process max 10 entries per feed
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", ""))
@@ -107,7 +94,7 @@ def fetch_rss_feeds():
                     "link": link,
                     "description": description,
                     "published": published,
-                    "category": category,
+                    "category": category,  # Directly use category from JSON
                     "image": image,
                 })
         except Exception as e:
@@ -117,67 +104,49 @@ def fetch_rss_feeds():
     return articles
 
 def process_and_store_articles(articles):
-    if not articles:
-        logger.warning("No articles to process")
-        return
-
+    vector_db = Chroma(
+        persist_directory=LOCAL_DB_DIR,
+        embedding_function=get_embedding_model(),
+        collection_name=COLLECTION_NAME
+    )
+
     try:
         existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Existing documents in DB: {len(existing_ids)}")
-    except Exception as e:
-        logger.error(f"Error retrieving existing IDs: {e}")
+    except Exception:
         existing_ids = set()
 
     docs_to_add = []
     ids_to_add = []
 
     for article in articles:
-        try:
-            cleaned_title = clean_text(article["title"])
-            cleaned_link = clean_text(article["link"])
-            cleaned_description = clean_text(article["description"])
-            if not cleaned_description:
-                logger.warning(f"Skipping article with empty description: {cleaned_title}")
-                continue
-
-            doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
-
-            if doc_id in existing_ids:
-                logger.debug(f"Skipping duplicate article: {doc_id}")
-                continue
-
-            metadata = {
-                "title": article["title"],
-                "link": article["link"],
-                "original_description": article["description"],
-                "published": article["published"],
-                "category": article["category"],
-                "image": article["image"],
-            }
-            doc = Document(page_content=cleaned_description, metadata=metadata)
-            docs_to_add.append(doc)
-            ids_to_add.append(doc_id)
-            existing_ids.add(doc_id)
-            logger.debug(f"Prepared document for article: {cleaned_title}")
-        except Exception as e:
-            logger.error(f"Error processing article {article.get('title', 'Unknown')}: {e}")
+        cleaned_title = clean_text(article["title"])
+        cleaned_link = clean_text(article["link"])
+        doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
+
+        if doc_id in existing_ids:
+            continue
+
+        metadata = {
+            "title": article["title"],
+            "link": article["link"],
+            "original_description": article["description"],
+            "published": article["published"],
+            "category": article["category"],
+            "image": article["image"],
+        }
+        doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
+        docs_to_add.append(doc)
+        ids_to_add.append(doc_id)
+        existing_ids.add(doc_id)
 
     if docs_to_add:
-        try:
-            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
-            vector_db.persist()
-            logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
-        except Exception as e:
-            logger.error(f"Error adding documents to Chroma: {e}")
-    else:
-        logger.warning("No new documents to add to the database")
+        vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
+        vector_db.persist()
+        logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
 
 def download_from_hf_hub():
     if not os.path.exists(LOCAL_DB_DIR):
         try:
-            # Create repo if it doesn't exist
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Downloading Chroma DB from {REPO_ID}...")
             snapshot_download(
                 repo_id=REPO_ID,
                 repo_type="dataset",
@@ -186,19 +155,12 @@ def download_from_hf_hub():
                 allow_patterns=f"{LOCAL_DB_DIR}/**",
                 token=HF_API_TOKEN
             )
-            logger.info(f"Successfully downloaded database from {REPO_ID}")
         except Exception as e:
-            logger.warning(f"Could not download DB from Hub (normal on first run): {e}")
-            # Ensure database is initialized even if download fails
-            vector_db.persist()
-            logger.info(f"Initialized empty Chroma database at {LOCAL_DB_DIR}")
-    else:
-        logger.info("Local Chroma DB exists, loading existing data.")
+            logger.warning(f"Could not download DB from Hub (this is normal on first run): {e}")
 
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
         try:
-            logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
             hf_api.upload_folder(
                 folder_path=LOCAL_DB_DIR,
                 path_in_repo=LOCAL_DB_DIR,
@@ -207,17 +169,12 @@ def upload_to_hf_hub():
                 token=HF_API_TOKEN,
                 commit_message="Update RSS news database"
             )
-            logger.info(f"Database uploaded to: {REPO_ID}")
         except Exception as e:
            logger.error(f"Error uploading to Hugging Face Hub: {e}")
 
 if __name__ == "__main__":
-    logger.info("Starting script execution")
     download_from_hf_hub()
-    if not os.path.exists(FEEDS_FILE):
-        logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
-        exit(1)
     articles = fetch_rss_feeds()
-    process_and_store_articles(articles)
-    upload_to_hf_hub()
-    logger.info("Script execution completed")
+    if articles:
+        process_and_store_articles(articles)
+        upload_to_hf_hub()
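
Note on the feed configuration: fetch_rss_feeds() reads each feed's display name via feed_info.get('name', feed_url) and tags every article with a category taken straight from the JSON (see the "Directly use category from JSON" comment above). The schema of rss_feeds.json itself is not part of this commit; a minimal sketch consistent with those accesses, where every key other than "name" is an assumption, would be:

    # Hypothetical shape of rss_feeds.json (not shown in this commit):
    # { "Technology": [ {"name": "HF Blog", "url": "https://huggingface.co/blog/feed.xml"} ] }
    import json

    with open("rss_feeds.json") as f:
        feeds_by_category = json.load(f)

    for category, feed_list in feeds_by_category.items():
        for feed_info in feed_list:
            feed_url = feed_info.get("url", "")  # the "url" key is assumed
            print(category, feed_info.get("name", feed_url))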
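For reference, the diff starts at line 21 of rss_processor.py, so the file's imports and the LOCAL_DB_DIR and COLLECTION_NAME constants never appear in the hunks. A minimal prelude consistent with the names the hunks use is sketched below; the import paths and constant values are assumptions, not the committed code:

    # Hypothetical prelude for rss_processor.py (not shown in the diff).
    # Each import matches a name used in the hunks; constant values are guesses.
    import os
    import logging

    import feedparser
    from huggingface_hub import HfApi, login, snapshot_download
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_core.documents import Document

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    LOCAL_DB_DIR = "chroma_db"         # assumed value
    COLLECTION_NAME = "news_articles"  # assumed value

    HF_API_TOKEN = os.getenv("HF_TOKEN")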
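One detail the commit leaves unchanged is the lazy model load in get_embedding_model(): the embedding model is cached as an attribute on the function itself, so the weights are loaded once on first use instead of at import time. A minimal illustration of the same pattern, with a stand-in object in place of the real model:

    # Sketch of the function-attribute memoization used by get_embedding_model().
    def get_resource():
        if not hasattr(get_resource, "cached"):
            get_resource.cached = object()  # stand-in for an expensive load
        return get_resource.cached

    assert get_resource() is get_resource()  # one instance, reused on every call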