broadfield-dev commited on
Commit
cf10c85
·
verified ·
1 Parent(s): 66bca53

Create rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +180 -0
rss_processor.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import feedparser
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.docstore.document import Document
6
+ import logging
7
+ from huggingface_hub import HfApi, login, snapshot_download
8
+ import shutil
9
+ import json
10
+ from datetime import datetime
11
+ import dateutil.parser
12
+ import hashlib
13
+ import re
14
+
15
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Local directory where the Chroma vector store persists its data.
LOCAL_DB_DIR = "chroma_db"
COLLECTION_NAME = "news_articles"
# Hugging Face token from the environment; the literal placeholder is the
# "not configured" sentinel.
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
REPO_ID = "broadfield-dev/news-rag-db"
FEEDS_FILE = "rss_feeds.json"

# BUGFIX: only authenticate when a real token is configured. The original
# called login() unconditionally at import time, so an unset env var made
# the placeholder token raise and the whole module fail to import.
if HF_API_TOKEN and HF_API_TOKEN != "YOUR_HF_API_TOKEN":
    login(token=HF_API_TOKEN)
hf_api = HfApi()
27
def get_embedding_model():
    """Return a process-wide singleton embedding model.

    The HuggingFaceEmbeddings instance is expensive to build, so it is
    created lazily on first call and cached as a function attribute;
    every later call returns the same object.
    """
    cached = getattr(get_embedding_model, "model", None)
    if cached is None:
        cached = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        get_embedding_model.model = cached
    return cached
31
+
32
def clean_text(text):
    """Normalize *text* for use as a dedup key / document body.

    Strips HTML-like tags, collapses all runs of whitespace to single
    spaces, and lowercases the result. Falsy or non-string input yields
    the empty string.
    """
    if not isinstance(text, str) or not text:
        return ""
    without_tags = re.sub(r'<.*?>', '', text)
    collapsed = ' '.join(without_tags.split())
    return collapsed.strip().lower()
38
+
39
def fetch_rss_feeds():
    """Fetch and deduplicate articles from every feed listed in FEEDS_FILE.

    FEEDS_FILE is a JSON mapping of category name -> list of feed dicts
    (each with at least a "url" key, optionally "name"). At most 10
    entries are taken per feed. Entries are deduplicated on the cleaned
    title + cleaned link + parsed publish date. Errors on one feed are
    logged and do not abort the others.

    Returns:
        list[dict]: articles with keys title, link, description,
        published, category, image. Empty list when FEEDS_FILE is absent.
    """
    articles = []
    seen_keys = set()

    try:
        with open(FEEDS_FILE, 'r') as f:
            feed_categories = json.load(f)
    except FileNotFoundError:
        logger.error(f"{FEEDS_FILE} not found. No feeds to process.")
        return []

    for category, feeds in feed_categories.items():
        for feed_info in feeds:
            feed_url = feed_info.get("url")
            if not feed_url:
                continue

            try:
                logger.info(f"Fetching '{feed_info.get('name', feed_url)}' from category '{category}'")
                # Add a User-Agent to prevent getting blocked
                feed = feedparser.parse(feed_url, agent="RSSNewsBot/1.0 (+http://huggingface.co/spaces/broadfield-dev/RSS_News)")

                if feed.bozo:
                    logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                    # BUGFIX: feedparser sets bozo even for recoverable
                    # problems (bad dates, minor XML issues) while still
                    # returning usable entries. Only skip the feed when
                    # nothing at all could be parsed.
                    if not feed.entries:
                        continue

                for entry in feed.entries[:10]:  # Process max 10 entries per feed
                    title = entry.get("title", "No Title")
                    link = entry.get("link", "")
                    description = entry.get("summary", entry.get("description", ""))

                    cleaned_title = clean_text(title)
                    cleaned_link = clean_text(link)

                    # Try the common RSS/Atom date fields in order until
                    # one parses; otherwise keep the sentinel.
                    published = "Unknown Date"
                    for date_field in ["published", "updated", "created", "pubDate"]:
                        if date_field in entry:
                            try:
                                parsed_date = dateutil.parser.parse(entry[date_field])
                                published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
                                break
                            except (ValueError, TypeError):
                                continue

                    key = f"{cleaned_title}|{cleaned_link}|{published}"
                    if key not in seen_keys:
                        seen_keys.add(key)
                        # "svg" is the downstream placeholder for "no image".
                        image = "svg"
                        if 'media_content' in entry and entry.media_content:
                            image = entry.media_content[0].get('url', 'svg')
                        elif 'media_thumbnail' in entry and entry.media_thumbnail:
                            image = entry.media_thumbnail[0].get('url', 'svg')

                        articles.append({
                            "title": title,
                            "link": link,
                            "description": description,
                            "published": published,
                            "category": category,  # Directly use category from JSON
                            "image": image,
                        })
            except Exception as e:
                logger.error(f"Error fetching {feed_url}: {e}")

    logger.info(f"Total articles fetched: {len(articles)}")
    return articles
105
+
106
def process_and_store_articles(articles):
    """Embed new articles and persist them in the local Chroma store.

    Each article's identity is its cleaned title and link plus the
    published timestamp; ids already present in the collection (or seen
    earlier in this batch) are skipped so repeated runs do not create
    duplicate documents. The cleaned description becomes the embedded
    page content; the raw fields are kept in the document metadata.
    """
    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )

    # Best effort: an empty/fresh store may raise here, in which case we
    # simply start from an empty id set.
    try:
        existing_ids = set(vector_db.get(include=[])["ids"])
    except Exception:
        existing_ids = set()

    new_docs = []
    new_ids = []

    for article in articles:
        doc_id = "|".join([
            clean_text(article["title"]),
            clean_text(article["link"]),
            article["published"],
        ])
        if doc_id in existing_ids:
            continue
        existing_ids.add(doc_id)

        new_docs.append(Document(
            page_content=clean_text(article["description"]),
            metadata={
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            },
        ))
        new_ids.append(doc_id)

    if new_docs:
        vector_db.add_documents(documents=new_docs, ids=new_ids)
        vector_db.persist()
        logger.info(f"Added {len(new_docs)} new articles to DB. Total in DB: {vector_db._collection.count()}")
146
+
147
def download_from_hf_hub():
    """Pull the persisted Chroma directory from the HF dataset repo.

    A local LOCAL_DB_DIR takes precedence: when it already exists nothing
    is downloaded. A failed download (e.g. the repo is empty on the very
    first run) is logged as a warning and otherwise ignored.
    """
    if os.path.exists(LOCAL_DB_DIR):
        return
    try:
        snapshot_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            local_dir=".",
            local_dir_use_symlinks=False,
            allow_patterns=f"{LOCAL_DB_DIR}/**",
            token=HF_API_TOKEN,
        )
    except Exception as e:
        logger.warning(f"Could not download DB from Hub (this is normal on first run): {e}")
160
+
161
def upload_to_hf_hub():
    """Push the local Chroma directory to the HF dataset repo.

    No-op when LOCAL_DB_DIR does not exist; upload failures are logged
    as errors rather than raised.
    """
    if not os.path.exists(LOCAL_DB_DIR):
        return
    try:
        hf_api.upload_folder(
            folder_path=LOCAL_DB_DIR,
            path_in_repo=LOCAL_DB_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_API_TOKEN,
            commit_message="Update RSS news database",
        )
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face Hub: {e}")
174
+
175
def _main():
    """Run one sync cycle: pull DB, ingest fresh articles, push DB back."""
    download_from_hf_hub()
    fetched = fetch_rss_feeds()
    if fetched:
        process_and_store_articles(fetched)
    upload_to_hf_hub()


if __name__ == "__main__":
    _main()