broadfield-dev committed
Commit 9271377 · verified · 1 Parent(s): acc164e

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +62 -91
rss_processor.py CHANGED
@@ -5,35 +5,29 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 import logging
 from huggingface_hub import HfApi, login, snapshot_download
-from huggingface_hub.utils import HfHubHTTPError
-import json
+import shutil
+import rss_feeds
 from datetime import datetime
 import dateutil.parser
 import hashlib
 import re
 
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-MAX_ARTICLES_PER_FEED = 1000
 LOCAL_DB_DIR = "chroma_db"
-FEEDS_FILE = "rss_feeds.json"
+RSS_FEEDS = rss_feeds.RSS_FEEDS
 COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 
-if not HF_API_TOKEN:
-    raise ValueError("HF_TOKEN environment variable not set.")
-
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
 
-embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-'''def get_embedding_model():
+def get_embedding_model():
     if not hasattr(get_embedding_model, "model"):
         get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    return get_embedding_model.model'''
+    return get_embedding_model.model
 
 def clean_text(text):
     if not text or not isinstance(text, str):
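The new code imports a local rss_feeds module and reads rss_feeds.RSS_FEEDS in place of the old rss_feeds.json file. That module is not part of this commit; a minimal sketch of what it presumably contains, with placeholder URLs rather than the repo's real feed list:

    # rss_feeds.py -- hypothetical companion module, not included in this commit.
    # fetch_rss_feeds() iterates RSS_FEEDS directly and derives each article's
    # category from the URL via categorize_feed(), so a flat list of URLs suffices.
    RSS_FEEDS = [
        "https://example.com/technology/rss.xml",  # placeholder
        "https://example.org/science/feed",        # placeholder
    ]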
@@ -45,85 +39,62 @@ def clean_text(text):
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
-
-    try:
-        with open(FEEDS_FILE, 'r') as f:
-            feed_categories = json.load(f)
-    except FileNotFoundError:
-        logger.error(f"{FEEDS_FILE} not found. No feeds to process.")
-        return []
-
-    for category, feeds in feed_categories.items():
-        for feed_info in feeds:
-            feed_url = feed_info.get("url")
-            if not feed_url:
-                logger.warning(f"Skipping feed with no URL in category '{category}'")
+    for feed_url in RSS_FEEDS:
+        try:
+            logger.info(f"Fetching {feed_url}")
+            feed = feedparser.parse(feed_url)
+            if feed.bozo:
+                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                 continue
-
-            try:
-                logger.info(f"Fetching {feed_url}")
-                feed = feedparser.parse(feed_url)
-                if feed.bozo:
-                    logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
-                    continue
-                article_count = 0
-                for entry in feed.entries:
-                    if article_count >= MAX_ARTICLES_PER_FEED:
-                        break
-
-                    title_raw = entry.get("title", "No Title")
-                    link = entry.get("link", "")
-                    description = entry.get("summary", entry.get("description", ""))
-
-                    clean_title_val = clean_text(title_raw)
-                    clean_desc_val = clean_text(description)
-
-                    if not clean_desc_val:
-                        continue
-
-                    published = "Unknown Date"
-                    for date_field in ["published", "updated", "created", "pubDate"]:
-                        if date_field in entry:
-                            try:
-                                parsed_date = dateutil.parser.parse(entry[date_field])
-                                published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
+            article_count = 0
+            for entry in feed.entries:
+                if article_count >= 10:
+                    break
+                title = entry.get("title", "No Title")
+                link = entry.get("link", "")
+                description = entry.get("summary", entry.get("description", ""))
+
+                cleaned_title = clean_text(title)
+                cleaned_link = clean_text(link)
+
+                published = "Unknown Date"
+                for date_field in ["published", "updated", "created", "pubDate"]:
+                    if date_field in entry:
+                        try:
+                            parsed_date = dateutil.parser.parse(entry[date_field])
+                            published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
+                            break
+                        except (ValueError, TypeError):
+                            continue
+
+                key = f"{cleaned_title}|{cleaned_link}|{published}"
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    image = "svg"
+                    for img_source in [
+                        lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
+                        lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
+                    ]:
+                        try:
+                            img = img_source(entry)
+                            if img and img.strip():
+                                image = img
                                 break
-                            except (ValueError, TypeError):
-                                continue
-
-                    description_hash = hashlib.sha256(clean_desc_val.encode('utf-8')).hexdigest()
-                    key = f"{clean_title_val}|{link}|{published}|{description_hash}"
-
-                    if key not in seen_keys:
-                        seen_keys.add(key)
-
-                        image = "svg"
-                        for img_source in [
-                            lambda e: e.get("media_content", [{}])[0].get("url") if e.get("media_content") else "",
-                            lambda e: e.get("media_thumbnail", [{}])[0].get("url") if e.get("media_thumbnail") else "",
-                            lambda e: e.get("enclosure", {}).get("url") if e.get("enclosure") else "",
-                            lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), ""),
-                        ]:
-                            try:
-                                img = img_source(entry)
-                                if img and img.strip():
-                                    image = img
-                                    break
-                            except (IndexError, AttributeError, TypeError):
-                                continue
-
-                        articles.append({
-                            "title": title_raw,
-                            "link": link,
-                            "description": clean_desc_val,
-                            "published": published,
-                            "category": category,
-                            "image": image,
-                        })
-                        article_count += 1
-            except Exception as e:
-                logger.error(f"Error fetching {feed_url}: {e}")
-    logger.info(f"Total unique articles fetched: {len(articles)}")
+                        except (IndexError, AttributeError, TypeError):
+                            continue
+
+                    articles.append({
+                        "title": title,
+                        "link": link,
+                        "description": description,
+                        "published": published,
+                        "category": categorize_feed(feed_url),
+                        "image": image,
+                    })
+                    article_count += 1
+        except Exception as e:
+            logger.error(f"Error fetching {feed_url}: {e}")
+    logger.info(f"Total articles fetched: {len(articles)}")
     return articles
 
 def categorize_feed(url):
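A quick way to exercise the reworked fetcher is a small smoke test (hypothetical, not part of the commit; it assumes HF_TOKEN is set, since importing rss_processor calls login() at import time, and that the configured feeds are reachable):

    # smoke_test.py -- illustrative only
    from rss_processor import fetch_rss_feeds

    articles = fetch_rss_feeds()  # the new version caps at 10 articles per feed
    print(f"Fetched {len(articles)} unique articles")
    for art in articles[:3]:
        # dedup key is now cleaned title|link|published; the description hash was dropped
        print(art["published"], art["category"], art["title"])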
@@ -167,8 +138,8 @@ def categorize_feed(url):
 def process_and_store_articles(articles):
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
-        #embedding_function=get_embedding_model(),
-        embedding_function=embedding_model,
+        embedding_function=get_embedding_model(),
+        #embedding_function=embedding_model,
         collection_name=COLLECTION_NAME
     )
 
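Only the Chroma constructor of process_and_store_articles appears in this hunk; switching to get_embedding_model() means the embedding model is created lazily on first use rather than at import. The rest of the function is not shown here, but the usual pattern for storing such articles looks roughly like the sketch below (assumed, not the repo's actual code; store_articles_sketch is a hypothetical name):

    # Hypothetical continuation: convert fetched article dicts into LangChain
    # Documents and add them to the persistent Chroma collection.
    from langchain.docstore.document import Document

    def store_articles_sketch(vector_db, articles):
        docs = [
            Document(
                page_content=art["description"],
                metadata={
                    "title": art["title"],
                    "link": art["link"],
                    "published": art["published"],
                    "category": art["category"],
                    "image": art["image"],
                },
            )
            for art in articles
        ]
        vector_db.add_documents(docs)  # embeds via the function passed to Chroma
        vector_db.persist()            # flush chroma_db to disk before any Hub upload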