Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +62 -91
rss_processor.py
CHANGED
@@ -5,35 +5,29 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
7 |
from huggingface_hub import HfApi, login, snapshot_download
|
8 |
-
|
9 |
-
import
|
10 |
from datetime import datetime
|
11 |
import dateutil.parser
|
12 |
import hashlib
|
13 |
import re
|
14 |
|
15 |
-
logging.basicConfig(level=logging.INFO
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
-
MAX_ARTICLES_PER_FEED = 1000
|
19 |
LOCAL_DB_DIR = "chroma_db"
|
20 |
-
|
21 |
COLLECTION_NAME = "news_articles"
|
22 |
HF_API_TOKEN = os.getenv("HF_TOKEN")
|
23 |
REPO_ID = "broadfield-dev/news-rag-db"
|
24 |
|
25 |
-
if not HF_API_TOKEN:
|
26 |
-
raise ValueError("HF_TOKEN environment variable not set.")
|
27 |
-
|
28 |
login(token=HF_API_TOKEN)
|
29 |
hf_api = HfApi()
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
'''def get_embedding_model():
|
34 |
if not hasattr(get_embedding_model, "model"):
|
35 |
get_embedding_model.model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
36 |
-
return get_embedding_model.model
|
37 |
|
38 |
def clean_text(text):
|
39 |
if not text or not isinstance(text, str):
|
@@ -45,85 +39,62 @@ def clean_text(text):
|
|
45 |
def fetch_rss_feeds():
|
46 |
articles = []
|
47 |
seen_keys = set()
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
return []
|
55 |
-
|
56 |
-
for category, feeds in feed_categories.items():
|
57 |
-
for feed_info in feeds:
|
58 |
-
feed_url = feed_info.get("url")
|
59 |
-
if not feed_url:
|
60 |
-
logger.warning(f"Skipping feed with no URL in category '{category}'")
|
61 |
continue
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
90 |
break
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
image
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
]:
|
107 |
-
try:
|
108 |
-
img = img_source(entry)
|
109 |
-
if img and img.strip():
|
110 |
-
image = img
|
111 |
-
break
|
112 |
-
except (IndexError, AttributeError, TypeError):
|
113 |
-
continue
|
114 |
-
|
115 |
-
articles.append({
|
116 |
-
"title": title_raw,
|
117 |
-
"link": link,
|
118 |
-
"description": clean_desc_val,
|
119 |
-
"published": published,
|
120 |
-
"category": category,
|
121 |
-
"image": image,
|
122 |
-
})
|
123 |
-
article_count += 1
|
124 |
-
except Exception as e:
|
125 |
-
logger.error(f"Error fetching {feed_url}: {e}")
|
126 |
-
logger.info(f"Total unique articles fetched: {len(articles)}")
|
127 |
return articles
|
128 |
|
129 |
def categorize_feed(url):
|
@@ -167,8 +138,8 @@ def categorize_feed(url):
|
|
167 |
def process_and_store_articles(articles):
|
168 |
vector_db = Chroma(
|
169 |
persist_directory=LOCAL_DB_DIR,
|
170 |
-
|
171 |
-
embedding_function=embedding_model,
|
172 |
collection_name=COLLECTION_NAME
|
173 |
)
|
174 |
|
|
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
7 |
from huggingface_hub import HfApi, login, snapshot_download
|
8 |
+
import shutil
|
9 |
+
import rss_feeds
|
10 |
from datetime import datetime
|
11 |
import dateutil.parser
|
12 |
import hashlib
|
13 |
import re
|
14 |
|
15 |
+
# --- Storage / source configuration -----------------------------------------
# Directory passed to Chroma as ``persist_directory``.
LOCAL_DB_DIR = "chroma_db"
# Name of the Chroma collection the articles are stored in.
COLLECTION_NAME = "news_articles"
# Feed URLs iterated by fetch_rss_feeds(); supplied by the rss_feeds module.
RSS_FEEDS = rss_feeds.RSS_FEEDS
# Hugging Face Hub repository id (its use is outside this view).
REPO_ID = "broadfield-dev/news-rag-db"
# Access token read from the HF_TOKEN environment variable; None when unset.
HF_API_TOKEN = os.getenv("HF_TOKEN")

# --- Logging -----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Hugging Face Hub session (runs at import time) ---------------------------
login(token=HF_API_TOKEN)
hf_api = HfApi()
|
26 |
|
27 |
+
def get_embedding_model():
    """Return the shared ``HuggingFaceEmbeddings`` instance, building it on first use.

    The instance is stored as the ``model`` attribute of this function object,
    so the embeddings object is constructed at most once per process and every
    later call returns that same object.
    """
    cached = getattr(get_embedding_model, "model", None)
    if cached is None:
        cached = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        get_embedding_model.model = cached
    return cached
|
31 |
|
32 |
def clean_text(text):
|
33 |
if not text or not isinstance(text, str):
|
|
|
39 |
def fetch_rss_feeds(max_articles_per_feed=10):
    """Fetch every feed in ``RSS_FEEDS`` and return the de-duplicated articles.

    Each URL is parsed with ``feedparser``. A feed that feedparser flags as
    malformed (``feed.bozo``) is logged and skipped entirely. Articles are
    de-duplicated across all feeds on the key
    ``"<cleaned title>|<cleaned link>|<published>"``.

    Args:
        max_articles_per_feed: Maximum number of articles kept from any single
            feed. Defaults to 10, the limit that used to be hard-coded.

    Returns:
        A list of dicts, one per article, with the keys ``title``, ``link``,
        ``description``, ``published``, ``category`` and ``image``.
        ``title``, ``link`` and ``description`` are the raw feed values;
        cleaned text is used only for the de-duplication key.
    """
    articles = []
    seen_keys = set()  # shared by all feeds so cross-feed duplicates are dropped

    for feed_url in RSS_FEEDS:
        # Per-feed boundary: whatever goes wrong with one feed is logged and the
        # remaining feeds are still processed.
        try:
            logger.info("Fetching %s", feed_url)
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(
                    "Parse error for %s: %s", feed_url, feed.bozo_exception
                )
                continue

            # The category depends only on the feed URL, so compute it once.
            category = categorize_feed(feed_url)
            kept = 0
            for entry in feed.entries:
                if kept >= max_articles_per_feed:
                    break

                title = entry.get("title", "No Title")
                link = entry.get("link", "")
                description = entry.get("summary", entry.get("description", ""))
                published = _parse_published(entry)

                key = f"{clean_text(title)}|{clean_text(link)}|{published}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)

                articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "published": published,
                    "category": category,
                    "image": _extract_image(entry),
                })
                kept += 1
        except Exception as e:
            logger.error("Error fetching %s: %s", feed_url, e)

    logger.info("Total articles fetched: %d", len(articles))
    return articles


def _parse_published(entry):
    """Return the entry's first parseable date as ``YYYY-MM-DD HH:MM:SS``.

    The fields ``published``, ``updated``, ``created`` and ``pubDate`` are
    tried in that order; ``"Unknown Date"`` is returned when none of them is
    present and parseable.
    """
    for date_field in ("published", "updated", "created", "pubDate"):
        if date_field not in entry:
            continue
        try:
            parsed = dateutil.parser.parse(entry[date_field])
            return parsed.strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError, OverflowError):
            # OverflowError is raised by dateutil for out-of-range dates; it
            # must not escape, or one bad entry would abort the whole feed.
            continue
    return "Unknown Date"


def _extract_image(entry):
    """Return the entry's first usable media URL, or the placeholder ``"svg"``.

    ``media_content`` is checked before ``media_thumbnail``; the URL of the
    first item is passed through ``clean_text`` and accepted only if it is
    non-blank.
    """
    for media_key in ("media_content", "media_thumbnail"):
        try:
            media = entry.get(media_key)
            if not media:
                continue
            candidate = clean_text(media[0].get("url"))
        except (IndexError, AttributeError, TypeError):
            continue
        if candidate and candidate.strip():
            return candidate
    return "svg"
|
99 |
|
100 |
def categorize_feed(url):
|
|
|
138 |
def process_and_store_articles(articles):
|
139 |
vector_db = Chroma(
|
140 |
persist_directory=LOCAL_DB_DIR,
|
141 |
+
embedding_function=get_embedding_model(),
|
142 |
+
#embedding_function=embedding_model,
|
143 |
collection_name=COLLECTION_NAME
|
144 |
)
|
145 |
|