Update rss_processor.py
rss_processor.py: +4 −18
@@ -17,7 +17,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 LOCAL_DB_DIR = "chroma_db"
-#RSS_FEEDS = rss_feeds.RSS_FEEDS
 FEEDS_FILE = "rss_feeds.json"
 COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
@@ -161,20 +160,15 @@ def categorize_feed(url):
     return "Uncategorized"

 def process_and_store_articles(articles):
+    if os.path.exists(LOCAL_DB_DIR):
+        shutil.rmtree(LOCAL_DB_DIR)
+
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
         embedding_function=get_embedding_model(),
-        #embedding_function=embedding_model,
         collection_name=COLLECTION_NAME
     )

-    try:
-        existing_ids = set(vector_db.get(include=[])["ids"])
-        logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
-    except Exception:
-        existing_ids = set()
-        logger.info("No existing DB found or it is empty. Starting fresh.")
-
     docs_to_add = []
     ids_to_add = []

@@ -183,9 +177,6 @@ def process_and_store_articles(articles):
         cleaned_link = clean_text(article["link"])
         doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"

-        if doc_id in existing_ids:
-            continue
-
         metadata = {
             "title": article["title"],
             "link": article["link"],
@@ -197,7 +188,6 @@ def process_and_store_articles(articles):
         doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
         docs_to_add.append(doc)
         ids_to_add.append(doc_id)
-        existing_ids.add(doc_id)

     if docs_to_add:
         try:
@@ -239,8 +229,4 @@ def upload_to_hf_hub():
         )
         logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
     except Exception as e:
-        logger.error(f"Error uploading to Hugging Face Hub: {e}")
-
-
-
-
+        logger.error(f"Error uploading to Hugging Face Hub: {e}")
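In effect, this commit drops the incremental de-duplication path: process_and_store_articles no longer loads the IDs already stored in the Chroma collection and skips matching articles, but instead deletes the chroma_db directory and rebuilds it from scratch on every run. Below is a minimal sketch of the resulting function, assuming shutil is already imported at the top of rss_processor.py, that the per-article lines sit inside a `for article in articles:` loop, and that the final try block (cut off in the diff) upserts via vector_db.add_documents; those pieces are not shown in the hunks above.

```python
import os
import shutil

# Chroma, Document, get_embedding_model, clean_text, and logger are all
# defined or imported elsewhere in rss_processor.py; they are assumed here.
LOCAL_DB_DIR = "chroma_db"
COLLECTION_NAME = "news_articles"

def process_and_store_articles(articles):
    # New behavior: wipe any previous on-disk DB and rebuild from scratch,
    # instead of loading existing IDs and skipping duplicates.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)

    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )

    docs_to_add, ids_to_add = [], []
    for article in articles:  # loop header assumed; the diff shows only its body
        cleaned_title = clean_text(article["title"])
        cleaned_link = clean_text(article["link"])
        doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"

        metadata = {
            "title": article["title"],
            "link": article["link"],
            # ...remaining metadata fields omitted from the diff...
        }
        doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
        docs_to_add.append(doc)
        ids_to_add.append(doc_id)

    if docs_to_add:
        try:
            # Assumed upsert call; the diff cuts off inside this try block.
            vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
        except Exception as e:
            logger.error(f"Error storing articles: {e}")
```

The trade-off is simplicity over incrementality: every run re-embeds the full article set, but the store can no longer accumulate stale or duplicate entries between runs, which is why the existing_ids bookkeeping could be deleted wholesale.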