Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +11 -3
rss_processor.py
CHANGED
@@ -160,15 +160,19 @@ def categorize_feed(url):
|
|
160 |
return "Uncategorized"
|
161 |
|
162 |
def process_and_store_articles(articles):
|
163 |
-
if os.path.exists(LOCAL_DB_DIR):
|
164 |
-
shutil.rmtree(LOCAL_DB_DIR)
|
165 |
-
|
166 |
vector_db = Chroma(
|
167 |
persist_directory=LOCAL_DB_DIR,
|
168 |
embedding_function=get_embedding_model(),
|
169 |
collection_name=COLLECTION_NAME
|
170 |
)
|
171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
docs_to_add = []
|
173 |
ids_to_add = []
|
174 |
|
@@ -177,6 +181,9 @@ def process_and_store_articles(articles):
|
|
177 |
cleaned_link = clean_text(article["link"])
|
178 |
doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
|
179 |
|
|
|
|
|
|
|
180 |
metadata = {
|
181 |
"title": article["title"],
|
182 |
"link": article["link"],
|
@@ -188,6 +195,7 @@ def process_and_store_articles(articles):
|
|
188 |
doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
|
189 |
docs_to_add.append(doc)
|
190 |
ids_to_add.append(doc_id)
|
|
|
191 |
|
192 |
if docs_to_add:
|
193 |
try:
|
|
|
160 |
return "Uncategorized"
|
161 |
|
162 |
def process_and_store_articles(articles):
|
|
|
|
|
|
|
163 |
vector_db = Chroma(
|
164 |
persist_directory=LOCAL_DB_DIR,
|
165 |
embedding_function=get_embedding_model(),
|
166 |
collection_name=COLLECTION_NAME
|
167 |
)
|
168 |
|
169 |
+
try:
|
170 |
+
existing_ids = set(vector_db.get(include=[])["ids"])
|
171 |
+
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
172 |
+
except Exception as e:
|
173 |
+
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
174 |
+
existing_ids = set()
|
175 |
+
|
176 |
docs_to_add = []
|
177 |
ids_to_add = []
|
178 |
|
|
|
181 |
cleaned_link = clean_text(article["link"])
|
182 |
doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
|
183 |
|
184 |
+
if doc_id in existing_ids:
|
185 |
+
continue
|
186 |
+
|
187 |
metadata = {
|
188 |
"title": article["title"],
|
189 |
"link": article["link"],
|
|
|
195 |
doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
|
196 |
docs_to_add.append(doc)
|
197 |
ids_to_add.append(doc_id)
|
198 |
+
existing_ids.add(doc_id)
|
199 |
|
200 |
if docs_to_add:
|
201 |
try:
|