Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +59 -34
rss_processor.py
CHANGED
@@ -32,11 +32,16 @@ def get_embedding_model():
|
|
32 |
return get_embedding_model.model
|
33 |
|
34 |
# Initialize Chroma database globally
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def clean_text(text):
|
42 |
if not text or not isinstance(text, str):
|
@@ -111,39 +116,60 @@ def fetch_rss_feeds():
|
|
111 |
return articles
|
112 |
|
113 |
def process_and_store_articles(articles):
|
|
|
|
|
|
|
|
|
114 |
try:
|
115 |
existing_ids = set(vector_db.get(include=[])["ids"])
|
116 |
-
|
|
|
|
|
117 |
existing_ids = set()
|
118 |
|
119 |
docs_to_add = []
|
120 |
ids_to_add = []
|
121 |
|
122 |
for article in articles:
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
"
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
if docs_to_add:
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
def download_from_hf_hub():
|
149 |
if not os.path.exists(LOCAL_DB_DIR):
|
@@ -185,13 +211,12 @@ def upload_to_hf_hub():
|
|
185 |
logger.error(f"Error uploading to Hugging Face Hub: {e}")
|
186 |
|
187 |
if __name__ == "__main__":
|
188 |
-
|
|
|
189 |
if not os.path.exists(FEEDS_FILE):
|
190 |
logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
|
191 |
exit(1)
|
192 |
articles = fetch_rss_feeds()
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
process_and_store_articles(articles)
|
197 |
-
upload_to_hf_hub()
|
|
|
32 |
return get_embedding_model.model
|
33 |
|
34 |
# Initialize Chroma database globally
|
35 |
+
# The vector store is created at import time so every function in this module
# can share the single `vector_db` handle.
# NOTE(review): the `__main__` block calls download_from_hf_hub() *after* this
# runs, and that function starts with `if not os.path.exists(LOCAL_DB_DIR)`.
# If Chroma creates LOCAL_DB_DIR during initialization, the download may be
# skipped on a fresh machine -- confirm the intended ordering.
try:
    vector_db = Chroma(
        persist_directory=LOCAL_DB_DIR,
        embedding_function=get_embedding_model(),
        collection_name=COLLECTION_NAME,
    )
except Exception as e:
    logger.error(f"Failed to initialize Chroma database: {e}")
    # Raise SystemExit directly: the `exit()` helper is injected by the `site`
    # module for interactive use and is not guaranteed to exist.
    raise SystemExit(1)
else:
    logger.info("Chroma database initialized successfully")
|
45 |
|
46 |
def clean_text(text):
|
47 |
if not text or not isinstance(text, str):
|
|
|
116 |
return articles
|
117 |
|
118 |
def process_and_store_articles(articles):
    """Store new articles in the shared Chroma collection.

    Each article is expected to be a mapping with the keys ``title``,
    ``link``, ``description``, ``published``, ``category`` and ``image``.
    An article is skipped when its cleaned description is empty or when its
    ID (``"<cleaned title>|<cleaned link>|<published>"``) is already present
    in the collection or was already queued earlier in the same call.

    Failures are logged rather than raised: a bad article never aborts the
    batch, and a failure while writing to Chroma is reported via the logger.

    Args:
        articles: Iterable of article mappings; an empty/None value is a no-op.

    Returns:
        None.
    """
    if not articles:
        logger.warning("No articles to process")
        return

    # Load the IDs already stored so duplicates can be filtered locally.
    # If the lookup fails we fall back to an empty set (best effort).
    try:
        existing_ids = set(vector_db.get(include=[])["ids"])
        logger.info(f"Existing documents in DB: {len(existing_ids)}")
    except Exception as e:
        logger.error(f"Error retrieving existing IDs: {e}")
        existing_ids = set()

    docs_to_add = []
    ids_to_add = []

    for article in articles:
        try:
            cleaned_title = clean_text(article["title"])
            cleaned_link = clean_text(article["link"])
            cleaned_description = clean_text(article["description"])
            if not cleaned_description:
                logger.warning(f"Skipping article with empty description: {cleaned_title}")
                continue

            doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
            if doc_id in existing_ids:
                logger.debug(f"Skipping duplicate article: {doc_id}")
                continue

            # NOTE(review): values are stored as-is; presumably the feed
            # fetcher never yields None here -- verify, since the vector
            # store may reject None metadata values.
            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            docs_to_add.append(Document(page_content=cleaned_description, metadata=metadata))
            ids_to_add.append(doc_id)
            # Remember the ID so a repeat within this same batch is skipped too.
            existing_ids.add(doc_id)
            logger.debug(f"Prepared document for article: {cleaned_title}")
        except Exception as e:
            # The entry may not be a mapping at all; looking up the title must
            # not raise from inside the handler and abort the whole batch.
            title = article.get("title", "Unknown") if isinstance(article, dict) else "Unknown"
            logger.error(f"Error processing article {title}: {e}")

    if not docs_to_add:
        logger.warning("No new documents to add to the database")
        return

    try:
        vector_db.add_documents(documents=docs_to_add, ids=ids_to_add)
        # Newer Chroma wrappers persist automatically and no longer expose
        # persist(); only call it when the installed version provides it.
        persist = getattr(vector_db, "persist", None)
        if callable(persist):
            persist()
        logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {vector_db._collection.count()}")
    except Exception as e:
        logger.error(f"Error adding documents to Chroma: {e}")
|
173 |
|
174 |
def download_from_hf_hub():
|
175 |
if not os.path.exists(LOCAL_DB_DIR):
|
|
|
211 |
logger.error(f"Error uploading to Hugging Face Hub: {e}")
|
212 |
|
213 |
# Script entry point: restore the local DB, ingest the feeds, publish the DB.
if __name__ == "__main__":
    logger.info("Starting script execution")
    download_from_hf_hub()

    # Without the feeds file there is nothing to fetch, so stop early.
    if not os.path.exists(FEEDS_FILE):
        logger.error(f"Missing {FEEDS_FILE}. Please create it with RSS feed URLs.")
        # Raise SystemExit directly: the `exit()` helper is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    articles = fetch_rss_feeds()
    process_and_store_articles(articles)
    upload_to_hf_hub()
    logger.info("Script execution completed")
|
|
|
|