Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -263,13 +263,18 @@ def summarize_news_content(content, model):
|
|
| 263 |
full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
|
| 264 |
|
| 265 |
# Extract only the summary part
|
| 266 |
-
summary_parts = full_response.split("
|
| 267 |
if len(summary_parts) > 1:
|
| 268 |
summary = summary_parts[-1].strip()
|
| 269 |
else:
|
| 270 |
summary = full_response.strip()
|
| 271 |
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def process_google_news_rss(query, temperature, top_p, repetition_penalty):
|
| 275 |
model = get_model(temperature, top_p, repetition_penalty)
|
|
@@ -285,22 +290,29 @@ def process_google_news_rss(query, temperature, top_p, repetition_penalty):
|
|
| 285 |
try:
|
| 286 |
# Remove HTML tags from content
|
| 287 |
clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
processed_article = {
|
| 290 |
"published_date": article["published_date"],
|
| 291 |
"title": article["title"],
|
| 292 |
"url": article["url"],
|
| 293 |
"content": clean_content,
|
| 294 |
-
"summary":
|
|
|
|
| 295 |
}
|
| 296 |
processed_articles.append(processed_article)
|
| 297 |
except Exception as e:
|
| 298 |
-
print(f"Error processing article: {str(e)}")
|
|
|
|
| 299 |
if not processed_articles:
|
| 300 |
return "Failed to process any news articles. Please try a different query or check the summarization process."
|
| 301 |
|
| 302 |
# Add processed articles to the database
|
| 303 |
-
docs = [Document(page_content=article["
|
| 304 |
"source": article["url"],
|
| 305 |
"title": article["title"],
|
| 306 |
"published_date": article["published_date"]
|
|
@@ -327,6 +339,10 @@ def export_news_to_excel():
|
|
| 327 |
global news_database
|
| 328 |
df = pd.DataFrame(news_database)
|
| 329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
|
| 331 |
excel_path = tmp.name
|
| 332 |
df.to_excel(excel_path, index=False)
|
|
|
|
| 263 |
full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
|
| 264 |
|
| 265 |
# Extract only the summary part
|
| 266 |
+
summary_parts = full_response.split("Summary:")
|
| 267 |
if len(summary_parts) > 1:
|
| 268 |
summary = summary_parts[-1].strip()
|
| 269 |
else:
|
| 270 |
summary = full_response.strip()
|
| 271 |
|
| 272 |
+
# Create a cleaned version of the summary
|
| 273 |
+
lines = summary.split('\n')
|
| 274 |
+
cleaned_lines = [line for line in lines if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
|
| 275 |
+
cleaned_summary = ' '.join(cleaned_lines).strip()
|
| 276 |
+
|
| 277 |
+
return summary, cleaned_summary
|
| 278 |
|
| 279 |
def process_google_news_rss(query, temperature, top_p, repetition_penalty):
|
| 280 |
model = get_model(temperature, top_p, repetition_penalty)
|
|
|
|
| 290 |
try:
|
| 291 |
# Remove HTML tags from content
|
| 292 |
clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
|
| 293 |
+
|
| 294 |
+
# If content is very short, use the title as content
|
| 295 |
+
if len(clean_content) < 50:
|
| 296 |
+
clean_content = article["title"]
|
| 297 |
+
|
| 298 |
+
full_summary, cleaned_summary = summarize_news_content(clean_content, model)
|
| 299 |
processed_article = {
|
| 300 |
"published_date": article["published_date"],
|
| 301 |
"title": article["title"],
|
| 302 |
"url": article["url"],
|
| 303 |
"content": clean_content,
|
| 304 |
+
"summary": full_summary,
|
| 305 |
+
"cleaned_summary": cleaned_summary
|
| 306 |
}
|
| 307 |
processed_articles.append(processed_article)
|
| 308 |
except Exception as e:
|
| 309 |
+
print(f"Error processing article: {str(e)}")
|
| 310 |
+
|
| 311 |
if not processed_articles:
|
| 312 |
return "Failed to process any news articles. Please try a different query or check the summarization process."
|
| 313 |
|
| 314 |
# Add processed articles to the database
|
| 315 |
+
docs = [Document(page_content=article["cleaned_summary"], metadata={
|
| 316 |
"source": article["url"],
|
| 317 |
"title": article["title"],
|
| 318 |
"published_date": article["published_date"]
|
|
|
|
| 339 |
global news_database
|
| 340 |
df = pd.DataFrame(news_database)
|
| 341 |
|
| 342 |
+
# Use the cleaned summary for the Excel export
|
| 343 |
+
df['summary'] = df['cleaned_summary']
|
| 344 |
+
df = df.drop(columns=['cleaned_summary']) # Remove the extra column
|
| 345 |
+
|
| 346 |
with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
|
| 347 |
excel_path = tmp.name
|
| 348 |
df.to_excel(excel_path, index=False)
|