Update app.py

app.py CHANGED
@@ -365,15 +365,13 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()

-    if news_source
-        articles =
-    elif news_source == "Golomt Bank":
-        articles = fetch_golomt_bank_news()
+    if news_source in website_configs:
+        articles = fetch_news_from_website(news_source)
     else:
-        return "Invalid news source selected
+        return f"Invalid news source selected: {news_source}"

     if not articles:
-        return f"No news articles found for
+        return f"No news articles found for {news_source}."

     processed_articles = []

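For orientation, a hypothetical call into the updated process_news (argument values here are illustrative, not taken from the commit); the function now dispatches on membership in website_configs instead of hard-coded source names:

    # Illustrative only: the query text and sampling values are made up.
    status = process_news("loan products", 0.7, 0.9, 1.1, "Bank of America")
    print(status)  # e.g. "Processed and added 20 news articles from Bank of America to the database."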
@@ -388,7 +386,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             relevance_score = calculate_relevance_score(cleaned_summary, model)
-            print(f"Relevance score for article '{article['title']}': {relevance_score}")  # Debug print

             processed_article = {
                 "published_date": article["published_date"],
@@ -403,11 +400,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
         except Exception as e:
             print(f"Error processing article: {str(e)}")

-    # Debug print
-    print("Processed articles:")
-    for article in processed_articles:
-        print(f"Title: {article['title']}, Score: {article['relevance_score']}")
-
     if not processed_articles:
         return f"Failed to process any news articles from {news_source}. Please try again or check the summarization process."

@@ -430,46 +422,65 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

         # Update news_database for excel export
         global news_database
-        news_database = processed_articles
-
-        print("Updated news_database:")
-        for article in news_database:
-            print(f"Title: {article['title']}, Score: {article['relevance_score']}")
+        news_database = processed_articles

         return f"Processed and added {len(processed_articles)} news articles from {news_source} to the database."
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"

-
-
-
-
-
-
-
-
+website_configs = {
+    "Golomt Bank": {
+        "base_url": "https://golomtbank.com/en/rnews",
+        "article_selector": 'div.entry-post.gt-box-shadow-2',
+        "title_selector": 'h2.entry-title',
+        "date_selector": 'div.entry-date.gt-meta',
+        "link_selector": 'a',
+        "content_selector": 'div.entry-content',
+        "next_page_selector": 'a.next',
+        "url_prefix": "https://golomtbank.com"
+    },
+    "Bank of America": {
+        "base_url": "https://newsroom.bankofamerica.com/content/newsroom/press-releases.html",
+        "article_selector": 'div.views-row',
+        "title_selector": 'span.field-content',
+        "date_selector": 'span.date-display-single',
+        "link_selector": 'a',
+        "content_selector": 'div.field-name-body',
+        "next_page_selector": 'li.pager-next a',
+        "url_prefix": "https://newsroom.bankofamerica.com"
+    },
+    # Add more banks as needed
+}
+
+
+
+def fetch_articles_from_page(url, config):
     response = requests.get(url)
     response.raise_for_status()
     soup = BeautifulSoup(response.content, 'html.parser')
-    articles = soup.find_all('
+    articles = soup.find_all(config['article_selector'].split('.')[0], class_=config['article_selector'].split('.')[-1])
     return articles, soup

-def extract_articles(articles):
+def extract_articles(articles, config):
     article_data = []
     for article in articles:
-        title_div = article.find('
+        title_div = article.find(config['title_selector'].split('.')[0], class_=config['title_selector'].split('.')[-1])
         title = title_div.get_text(strip=True) if title_div else "No Title"
-
+
+        date_div = article.find(config['date_selector'].split('.')[0], class_=config['date_selector'].split('.')[-1])
         date = date_div.get_text(strip=True) if date_div else "No Date"
-
+
+        link_tag = article.find(config['link_selector'])
         link = link_tag['href'] if link_tag else "No Link"
         if not link.startswith('http'):
-            link =
+            link = config['url_prefix'] + link
+
         article_response = requests.get(link)
         article_response.raise_for_status()
         article_soup = BeautifulSoup(article_response.content, 'html.parser')
-        article_content_div = article_soup.find('
+        article_content_div = article_soup.find(config['content_selector'].split('.')[0], class_=config['content_selector'].split('.')[-1])
         article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+
         article_data.append({
             'title': title,
             'date': date,
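A standalone sketch (not part of the commit) of how the new helpers interpret the selector strings in website_configs: each "tag.class" value is split on '.', the first piece becomes the tag name and the last piece the class_ filter, which BeautifulSoup matches against any single class on an element:

    from bs4 import BeautifulSoup

    html = '<div class="entry-post gt-box-shadow-2"><h2 class="entry-title">Example</h2></div>'
    soup = BeautifulSoup(html, 'html.parser')

    selector = 'div.entry-post.gt-box-shadow-2'   # article_selector for Golomt Bank
    tag_name = selector.split('.')[0]             # 'div'
    class_name = selector.split('.')[-1]          # 'gt-box-shadow-2'

    # class_ matches when the class is among the element's classes, so using only
    # the last class of a multi-class selector still finds the article container.
    print(soup.find_all(tag_name, class_=class_name))

Note that next_page_selector values such as 'a.next' and 'li.pager-next a' are passed straight to soup.find(), which treats them as literal tag names rather than CSS selectors; soup.select_one() would be the CSS-selector equivalent if that behaviour is intended.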
@@ -478,30 +489,34 @@ def extract_articles(articles):
         })
     return article_data

-def
-
+def fetch_news_from_website(website_key, num_results=20):
+    config = website_configs.get(website_key)
+    if not config:
+        return f"No configuration found for website: {website_key}"
+
+    base_url = config['base_url']
     current_page_url = base_url
     all_articles = []

     try:
         while len(all_articles) < num_results:
             print(f"Fetching articles from: {current_page_url}")
-            articles, soup = fetch_articles_from_page(current_page_url)
+            articles, soup = fetch_articles_from_page(current_page_url, config)
             if not articles:
                 print("No articles found on this page.")
                 break
-            all_articles.extend(extract_articles(articles))
+            all_articles.extend(extract_articles(articles, config))
             print(f"Total articles fetched so far: {len(all_articles)}")
             if len(all_articles) >= num_results:
                 all_articles = all_articles[:num_results]
                 break
-            next_page_link = soup.find('
+            next_page_link = soup.find(config['next_page_selector'])
             if not next_page_link:
                 print("No next page link found.")
                 break
             current_page_url = next_page_link['href']
             if not current_page_url.startswith('http'):
-                current_page_url =
+                current_page_url = config['url_prefix'] + current_page_url

         return [
             {
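A quick usage sketch (illustrative, not from the commit) of the config-driven fetcher that replaces fetch_golomt_bank_news; any key present in website_configs can be passed:

    # Illustrative only: pull a handful of articles from each configured bank.
    for bank in ("Golomt Bank", "Bank of America"):
        articles = fetch_news_from_website(bank, num_results=5)
        # A list of article dicts on success, an error string if the key has no config.
        print(bank, len(articles) if isinstance(articles, list) else articles)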
@@ -512,7 +527,7 @@ def fetch_golomt_bank_news(num_results=20):
             } for article in all_articles
         ]
     except Exception as e:
-        print(f"Error fetching
+        print(f"Error fetching news from {website_key}: {str(e)}")
         return []

def export_news_to_excel():
@@ -763,18 +778,25 @@ with gr.Blocks() as demo:
     repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
     web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
     google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
-
+
     with gr.Row():
-
-            choices=
-            label="Select
-            value=
+        bank_dropdown = gr.Dropdown(
+            choices=list(website_configs.keys()),
+            label="Select Bank",
+            value=list(website_configs.keys())[0]
         )
-
-    fetch_news_button = gr.Button("Fetch News")
+        fetch_news_button = gr.Button("Fetch Bank News")

     news_fetch_output = gr.Textbox(label="News Fetch Status")

+    submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
+
+    fetch_news_button.click(
+        fetch_bank_news,
+        inputs=[bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider],
+        outputs=news_fetch_output
+    )
+
     def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
         answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
         history.append((question, answer))
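The fetch_news_button.click wiring above references fetch_bank_news, which is not defined anywhere in this diff. A minimal sketch of what such a handler could look like, assuming it simply forwards the selected bank to process_news; the real definition elsewhere in app.py may differ:

    # Hypothetical handler: its signature mirrors the click() inputs list
    # [bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider].
    def fetch_bank_news(bank, temperature, top_p, repetition_penalty):
        # Delegate to process_news with the chosen bank as the news source;
        # the query argument is unused for plain fetching, so pass an empty string.
        return process_news("", temperature, top_p, repetition_penalty, bank)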