Update app.py

app.py CHANGED
@@ -365,15 +365,13 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()

-    if news_source
-        articles =
-    elif news_source == "Golomt Bank":
-        articles = fetch_golomt_bank_news()
+    if news_source in website_configs:
+        articles = fetch_news_from_website(news_source)
     else:
-        return "Invalid news source selected
+        return f"Invalid news source selected: {news_source}"

     if not articles:
-        return f"No news articles found for
+        return f"No news articles found for {news_source}."

     processed_articles = []

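For orientation, a hypothetical call into the updated process_news (argument values here are illustrative, not taken from the commit); the function now dispatches on membership in website_configs instead of hard-coded source names:

    # Illustrative only: the query text and sampling values are made up.
    status = process_news("loan products", 0.7, 0.9, 1.1, "Bank of America")
    print(status)  # e.g. "Processed and added 20 news articles from Bank of America to the database."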
@@ -388,7 +386,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             relevance_score = calculate_relevance_score(cleaned_summary, model)
-            print(f"Relevance score for article '{article['title']}': {relevance_score}")  # Debug print

             processed_article = {
                 "published_date": article["published_date"],
@@ -403,11 +400,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
         except Exception as e:
             print(f"Error processing article: {str(e)}")

-    # Debug print
-    print("Processed articles:")
-    for article in processed_articles:
-        print(f"Title: {article['title']}, Score: {article['relevance_score']}")
-
     if not processed_articles:
         return f"Failed to process any news articles from {news_source}. Please try again or check the summarization process."

@@ -430,46 +422,65 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

         # Update news_database for excel export
         global news_database
-        news_database = processed_articles
-
-        print("Updated news_database:")
-        for article in news_database:
-            print(f"Title: {article['title']}, Score: {article['relevance_score']}")
+        news_database = processed_articles

         return f"Processed and added {len(processed_articles)} news articles from {news_source} to the database."
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"

-
-
-
-
-
-
-
-
+website_configs = {
+    "Golomt Bank": {
+        "base_url": "https://golomtbank.com/en/rnews",
+        "article_selector": 'div.entry-post.gt-box-shadow-2',
+        "title_selector": 'h2.entry-title',
+        "date_selector": 'div.entry-date.gt-meta',
+        "link_selector": 'a',
+        "content_selector": 'div.entry-content',
+        "next_page_selector": 'a.next',
+        "url_prefix": "https://golomtbank.com"
+    },
+    "Bank of America": {
+        "base_url": "https://newsroom.bankofamerica.com/content/newsroom/press-releases.html",
+        "article_selector": 'div.views-row',
+        "title_selector": 'span.field-content',
+        "date_selector": 'span.date-display-single',
+        "link_selector": 'a',
+        "content_selector": 'div.field-name-body',
+        "next_page_selector": 'li.pager-next a',
+        "url_prefix": "https://newsroom.bankofamerica.com"
+    },
+    # Add more banks as needed
+}
+
+
+
+def fetch_articles_from_page(url, config):
     response = requests.get(url)
     response.raise_for_status()
     soup = BeautifulSoup(response.content, 'html.parser')
-    articles = soup.find_all('
+    articles = soup.find_all(config['article_selector'].split('.')[0], class_=config['article_selector'].split('.')[-1])
     return articles, soup

-def extract_articles(articles):
+def extract_articles(articles, config):
     article_data = []
     for article in articles:
-        title_div = article.find('
+        title_div = article.find(config['title_selector'].split('.')[0], class_=config['title_selector'].split('.')[-1])
         title = title_div.get_text(strip=True) if title_div else "No Title"
-
+
+        date_div = article.find(config['date_selector'].split('.')[0], class_=config['date_selector'].split('.')[-1])
         date = date_div.get_text(strip=True) if date_div else "No Date"
-
+
+        link_tag = article.find(config['link_selector'])
         link = link_tag['href'] if link_tag else "No Link"
         if not link.startswith('http'):
-            link =
+            link = config['url_prefix'] + link
+
         article_response = requests.get(link)
         article_response.raise_for_status()
         article_soup = BeautifulSoup(article_response.content, 'html.parser')
-        article_content_div = article_soup.find('
+        article_content_div = article_soup.find(config['content_selector'].split('.')[0], class_=config['content_selector'].split('.')[-1])
         article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+
         article_data.append({
             'title': title,
             'date': date,
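A standalone sketch (not part of the commit) of how the new helpers interpret the selector strings in website_configs: each "tag.class" value is split on '.', the first piece becomes the tag name and the last piece the class_ filter, which BeautifulSoup matches against any single class on an element:

    from bs4 import BeautifulSoup

    html = '<div class="entry-post gt-box-shadow-2"><h2 class="entry-title">Example</h2></div>'
    soup = BeautifulSoup(html, 'html.parser')

    selector = 'div.entry-post.gt-box-shadow-2'   # article_selector for Golomt Bank
    tag_name = selector.split('.')[0]             # 'div'
    class_name = selector.split('.')[-1]          # 'gt-box-shadow-2'

    # class_ matches when the class is among the element's classes, so using only
    # the last class of a multi-class selector still finds the article container.
    print(soup.find_all(tag_name, class_=class_name))

Note that next_page_selector values such as 'a.next' and 'li.pager-next a' are passed straight to soup.find(), which treats them as literal tag names rather than CSS selectors; soup.select_one() would be the CSS-selector equivalent if that behaviour is intended.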
@@ -478,30 +489,34 @@ def extract_articles(articles):
         })
     return article_data

-def
-
+def fetch_news_from_website(website_key, num_results=20):
+    config = website_configs.get(website_key)
+    if not config:
+        return f"No configuration found for website: {website_key}"
+
+    base_url = config['base_url']
     current_page_url = base_url
     all_articles = []

     try:
         while len(all_articles) < num_results:
             print(f"Fetching articles from: {current_page_url}")
-            articles, soup = fetch_articles_from_page(current_page_url)
+            articles, soup = fetch_articles_from_page(current_page_url, config)
             if not articles:
                 print("No articles found on this page.")
                 break
-            all_articles.extend(extract_articles(articles))
+            all_articles.extend(extract_articles(articles, config))
             print(f"Total articles fetched so far: {len(all_articles)}")
             if len(all_articles) >= num_results:
                 all_articles = all_articles[:num_results]
                 break
-            next_page_link = soup.find('
+            next_page_link = soup.find(config['next_page_selector'])
             if not next_page_link:
                 print("No next page link found.")
                 break
             current_page_url = next_page_link['href']
             if not current_page_url.startswith('http'):
-                current_page_url =
+                current_page_url = config['url_prefix'] + current_page_url

         return [
             {
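A quick usage sketch (illustrative, not from the commit) of the config-driven fetcher that replaces fetch_golomt_bank_news; any key present in website_configs can be passed:

    # Illustrative only: pull a handful of articles from each configured bank.
    for bank in ("Golomt Bank", "Bank of America"):
        articles = fetch_news_from_website(bank, num_results=5)
        # A list of article dicts on success, an error string if the key has no config.
        print(bank, len(articles) if isinstance(articles, list) else articles)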
@@ -512,7 +527,7 @@ def fetch_golomt_bank_news(num_results=20):
             } for article in all_articles
         ]
     except Exception as e:
-        print(f"Error fetching
+        print(f"Error fetching news from {website_key}: {str(e)}")
         return []

def export_news_to_excel():
@@ -763,18 +778,25 @@ with gr.Blocks() as demo:
     repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
     web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
     google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
-
+
     with gr.Row():
-
-            choices=
-            label="Select
-            value=
+        bank_dropdown = gr.Dropdown(
+            choices=list(website_configs.keys()),
+            label="Select Bank",
+            value=list(website_configs.keys())[0]
         )
-
-    fetch_news_button = gr.Button("Fetch News")
+        fetch_news_button = gr.Button("Fetch Bank News")

     news_fetch_output = gr.Textbox(label="News Fetch Status")

+    submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
+
+    fetch_news_button.click(
+        fetch_bank_news,
+        inputs=[bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider],
+        outputs=news_fetch_output
+    )
+
     def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
         answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
         history.append((question, answer))
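The fetch_news_button.click wiring above references fetch_bank_news, which is not defined anywhere in this diff. A minimal sketch of what such a handler could look like, assuming it simply forwards the selected bank to process_news; the real definition elsewhere in app.py may differ:

    # Hypothetical handler: its signature mirrors the click() inputs list
    # [bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider].
    def fetch_bank_news(bank, temperature, top_p, repetition_penalty):
        # Delegate to process_news with the chosen bank as the news source;
        # the query argument is unused for plain fetching, so pass an empty string.
        return process_news("", temperature, top_p, repetition_penalty, bank)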