Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ import urllib.parse
|
|
| 10 |
from tempfile import NamedTemporaryFile
|
| 11 |
from typing import List
|
| 12 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 13 |
from langchain_core.prompts import ChatPromptTemplate
|
| 14 |
from langchain_community.vectorstores import FAISS
|
| 15 |
from langchain_community.document_loaders import PyPDFLoader
|
|
@@ -22,6 +24,7 @@ from langchain_core.documents import Document
|
|
| 22 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 23 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 24 |
|
|
|
|
| 25 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
| 26 |
|
| 27 |
# Memory database to store question-answer pairs
|
|
@@ -302,13 +305,16 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
|
|
| 302 |
clean_content = article["title"]
|
| 303 |
|
| 304 |
full_summary, cleaned_summary = summarize_news_content(clean_content, model)
|
|
|
|
|
|
|
| 305 |
processed_article = {
|
| 306 |
"published_date": article["published_date"],
|
| 307 |
"title": article["title"],
|
| 308 |
"url": article["url"],
|
| 309 |
"content": clean_content,
|
| 310 |
"summary": full_summary,
|
| 311 |
-
"cleaned_summary": cleaned_summary
|
|
|
|
| 312 |
}
|
| 313 |
processed_articles.append(processed_article)
|
| 314 |
except Exception as e:
|
|
@@ -321,7 +327,8 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
|
|
| 321 |
docs = [Document(page_content=article["cleaned_summary"], metadata={
|
| 322 |
"source": article["url"],
|
| 323 |
"title": article["title"],
|
| 324 |
-
"published_date": article["published_date"]
|
|
|
|
| 325 |
}) for article in processed_articles]
|
| 326 |
|
| 327 |
try:
|
|
@@ -341,7 +348,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
|
|
| 341 |
except Exception as e:
|
| 342 |
return f"Error adding articles to the database: {str(e)}"
|
| 343 |
|
| 344 |
-
|
| 345 |
def fetch_articles_from_page(url):
|
| 346 |
response = requests.get(url)
|
| 347 |
response.raise_for_status()
|
|
@@ -449,12 +455,45 @@ def export_news_to_excel():
|
|
| 449 |
df['summary'] = df['cleaned_summary']
|
| 450 |
df = df.drop(columns=['cleaned_summary']) # Remove the extra column
|
| 451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
|
| 453 |
excel_path = tmp.name
|
| 454 |
df.to_excel(excel_path, index=False)
|
| 455 |
|
| 456 |
return excel_path
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
|
| 459 |
global conversation_history
|
| 460 |
|
|
|
|
| 10 |
from tempfile import NamedTemporaryFile
|
| 11 |
from typing import List
|
| 12 |
from bs4 import BeautifulSoup
|
| 13 |
+
from langchain.prompts import PromptTemplate
|
| 14 |
+
from langchain.chains import LLMChain
|
| 15 |
from langchain_core.prompts import ChatPromptTemplate
|
| 16 |
from langchain_community.vectorstores import FAISS
|
| 17 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
| 24 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 25 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 26 |
|
| 27 |
+
|
| 28 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
| 29 |
|
| 30 |
# Memory database to store question-answer pairs
|
|
|
|
| 305 |
clean_content = article["title"]
|
| 306 |
|
| 307 |
full_summary, cleaned_summary = summarize_news_content(clean_content, model)
|
| 308 |
+
relevance_score = calculate_relevance_score(cleaned_summary, model)
|
| 309 |
+
|
| 310 |
processed_article = {
|
| 311 |
"published_date": article["published_date"],
|
| 312 |
"title": article["title"],
|
| 313 |
"url": article["url"],
|
| 314 |
"content": clean_content,
|
| 315 |
"summary": full_summary,
|
| 316 |
+
"cleaned_summary": cleaned_summary,
|
| 317 |
+
"relevance_score": relevance_score
|
| 318 |
}
|
| 319 |
processed_articles.append(processed_article)
|
| 320 |
except Exception as e:
|
|
|
|
| 327 |
docs = [Document(page_content=article["cleaned_summary"], metadata={
|
| 328 |
"source": article["url"],
|
| 329 |
"title": article["title"],
|
| 330 |
+
"published_date": article["published_date"],
|
| 331 |
+
"relevance_score": article["relevance_score"]
|
| 332 |
}) for article in processed_articles]
|
| 333 |
|
| 334 |
try:
|
|
|
|
| 348 |
except Exception as e:
|
| 349 |
return f"Error adding articles to the database: {str(e)}"
|
| 350 |
|
|
|
|
| 351 |
def fetch_articles_from_page(url):
|
| 352 |
response = requests.get(url)
|
| 353 |
response.raise_for_status()
|
|
|
|
| 455 |
df['summary'] = df['cleaned_summary']
|
| 456 |
df = df.drop(columns=['cleaned_summary']) # Remove the extra column
|
| 457 |
|
| 458 |
+
# Reorder columns to put relevance_score after summary
|
| 459 |
+
columns = ['published_date', 'title', 'url', 'content', 'summary', 'relevance_score']
|
| 460 |
+
df = df[columns]
|
| 461 |
+
|
| 462 |
with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
|
| 463 |
excel_path = tmp.name
|
| 464 |
df.to_excel(excel_path, index=False)
|
| 465 |
|
| 466 |
return excel_path
|
| 467 |
|
| 468 |
+
def calculate_relevance_score(summary, model):
    """Ask the LLM to rate the financial relevance of a news summary.

    Args:
        summary: Cleaned news-summary text to be scored.
        model: LangChain-compatible LLM used to generate the score.

    Returns:
        float: Relevance score clamped to [0.00, 1.00]; 0.00 when no
        numeric value can be extracted from the model's response.
    """
    import re  # local import: only needed for tolerant response parsing

    prompt_template = PromptTemplate(
        input_variables=["summary"],
        template="""You are a financial analyst tasked with providing a relevance score to news summaries.
The score should be based on the financial significance and impact of the news.
Use the following scoring guide:
- 0.00-0.20: Not relevant to finance or economics
- 0.21-0.40: Slightly relevant, but minimal financial impact
- 0.41-0.60: Moderately relevant, some financial implications
- 0.61-0.80: Highly relevant, significant financial impact
- 0.81-1.00: Extremely relevant, major financial implications

Provide a score between 0.00 and 1.00, where 0.00 is not relevant at all, and 1.00 is extremely relevant from a financial perspective.

Summary: {summary}

Relevance Score:"""
    )

    chain = LLMChain(llm=model, prompt=prompt_template)
    response = chain.run(summary=summary)

    # LLMs frequently wrap the number in extra prose (e.g. "Score: 0.75 -
    # highly relevant"), so requiring the whole response to be a bare float
    # (the previous float(response.strip())) silently zeroed valid scores.
    # Extract the first numeric token instead.
    match = re.search(r"\d+(?:\.\d+)?", response)
    if match is None:
        print(f"Error parsing relevance score: {response}")
        return 0.00
    score = float(match.group())
    return min(max(score, 0.00), 1.00)  # Ensure the score is between 0.00 and 1.00
|
| 496 |
+
|
| 497 |
def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
|
| 498 |
global conversation_history
|
| 499 |
|