Update app.py
Browse files
app.py
CHANGED
|
@@ -20,10 +20,15 @@ from datetime import datetime
|
|
| 20 |
import os
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
import certifi
|
| 23 |
-
from bs4 import BeautifulSoup
|
| 24 |
import requests
|
| 25 |
-
|
| 26 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Load environment variables from a .env file
|
| 29 |
load_dotenv()
|
|
@@ -37,7 +42,7 @@ SEARXNG_URL = 'https://shreyas094-searxng-local.hf.space/search'
|
|
| 37 |
SEARXNG_KEY = 'f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5'
|
| 38 |
|
| 39 |
# Use the environment variable
|
| 40 |
-
HF_TOKEN = os.getenv(
|
| 41 |
client = InferenceClient(
|
| 42 |
"mistralai/Mistral-Nemo-Instruct-2407",
|
| 43 |
token=HF_TOKEN,
|
|
@@ -74,6 +79,51 @@ def is_valid_url(url):
|
|
| 74 |
except ValueError:
|
| 75 |
return False
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
def scrape_with_bs4(url, session, max_chars=None):
|
| 78 |
try:
|
| 79 |
response = session.get(url, timeout=5)
|
|
@@ -248,7 +298,8 @@ Remember to focus on financial aspects and implications in your assessment and s
|
|
| 248 |
response = llm_client.chat_completion(
|
| 249 |
messages=messages,
|
| 250 |
max_tokens=150,
|
| 251 |
-
temperature=temperature
|
|
|
|
| 252 |
)
|
| 253 |
return response.choices[0].message.content.strip()
|
| 254 |
except Exception as e:
|
|
@@ -272,8 +323,15 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
|
| 272 |
content = main_content.get_text(strip=True, separator='\n')
|
| 273 |
else:
|
| 274 |
content = soup.get_text(strip=True, separator='\n')
|
| 275 |
-
|
| 276 |
content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# Limit the content to max_chars
|
| 279 |
return content[:max_chars] if content else ""
|
|
@@ -314,7 +372,10 @@ Your response should be detailed, informative, accurate, and directly relevant t
|
|
| 314 |
response = llm_client.chat_completion(
|
| 315 |
messages=messages,
|
| 316 |
max_tokens=10000,
|
| 317 |
-
temperature=temperature
|
|
|
|
|
|
|
|
|
|
| 318 |
)
|
| 319 |
return response.choices[0].message.content.strip()
|
| 320 |
except Exception as e:
|
|
@@ -408,51 +469,17 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
| 408 |
try:
|
| 409 |
logger.info(f"Scraping content from: {url}")
|
| 410 |
|
| 411 |
-
#
|
| 412 |
-
|
| 413 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 414 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
| 415 |
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 416 |
-
]
|
| 417 |
-
|
| 418 |
-
content = ""
|
| 419 |
-
for ua in user_agents:
|
| 420 |
-
try:
|
| 421 |
-
if scraper == "bs4":
|
| 422 |
-
session.headers.update({'User-Agent': ua})
|
| 423 |
-
content = scrape_with_bs4(url, session, max_chars)
|
| 424 |
-
else: # trafilatura
|
| 425 |
-
# Use urllib to handle custom headers for trafilatura
|
| 426 |
-
req = Request(url, headers={'User-Agent': ua})
|
| 427 |
-
with urlopen(req) as response:
|
| 428 |
-
downloaded = response.read()
|
| 429 |
-
|
| 430 |
-
# Configure trafilatura to use a specific user agent
|
| 431 |
-
config = use_config()
|
| 432 |
-
config.set("DEFAULT", "USER_AGENT", ua)
|
| 433 |
-
|
| 434 |
-
content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
|
| 435 |
-
|
| 436 |
-
if content:
|
| 437 |
-
break
|
| 438 |
-
except requests.exceptions.HTTPError as e:
|
| 439 |
-
if e.response.status_code == 403:
|
| 440 |
-
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
| 441 |
-
continue
|
| 442 |
-
else:
|
| 443 |
-
raise
|
| 444 |
-
except Exception as e:
|
| 445 |
-
logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
|
| 446 |
-
continue
|
| 447 |
|
| 448 |
if not content:
|
| 449 |
-
logger.warning(f"Failed to scrape content from {url}
|
| 450 |
continue
|
| 451 |
|
| 452 |
scraped_content.append({
|
| 453 |
"title": title,
|
| 454 |
"url": url,
|
| 455 |
-
"content": content,
|
| 456 |
"scraper": scraper
|
| 457 |
})
|
| 458 |
logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
|
|
@@ -558,7 +585,7 @@ iface = gr.ChatInterface(
|
|
| 558 |
description="Enter your query, and I'll search the web for the most recent and relevant financial news, scrape content, and provide summarized results.",
|
| 559 |
additional_inputs=[
|
| 560 |
gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
|
| 561 |
-
gr.Dropdown(["bs4", "trafilatura"], value="bs4", label="Scraping Method"),
|
| 562 |
gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
|
| 563 |
gr.Dropdown(["", "day", "week", "month", "year"], value="year", label="Time Range"),
|
| 564 |
gr.Dropdown(["all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="en", label="Language"),
|
|
|
|
| 20 |
import os
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
import certifi
|
|
|
|
| 23 |
import requests
|
| 24 |
+
import scrapy
|
| 25 |
+
from scrapy.crawler import CrawlerProcess
|
| 26 |
+
from scrapy import signals
|
| 27 |
+
from scrapy.signalmanager import dispatcher
|
| 28 |
+
from scrapy.utils.log import configure_logging
|
| 29 |
+
from newspaper import Article
|
| 30 |
+
|
| 31 |
+
|
| 32 |
|
| 33 |
# Load environment variables from a .env file
|
| 34 |
load_dotenv()
|
|
|
|
| 42 |
SEARXNG_KEY = 'f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5'
|
| 43 |
|
| 44 |
# Use the environment variable
|
| 45 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 46 |
client = InferenceClient(
|
| 47 |
"mistralai/Mistral-Nemo-Instruct-2407",
|
| 48 |
token=HF_TOKEN,
|
|
|
|
| 79 |
except ValueError:
|
| 80 |
return False
|
| 81 |
|
| 82 |
+
class NewsSpider(scrapy.Spider):
    """Single-page spider that collects the paragraph text of one URL.

    The page to fetch is passed as the ``url`` spider argument; when it is
    omitted (or falsy) the spider is created with an empty ``start_urls``.
    """

    name = 'news_spider'

    def __init__(self, url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if url:
            self.start_urls = [url]
        else:
            self.start_urls = []

    def parse(self, response):
        """Return one item whose ``content`` is the page's ``<p>`` text joined by spaces."""
        # NOTE: ``p::text`` selects only text nodes that are direct children
        # of a <p>; text wrapped in nested inline tags is not included.
        paragraph_texts = response.css('p::text').getall()
        joined_text = ' '.join(paragraph_texts)
        self.logger.info(f"Scraped content length: {len(joined_text)}")
        return {'content': joined_text}
|
| 93 |
+
|
| 94 |
+
def _scrapy_crawl_worker(url, timeout, conn):
    """Child-process entry point: crawl ``url`` once and send the text over ``conn``.

    Exactly one string is sent before the connection is closed: the scraped
    content, or ``''`` when nothing was scraped or the crawl raised.
    """
    configure_logging(install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)

    results = []

    def spider_results(signal, sender, item, response, spider):
        results.append(item)

    content = ''
    try:
        process = CrawlerProcess(settings={
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'WARNING',
            'DOWNLOAD_TIMEOUT': timeout,
        })

        dispatcher.connect(spider_results, signal=signals.item_scraped)

        process.crawl(NewsSpider, url=url)
        # Runs the Twisted reactor until the crawl is finished. A reactor can
        # only be run once per process, which is why this lives in a child.
        process.start()

        if results:
            content = results[0]['content']
    except Exception:
        logger.exception(f"Scrapy crawl failed for {url}")
    finally:
        conn.send(content)
        conn.close()


def scrape_with_scrapy(url, timeout=30):
    """Scrape the paragraph text of ``url`` with Scrapy and return it as a string.

    Each call runs the crawl in a fresh child process. ``CrawlerProcess.start()``
    runs the Twisted reactor, which cannot be restarted once it has stopped, so
    calling it directly would only work for the first URL scraped by this
    process and fail on every later one.

    Args:
        url: Page to fetch.
        timeout: Value passed to Scrapy's ``DOWNLOAD_TIMEOUT`` setting (seconds).

    Returns:
        The scraped text, or ``''`` if nothing was scraped or the crawl failed.

    NOTE(review): with the ``spawn`` start method the child re-imports this
    module; confirm the module's top level has no side effects (such as
    launching the UI) outside a ``__main__`` guard.
    """
    import multiprocessing  # local import keeps this block self-contained

    logger.info(f"Starting to scrape with Scrapy: {url}")

    receiver, sender = multiprocessing.Pipe(duplex=False)
    worker = multiprocessing.Process(
        target=_scrapy_crawl_worker,
        args=(url, timeout, sender),
        daemon=True,
    )
    worker.start()
    # Drop the parent's copy of the write end so that recv() raises EOFError
    # (instead of blocking forever) if the child dies without sending.
    sender.close()

    try:
        return receiver.recv() or ''
    except EOFError:
        logger.warning(f"Scrapy worker exited without returning content for {url}")
        return ''
    finally:
        worker.join()
        receiver.close()
|
| 119 |
+
|
| 120 |
+
def scrape_with_newspaper(url):
    """Download and parse ``url`` with Newspaper3k's ``Article`` and return its text.

    No exceptions are handled here; anything raised while downloading or
    parsing propagates to the caller.
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")

    news_article = Article(url)
    news_article.download()
    news_article.parse()

    extracted_text = news_article.text
    return extracted_text
|
| 126 |
+
|
| 127 |
def scrape_with_bs4(url, session, max_chars=None):
|
| 128 |
try:
|
| 129 |
response = session.get(url, timeout=5)
|
|
|
|
| 298 |
response = llm_client.chat_completion(
|
| 299 |
messages=messages,
|
| 300 |
max_tokens=150,
|
| 301 |
+
temperature=temperature,
|
| 302 |
+
top_p=0.9
|
| 303 |
)
|
| 304 |
return response.choices[0].message.content.strip()
|
| 305 |
except Exception as e:
|
|
|
|
| 323 |
content = main_content.get_text(strip=True, separator='\n')
|
| 324 |
else:
|
| 325 |
content = soup.get_text(strip=True, separator='\n')
|
| 326 |
+
elif scraper == "trafilatura":
|
| 327 |
content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
|
| 328 |
+
elif scraper == "scrapy":
|
| 329 |
+
content = scrape_with_scrapy(url, timeout)
|
| 330 |
+
elif scraper == "newspaper":
|
| 331 |
+
content = scrape_with_newspaper(url)
|
| 332 |
+
else:
|
| 333 |
+
logger.error(f"Unknown scraper: {scraper}")
|
| 334 |
+
return ""
|
| 335 |
|
| 336 |
# Limit the content to max_chars
|
| 337 |
return content[:max_chars] if content else ""
|
|
|
|
| 372 |
response = llm_client.chat_completion(
|
| 373 |
messages=messages,
|
| 374 |
max_tokens=10000,
|
| 375 |
+
temperature=temperature,
|
| 376 |
+
frequency_penalty=1.1,
|
| 377 |
+
top_p=0.9,
|
| 378 |
+
stream=True
|
| 379 |
)
|
| 380 |
return response.choices[0].message.content.strip()
|
| 381 |
except Exception as e:
|
|
|
|
| 469 |
try:
|
| 470 |
logger.info(f"Scraping content from: {url}")
|
| 471 |
|
| 472 |
+
# Scrape with the single user-selected method; the per-User-Agent retry loop was removed.
|
| 473 |
+
content = scrape_full_content(url, scraper, max_chars, timeout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
if not content:
|
| 476 |
+
logger.warning(f"Failed to scrape content from {url}")
|
| 477 |
continue
|
| 478 |
|
| 479 |
scraped_content.append({
|
| 480 |
"title": title,
|
| 481 |
"url": url,
|
| 482 |
+
"content": content,
|
| 483 |
"scraper": scraper
|
| 484 |
})
|
| 485 |
logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
|
|
|
|
| 585 |
description="Enter your query, and I'll search the web for the most recent and relevant financial news, scrape content, and provide summarized results.",
|
| 586 |
additional_inputs=[
|
| 587 |
gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
|
| 588 |
+
gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="bs4", label="Scraping Method"),
|
| 589 |
gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
|
| 590 |
gr.Dropdown(["", "day", "week", "month", "year"], value="year", label="Time Range"),
|
| 591 |
gr.Dropdown(["all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="en", label="Language"),
|