import requests
from bs4 import BeautifulSoup

# NewsAPI key
NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"

def extract_news(company, num_articles=2):
    """Fetch multiple news articles from NewsAPI and return their titles and content."""
    url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
    response = requests.get(url, timeout=5)
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()
    articles = data.get("articles", [])
    if not articles:
        print("No articles found.")
        return []

    extracted_articles = []
    for article in articles[:num_articles]:  # Process only the required number of articles
        article_url = article.get("url")
        if not article_url:
            continue

        # Scrape the article page for its title and body text
        try:
            article_response = requests.get(article_url, timeout=5)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article: {e}")
            continue

        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"

            # Extract paragraphs and join their cleaned text
            paragraphs = soup.find_all('p')
            content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

            # Filter out unwanted boilerplate text patterns
            unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
            for pattern in unwanted_patterns:
                content = content.replace(pattern, "")

            # Collapse extra whitespace
            content = ' '.join(content.split())

            extracted_articles.append({"title": title, "content": content})

    return extracted_articles
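
# Some news sites reject requests that use the default requests User-Agent
# and return 403s. A minimal, optional sketch of sending a browser-like
# header instead; the header value is an illustrative assumption, not part
# of the original code:
#
#   HEADERS = {"User-Agent": "Mozilla/5.0"}
#   article_response = requests.get(article_url, headers=HEADERS, timeout=5)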
if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    articles = extract_news(company, num_articles=2)
    print(articles)
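
# A small optional variant that prints each article more readably than the
# raw list above; purely illustrative and not part of the original script:
#
#   for i, article in enumerate(articles, start=1):
#       print(f"\n[{i}] {article['title']}")
#       print(article['content'][:300])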