import requests
from bs4 import BeautifulSoup
# NewsAPI Key
NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
def extract_news(company, num_articles=2):
    """Fetch news articles about `company` from NewsAPI and return their titles and contents."""
    url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []
    data = response.json()
    articles = data.get("articles", [])
    if not articles:
        print("No articles found.")
        return []
    extracted_articles = []
    for article in articles[:num_articles]:  # Process only the requested number of articles
        article_url = article.get("url")
        if not article_url:
            continue  # Skip articles that do not include a URL
        # Scrape the article page for its title and body text
        try:
            article_response = requests.get(article_url, timeout=5)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article: {e}")
            continue
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            title = soup.title.string if soup.title and soup.title.string else "No Title Found"
            # Extract paragraph text and join it into a single string
            paragraphs = soup.find_all('p')
            content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
            # Filter out boilerplate phrases commonly scraped alongside article text
            unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
            for pattern in unwanted_patterns:
                content = content.replace(pattern, "")
            # Collapse the extra whitespace left behind by the removals
            content = ' '.join(content.split())
            extracted_articles.append({"title": title, "content": content})
    return extracted_articles

if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    articles = extract_news(company, num_articles=2)
    print(articles)
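# Example run (shape only; actual titles and contents depend on the live
# NewsAPI results and the scraped pages):
#   Enter the company name for analysis: Tesla
#   [{'title': '...', 'content': '...'}, {'title': '...', 'content': '...'}]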