import requests
from bs4 import BeautifulSoup

# NewsAPI key
NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"

def extract_news(company, num_articles=2):
    """Fetch multiple news articles from NewsAPI and return their titles and content."""
    url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
    response = requests.get(url, timeout=5)
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()
    articles = data.get("articles", [])
    if not articles:
        print("No articles found.")
        return []

    extracted_articles = []
    for article in articles[:num_articles]:  # Process only the required number of articles
        article_url = article.get("url")
        if not article_url:
            continue

        # Scrape the article page for its title and body text
        try:
            article_response = requests.get(article_url, timeout=5)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article: {e}")
            continue

        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"

            # Extract paragraphs and join their cleaned text
            paragraphs = soup.find_all('p')
            content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

            # Filter out unwanted boilerplate text patterns
            unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
            for pattern in unwanted_patterns:
                content = content.replace(pattern, "")

            # Collapse extra whitespace
            content = ' '.join(content.split())

            extracted_articles.append({"title": title, "content": content})

    return extracted_articles
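
# Some news sites reject requests that use the default requests User-Agent
# and return 403s. A minimal, optional sketch of sending a browser-like
# header instead; the header value is an illustrative assumption, not part
# of the original code:
#
#   HEADERS = {"User-Agent": "Mozilla/5.0"}
#   article_response = requests.get(article_url, headers=HEADERS, timeout=5)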
if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    articles = extract_news(company, num_articles=2)
    print(articles)
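
# A small optional variant that prints each article more readably than the
# raw list above; purely illustrative and not part of the original script:
#
#   for i, article in enumerate(articles, start=1):
#       print(f"\n[{i}] {article['title']}")
#       print(article['content'][:300])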