Update approach_api/utils/news_extraction_api.py
approach_api/utils/news_extraction_api.py
CHANGED
@@ -4,7 +4,7 @@ from bs4 import BeautifulSoup
 # NewsAPI Key
 NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
 
-def extract_news(company, num_articles=2):
+def extract_news(company, num_articles=15):
     """Fetch multiple news articles from NewsAPI and return titles and contents."""
     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
     response = requests.get(url)
@@ -48,62 +48,3 @@ def extract_news(company, num_articles=2):
     return extracted_articles
 
 
-# import requests
-# from bs4 import BeautifulSoup
-
-# # NewsAPI Key
-# NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
-
-# def fetch_articles(company, num_articles=11):
-#     """Fetch multiple news articles from NewsAPI and return their titles and content."""
-#     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
-#     response = requests.get(url)
-
-#     if response.status_code != 200:
-#         print("Error:", response.status_code, response.text)
-#         return []
-
-#     data = response.json()
-#     articles = data.get("articles", [])
-
-#     if not articles:
-#         print("No articles found.")
-#         return []
-
-#     fetched_articles = []
-
-#     for article in articles[:num_articles]:  # Fetch only the required number of articles
-#         article_url = article.get("url")
-#         if not article_url:
-#             continue
-
-#         # Scrape the article for title and content
-#         try:
-#             article_response = requests.get(article_url, timeout=5)  # Removed headers
-#             if article_response.status_code == 200:
-#                 soup = BeautifulSoup(article_response.content, 'html.parser')
-#                 title = soup.title.string if soup.title else "No Title Found"
-
-#                 # Extract paragraphs and clean the content
-#                 paragraphs = soup.find_all('p')
-#                 content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
-
-#                 # Remove unwanted text patterns
-#                 unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
-#                 for pattern in unwanted_patterns:
-#                     content = content.replace(pattern, "")
-
-#                 # Clean up extra spaces
-#                 content = ' '.join(content.split())
-
-#                 # Store the article's title and content
-#                 fetched_articles.append({"title": title, "content": content})
-#         except requests.exceptions.RequestException as e:
-#             print(f"Error fetching article: {e}")
-
-#     return fetched_articles
-
-# if __name__ == "__main__":
-#     company = input("Enter the company name for analysis: ").strip()
-#     articles = fetch_articles(company, num_articles=11)
-#     print(articles)
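For context, a minimal usage sketch of the updated function follows. It is illustrative only, not part of the commit: the import path mirrors the file location shown above, and the {"title": ..., "content": ...} shape of each returned item is assumed from the deleted fetch_articles helper, since the full body of extract_news does not appear in this diff.

# Minimal usage sketch (illustrative, not part of this commit).
# Assumes extract_news returns a list of {"title": ..., "content": ...}
# dicts, matching the shape used by the deleted fetch_articles helper.
from approach_api.utils.news_extraction_api import extract_news

if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    articles = extract_news(company)  # num_articles now defaults to 15
    for i, article in enumerate(articles, start=1):
        print(f"{i}. {article['title']}")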