# Spaces: Runtime error
import time
from collections import Counter

import matplotlib.pyplot as plt
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages that main() downloads and analyses, in display order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Download *url* and measure how long the request took.

    Despite the name, nothing here is Wikipedia-specific; any URL that
    ``requests.get`` accepts can be passed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the app indefinitely.

    Returns:
        A dict with three keys:
        ``'url'`` -- the URL that was requested;
        ``'response_time'`` -- elapsed seconds as a float, or ``None`` when
        the request failed;
        ``'content'`` -- the raw response body as ``bytes`` (``b""`` when
        the request failed, so the value has the same type on both paths).
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort scraping: report the failure through the result dict
        # instead of raising, but only for network/HTTP-layer errors.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Draw a bar chart of the ten most frequent words in an HTML document.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements only.
    Words are whitespace-separated tokens, counted exactly as they appear
    (case and punctuation are not normalised).

    Args:
        content: HTML markup to analyse, as accepted by ``BeautifulSoup``.

    Returns:
        None. The chart is rendered into the Streamlit page.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common keeps first-seen order among equal counts, matching a
    # stable descending sort of the frequency table.
    top = Counter(text.split()).most_common(10)
    labels = [word for word, _ in top]
    counts = [count for _, count in top]

    # Use an explicit figure: calling st.pyplot() with no argument relies on
    # matplotlib's global state, which Streamlit has deprecated.
    fig, ax = plt.subplots()
    ax.bar(labels, counts)
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def display_top_words(content):
    """Write the ten most frequent terms of an HTML document to the page.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised by a default-configured ``CountVectorizer``.

    Args:
        content: HTML markup to analyse, as accepted by ``BeautifulSoup``.

    Returns:
        None. A single ``"Top words: ..."`` line is written with
        ``st.write``; the list is empty when the document yields no terms.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # fit_transform raises ValueError when the vocabulary is empty
        # (e.g. the page could not be fetched); show an empty list rather
        # than crashing the app.
        st.write("Top words: ")
        return

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    word_freq = X.toarray()[0]
    # argsort is ascending: take the last ten indices and reverse them to
    # list the most frequent term first.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Build the Streamlit page: fetch each URL and show its analysis.

    For every entry in ``urls`` the page shows the request time, the
    downloaded markup, a word-frequency bar chart and the list of top
    terms. A URL whose download failed is reported and skipped.
    """
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")

        # The scraper signals failure with a response time of None; there is
        # nothing to display or analyse in that case.
        if scraped_data['response_time'] is None:
            st.write(f"Could not fetch {url}; skipping.")
            continue

        content = scraped_data['content']
        # Decode leniently: a page with invalid UTF-8 bytes should still be
        # shown rather than abort the whole run. A str is passed through.
        if isinstance(content, bytes):
            html = content.decode('utf-8', errors='replace')
        else:
            html = content

        st.write("Content: ")
        # NOTE(review): this injects markup from a remote page into the app.
        # Streamlit advises against unsafe_allow_html for untrusted input;
        # consider displaying extracted text instead.
        st.write(html, unsafe_allow_html=True)
        plot_word_frequencies(content)
        display_top_words(content)
# Build the page only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()