# Spaces: Runtime error
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages to fetch and analyse; main() iterates over them in this order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Fetch *url* with an HTTP GET and time the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot hang the app indefinitely.

    Returns:
        A dict with three keys:
        ``'url'`` - the requested address;
        ``'response_time'`` - elapsed seconds as a float, or ``None`` if the
        request failed;
        ``'content'`` - the raw response body as ``bytes`` (``b""`` on
        failure, so the value has the same type on both paths).
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: report the failure through the result instead of
        # raising, while letting unrelated errors (e.g. KeyboardInterrupt)
        # propagate.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Draw a treemap of word frequencies found in an HTML document.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements, split on
    whitespace, and counted. The treemap groups words by their length and
    sizes each tile by the word's frequency.

    Args:
        content: HTML markup to analyse, as passed to ``BeautifulSoup``.

    Returns:
        None. The chart is rendered through ``st.plotly_chart``; if the
        document yields no words, a short notice is written instead.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common() orders by descending count and keeps first-seen order
    # among ties, matching a stable descending sort of the counts.
    ranked = Counter(text.split()).most_common()
    if not ranked:
        # Nothing to draw (e.g. the download failed or the page has no text).
        st.write("No words found to plot.")
        return

    df = pd.DataFrame(ranked, columns=['word', 'freq'])
    df['len'] = df['word'].str.len()

    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent terms of an HTML document to the page.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised with a default ``CountVectorizer``.

    Args:
        content: HTML markup to analyse, as passed to ``BeautifulSoup``.

    Returns:
        None. The result is emitted through ``st.write``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([text])
    except ValueError:
        # fit_transform raises ValueError when the text yields an empty
        # vocabulary (no tokens at all); report that instead of crashing.
        st.write("Top words: (none found)")
        return

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    word_freq = counts.toarray()[0]
    # argsort is ascending: take the last ten indices and reverse them to
    # list the most frequent term first.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Streamlit entry point: scrape each configured URL and show the results.

    For every address in ``urls`` the page is downloaded with
    ``scrape_wikipedia``, the response time and raw HTML are displayed, and
    the word-frequency treemap and top-word list are rendered. A page that
    returns no content is reported with a warning and skipped.
    """
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")

        content = scraped_data['content']
        if not content:
            # A failed request yields empty content; there is nothing to
            # decode or analyse, so move on to the next page.
            st.warning(f"No content retrieved from {url}; skipping analysis.")
            continue

        st.write("Content: ")
        # Content is normally bytes, but tolerate str as well; undecodable
        # bytes are replaced rather than aborting the whole page.
        if isinstance(content, bytes):
            html_text = content.decode(errors='replace')
        else:
            html_text = content
        # st.code shows the markup verbatim; a hand-built ``` fence would
        # swallow the first line as a language tag and break on backticks.
        st.code(html_text, language='html')

        plot_word_frequencies(content)
        display_top_words(content)
# Run the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()