Spaces:
Runtime error
Runtime error
File size: 2,434 Bytes
91710f3 fae8400 91710f3 cf20733 fae8400 91710f3 95fcb96 91710f3 95fcb96 91710f3 fae8400 cf20733 fae8400 91710f3 cf20733 91710f3 fae8400 95fcb96 cf20733 fae8400 91710f3 cf20733 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages fetched and analysed by main(), in display order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Download *url* and measure how long the request took.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot hang the app indefinitely.

    Returns:
        A dict with three keys:
        ``'url'`` - the address that was requested;
        ``'response_time'`` - elapsed seconds as a float, or ``None`` if the
        request failed;
        ``'content'`` - the raw response body as bytes (``b""`` on failure,
        so the value has the same type on both paths).
    """
    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: network/HTTP-layer failures are reported to the caller
        # through response_time=None instead of propagating.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Render a treemap of word frequencies found in an HTML document.

    Text is collected from paragraph and heading tags (``p``, ``h1``-``h6``),
    split on whitespace and counted. The treemap groups words by their
    character length; tile size is the word's frequency and tile colour is
    its length. The figure is emitted with ``st.plotly_chart``.

    Args:
        content: HTML markup to analyse, as accepted by ``BeautifulSoup``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common() orders by descending count and keeps first-seen order
    # among ties, matching a stable reverse sort on the counts.
    ranked = Counter(text.split()).most_common()
    if not ranked:
        # Nothing to chart (e.g. the fetch failed or the page has no text).
        st.write("No text found to plot.")
        return

    df = pd.DataFrame(ranked, columns=['word', 'freq'])
    df['len'] = df['word'].str.len()

    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent tokens of an HTML document to the page.

    Text is collected from paragraph and heading tags (``p``, ``h1``-``h6``)
    and tokenised with a default ``CountVectorizer``. The result is written
    with ``st.write`` as a comma-separated line, most frequent first.

    Args:
        content: HTML markup to analyse, as accepted by ``BeautifulSoup``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the document yields an
        # empty vocabulary (no text, or nothing that tokenises).
        st.write("Top words: (none found)")
        return

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    word_freq = X.toarray()[0]
    # argsort is ascending: take the last ten indices and reverse them.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Streamlit entry point: fetch every URL in ``urls`` and show its analysis.

    For each page this writes the response time, the page source, a
    word-frequency treemap and the list of top words. A page whose fetch
    failed is reported with a warning and skipped instead of aborting the
    whole run.
    """
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)

        # scrape_wikipedia signals a failed request with response_time=None.
        if scraped_data['response_time'] is None:
            st.warning(f"Could not fetch {url}; skipping.")
            continue

        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']

        # The body is normally bytes; tolerate str, and replace undecodable
        # bytes so a non-UTF-8 page cannot raise UnicodeDecodeError.
        if isinstance(content, bytes):
            page_source = content.decode('utf-8', errors='replace')
        else:
            page_source = content

        st.write("Content: ")
        # st.code shows the source verbatim; splicing it into a markdown
        # fence turns its first line into the fence info string and breaks
        # on any backtick fence inside the page.
        st.code(page_source, language='html')

        plot_word_frequencies(content)
        display_top_words(content)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|