Spaces:
Runtime error
Runtime error
File size: 2,486 Bytes
e7120e9 4508b3b e7120e9 8ddf073 42b3f33 612c7fd e7120e9 bee76fe f025397 8ddf073 f025397 8ddf073 c485d4d f025397 c485d4d bee76fe 612c7fd 42b3f33 612c7fd e6e9141 612c7fd b4e1b44 42b3f33 bee76fe f025397 8ddf073 4508b3b c485d4d 42b3f33 612c7fd e7120e9 b4e1b44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages that main() downloads and analyses, in display order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Download *url* and measure how long the request took.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up, so a
            stalled host cannot block the app forever.

    Returns:
        A dict with three keys:
        ``'url'`` - the address that was requested;
        ``'response_time'`` - elapsed seconds as a float, or ``None`` when
        the request failed;
        ``'content'`` - the raw response body as ``bytes`` (empty bytes on
        failure, so callers can treat both outcomes uniformly).
    """
    start_time = time.time()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort scraping: a network failure or invalid URL yields an
        # empty result instead of aborting the run. The body stays bytes
        # (not str) to match the type returned on success.
        return {'url': url, 'response_time': None, 'content': b""}
    end_time = time.time()
    return {
        'url': url,
        'response_time': end_time - start_time,
        'content': response.content,
    }
def plot_word_frequencies(content):
    """Draw a treemap of word frequencies for an HTML document.

    Text is collected from the ``<p>`` and ``<h1>``-``<h6>`` elements, split
    on whitespace and counted. The treemap groups words by their length and
    sizes each tile by how often the word occurs.

    Args:
        content: HTML markup to parse with BeautifulSoup.

    Returns:
        None. The chart (or a short notice when no text was found) is
        written to the Streamlit page.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )
    # most_common() sorts by descending count and keeps first-seen order
    # among ties, matching a stable sort on the frequency.
    ranked = Counter(text.split()).most_common()
    if not ranked:
        # Nothing to visualise; avoid handing an empty frame to the treemap.
        st.write("No text found to plot.")
        return
    df = pd.DataFrame({
        'word': [word for word, _ in ranked],
        'freq': [freq for _, freq in ranked],
        'len': [len(word) for word, _ in ranked],
    })
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent terms of an HTML document to the page.

    Text is collected from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised with scikit-learn's ``CountVectorizer``.

    Args:
        content: HTML markup to parse with BeautifulSoup.

    Returns:
        None. The result is written to the Streamlit page; nothing is
        written when the document yields no countable terms.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )
    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the text produces an empty
        # vocabulary (e.g. a failed or text-free page): nothing to report.
        return
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    word_freq = counts.toarray()[0]
    # argsort is ascending, so take the last ten indices and reverse them.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Build the Streamlit page: scrape each URL and show its analysis.

    For every entry in ``urls`` the page shows the response time, the raw
    page source, a word-frequency treemap and the top words. A URL whose
    download produced no content is reported and skipped so that one
    failure does not stop the remaining pages from rendering.
    """
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        if not content:
            # A failed request yields empty content; there is nothing to
            # decode or analyse, so move on to the next URL.
            st.write("No content could be retrieved.")
            continue
        if isinstance(content, bytes):
            # errors='replace' keeps a stray non-UTF-8 byte from raising
            # UnicodeDecodeError and aborting the whole page.
            text = content.decode('utf-8', errors='replace')
        else:
            text = content
        st.write("Content: ")
        st.markdown(f"```{text}```")
        plot_word_frequencies(content)
        display_top_words(content)
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|