# Page header captured with the file: awacke1 - "Update app.py" - 42b3f33 - 2.43 kB
import requests
from bs4 import BeautifulSoup
import streamlit as st
import time
import plotly.express as px
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Pages that main() fetches and analyses, in display order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url):
    """Fetch *url* over HTTP and time the request.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with three keys:
          * ``'url'``: the address that was requested.
          * ``'response_time'``: seconds spent in ``requests.get``, or
            ``None`` when the request failed.
          * ``'content'``: the raw response body as bytes; ``b""`` when the
            request failed, so the value has the same type on both paths.
    """
    try:
        start_time = time.time()
        # A timeout keeps one unresponsive host from hanging the whole page.
        response = requests.get(url, timeout=10)
        end_time = time.time()
    except requests.RequestException:
        # Network-level failure (DNS, connection, timeout, invalid URL, ...):
        # report it through the None response time instead of raising.
        return {'url': url, 'response_time': None, 'content': b""}
    return {
        'url': url,
        'response_time': end_time - start_time,
        'content': response.content,
    }
def plot_word_frequencies(content):
    """Render a treemap of word frequencies found in an HTML document.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements, split on
    whitespace, and counted case-sensitively. The treemap groups words by
    their character length and sizes each tile by its count.

    Args:
        content: HTML markup to analyse, in any form ``BeautifulSoup``
            accepts.

    Returns:
        None. The chart is emitted with ``st.plotly_chart``; when the
        document contains no words a short notice is written instead.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )
    words = text.split()
    if not words:
        # Nothing to count (e.g. the fetch failed or the page has no
        # paragraphs/headings), so there is no chart worth drawing.
        st.write("No words found to plot.")
        return

    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1

    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sorted_word_freq, columns=['word', 'freq'])
    df['len'] = [len(word) for word in df['word']]

    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    # Drop the default margins so the treemap fills its container.
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent terms of an HTML document to the page.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised with a default ``CountVectorizer``.

    Args:
        content: HTML markup to analyse, in any form ``BeautifulSoup``
            accepts.

    Returns:
        None. A single ``"Top words: ..."`` line is emitted with
        ``st.write``; the list is empty when no terms could be extracted.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )
    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty
        # (no text, or nothing that survives tokenisation).
        st.write("Top words: ")
        return

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); fall back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    word_freq = X.toarray()[0]
    # argsort is ascending: take the last ten indices and reverse them so the
    # most frequent term comes first.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Build the Streamlit page: fetch every entry of ``urls`` and report on it.

    For each URL the page shows the request time, the raw page source, a
    word-frequency treemap and the list of top words. A URL whose fetch
    returned no content is reported and skipped instead of aborting the run.
    """
    # Page configuration is issued before any other Streamlit call.
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        if not content:
            # The scraper signals failure with empty content; there is
            # nothing to decode or analyse for this URL.
            st.write("No content retrieved; skipping analysis.")
            continue

        # The body normally arrives as bytes. Decode leniently so a page
        # that is not valid UTF-8 cannot abort the loop, and pass through a
        # value that is already text.
        if isinstance(content, bytes):
            page_source = content.decode(errors='replace')
        else:
            page_source = content

        st.write("Content: ")
        # st.code shows the markup verbatim; wrapping it in an inline
        # ``` fence made Markdown treat the first line as the fence's
        # language tag.
        st.code(page_source, language='html')
        plot_word_frequencies(content)
        display_top_words(content)
# Run the app only when this file is executed as the entry script.
if __name__ == '__main__':
    main()