# VizLib-BeautifulSoup / backup-app.py
# awacke1's picture
# Update backup-app.py
# fae8400
# raw
# history blame
# 2.18 kB
import time
from collections import Counter

import matplotlib.pyplot as plt
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages that main() fetches and analyses, in display order.
urls = [
    "https://en.wikipedia.org/wiki/Health_care",
    "https://en.wikipedia.org/wiki/Health_information_on_the_Internet",
    "https://www.who.int/health-topics/coronavirus#tab=tab_1",
]
def scrape_wikipedia(url, timeout=10):
    """Fetch *url* with an HTTP GET and measure how long the request took.

    Despite the name, nothing here is Wikipedia-specific; the URL is passed
    straight to ``requests.get``.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` may wait on the server before giving
            up, so that a stalled host cannot hang the app indefinitely.

    Returns:
        A dict with three keys:
        ``'url'`` -- the requested address;
        ``'response_time'`` -- elapsed seconds as a float, or ``None`` when
        the request failed;
        ``'content'`` -- the raw response body as bytes (``b""`` on failure,
        so the value has the same type on both paths).

        HTTP error statuses (4xx/5xx) are not treated as failures; their
        body is returned like any other response.
    """
    # perf_counter is monotonic, unlike time.time(), so the interval cannot
    # go negative if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: network/URL problems are reported through the
        # return value instead of propagating. Unrelated exceptions (e.g.
        # KeyboardInterrupt) are deliberately no longer swallowed.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Render a bar chart of the ten most frequent words in an HTML document.

    The text of every ``<p>`` and ``<h1>``-``<h6>`` element is joined,
    split on whitespace, and counted. Words are compared exactly as they
    appear (case-sensitive, punctuation attached).

    Args:
        content: HTML markup (str or bytes) to parse with BeautifulSoup.

    Returns:
        None. The chart is sent to the Streamlit page via ``st.pyplot``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common keeps first-seen order among ties, the same ordering the
    # previous stable sort with reverse=True produced.
    top = Counter(text.split()).most_common(10)
    labels = [word for word, _ in top]
    counts = [count for _, count in top]

    # Draw on a dedicated figure instead of pyplot's global one: calling
    # st.pyplot() without a figure is deprecated and shares state between
    # calls.
    fig, ax = plt.subplots()
    ax.bar(labels, counts)
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def display_top_words(content):
    """Write the ten most frequent tokens of an HTML document to the page.

    The text of every ``<p>`` and ``<h1>``-``<h6>`` element is joined and
    tokenised with a default ``CountVectorizer``; the ten terms with the
    highest counts are listed, most frequent first.

    Args:
        content: HTML markup (str or bytes) to parse with BeautifulSoup.

    Returns:
        None. The result is emitted with ``st.write``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the text yields no tokens
        # (e.g. an empty or failed download); show an empty list instead of
        # crashing the app.
        top_words = []
    else:
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()
        word_freq = X.toarray()[0]
        # argsort is ascending: take the last ten indices and reverse them.
        top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]

    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Build the Streamlit page: fetch each URL in ``urls`` and analyse it.

    For every URL the page shows the response time, the fetched markup, a
    word-frequency bar chart and the list of top words. A URL whose fetch
    failed is reported and skipped instead of aborting the whole page.
    """
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)

        # scrape_wikipedia signals failure with response_time None; there is
        # nothing to decode or analyse in that case.
        if scraped_data['response_time'] is None:
            st.error(f"Could not fetch {url}; skipping.")
            continue

        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        st.write("Content: ")

        # Bytes that are not valid UTF-8 must not crash the page, so
        # undecodable sequences are replaced rather than raising.
        if isinstance(content, bytes):
            html = content.decode(errors="replace")
        else:
            html = content

        # NOTE(review): this injects remote, untrusted HTML into the page
        # unescaped. Kept to preserve the existing display; consider
        # showing it escaped (e.g. as a code block) instead.
        st.write(html, unsafe_allow_html=True)

        plot_word_frequencies(content)
        display_top_words(content)
# Script entry point: build the page only when executed, not when imported.
if __name__ == "__main__":
    main()