"""Streamlit page that fetches a few health-care articles and charts their words.

For every URL in ``urls`` the page shows the request time, the raw HTML, a
treemap of word frequencies and the ten most frequent terms.
"""

import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]

# Upper bound, in seconds, for a single HTTP request so that one unresponsive
# host cannot stall the whole page.
REQUEST_TIMEOUT_SECONDS = 10

# Tags whose text is considered article content.
_TEXT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def _extract_text(content):
    """Return the text of all paragraph and heading tags, joined by spaces."""
    soup = BeautifulSoup(content, 'html.parser')
    return ' '.join(elem.text for elem in soup.find_all(_TEXT_TAGS))


def scrape_wikipedia(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Fetch *url* and report how long the request took.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``'url'``, ``'response_time'`` and ``'content'``.
        On success ``'response_time'`` is the elapsed time in seconds and
        ``'content'`` is the raw response body (bytes). If the request fails,
        ``'response_time'`` is ``None`` and ``'content'`` is the empty
        string ``""``.
    """
    try:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments.
        started = time.perf_counter()
        response = requests.get(url, timeout=timeout)
        elapsed = time.perf_counter() - started
    except requests.RequestException:
        # Best effort: a failed download is reported to the caller through
        # the None response time instead of aborting the whole page.
        return {'url': url, 'response_time': None, 'content': ""}
    return {'url': url, 'response_time': elapsed, 'content': response.content}


def plot_word_frequencies(content):
    """Render a treemap of whitespace-separated word counts found in *content*.

    Words are grouped (and coloured) by their length; the tile size is the
    number of occurrences. Nothing is plotted when *content* has no text in
    paragraph or heading tags.

    Args:
        content: HTML document as ``bytes`` or ``str``.
    """
    words = _extract_text(content).split()
    if not words:
        st.write("No text found to plot.")
        return

    # most_common() orders by descending count and keeps first-seen order
    # among ties, matching a stable sort on the counts.
    ranked = Counter(words).most_common()
    df = pd.DataFrame({
        'word': [word for word, _ in ranked],
        'freq': [freq for _, freq in ranked],
        'len': [len(word) for word, _ in ranked],
    })
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)


def display_top_words(content):
    """Write the ten most frequent terms of *content* to the page.

    Terms are tokenised and counted by ``CountVectorizer`` with its default
    settings.

    Args:
        content: HTML document as ``bytes`` or ``str``.
    """
    text = _extract_text(content)
    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # CountVectorizer raises ValueError when the text yields no tokens.
        st.write("Top words: ")
        return

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    top_words = [terms[i] for i in counts.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")


def main():
    """Build the page: scrape every URL in ``urls`` and show its analysis."""
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        if scraped_data['response_time'] is None:
            # The download failed; there is nothing to display or analyse.
            st.warning(f"Could not fetch {url}; skipping.")
            continue

        content = scraped_data['content']
        # Undecodable bytes are replaced rather than raising, since pages are
        # not guaranteed to be valid UTF-8.
        if isinstance(content, bytes):
            html = content.decode('utf-8', errors='replace')
        else:
            html = content
        st.write("Content: ")
        # st.code renders the markup verbatim; pasting it between inline
        # backtick fences loses the first line and breaks on embedded fences.
        st.code(html, language='html')
        plot_word_frequencies(content)
        display_top_words(content)


if __name__ == '__main__':
    main()