Spaces:

awacke1
/

VizLib-BeautifulSoup

Runtime error

App Files Files Community

VizLib-BeautifulSoup / app.py

awacke1

Update app.py

42b3f33 over 2 years ago

raw

history blame

2.43 kB

	import time
	from collections import Counter

	import pandas as pd
	import plotly.express as px
	import requests
	import streamlit as st
	from bs4 import BeautifulSoup
	from sklearn.feature_extraction.text import CountVectorizer

	# Pages fetched and analysed by main(), in display order.
	# Two are Wikipedia articles; the last one is a WHO page.
	urls = ['https://en.wikipedia.org/wiki/Health_care',
	'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
	'https://www.who.int/health-topics/coronavirus#tab=tab_1']

	def scrape_wikipedia(url, timeout=10):
	    """Fetch *url* with an HTTP GET and measure how long the request took.

	    The HTTP status code is not inspected: any response the server sends
	    back (including error pages) is returned as-is.

	    Args:
	        url: Address passed to ``requests.get``.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests`` may block indefinitely.

	    Returns:
	        A dict with three keys:
	        ``'url'`` -- the requested address;
	        ``'response_time'`` -- elapsed seconds as a float, or ``None`` if
	        the request failed;
	        ``'content'`` -- the raw response body as bytes (``b""`` on
	        failure, so callers always receive the same type).
	    """
	    # perf_counter is monotonic, so the interval cannot go negative if the
	    # wall clock is adjusted while the request is in flight.
	    started = time.perf_counter()
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        # Best effort: network problems yield an empty result instead of
	        # aborting the whole page. Unrelated errors are not swallowed.
	        return {'url': url, 'response_time': None, 'content': b""}
	    elapsed = time.perf_counter() - started
	    return {'url': url, 'response_time': elapsed, 'content': response.content}

	def plot_word_frequencies(content):
	    """Draw a treemap of word frequencies found in an HTML document.

	    Text is taken from paragraph and heading tags (``p``, ``h1``-``h6``),
	    split on whitespace, and counted. Words are case-sensitive and keep any
	    attached punctuation. The treemap groups words by their length and
	    sizes each tile by its count.

	    Args:
	        content: HTML markup (str or bytes) to parse with BeautifulSoup.

	    Returns:
	        None. The chart is rendered into the Streamlit page.
	    """
	    soup = BeautifulSoup(content, 'html.parser')
	    text = ' '.join(
	        elem.text
	        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    )

	    # most_common() orders by descending count and keeps first-seen order
	    # among ties, matching a stable sort on the counts.
	    ranked = Counter(text.split()).most_common()
	    if not ranked:
	        # Nothing to chart (e.g. the fetch failed); skip building a treemap
	        # from an empty frame.
	        st.write("No text found to plot.")
	        return

	    df = pd.DataFrame({
	        'word': [word for word, _ in ranked],
	        'freq': [freq for _, freq in ranked],
	        'len': [len(word) for word, _ in ranked],
	    })
	    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
	    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
	    st.plotly_chart(fig)

	def display_top_words(content):
	    """Write the ten most frequent terms of an HTML document to the page.

	    Text is taken from paragraph and heading tags (``p``, ``h1``-``h6``)
	    and tokenised with a default ``CountVectorizer``.

	    Args:
	        content: HTML markup (str or bytes) to parse with BeautifulSoup.

	    Returns:
	        None. A single "Top words: ..." line is written via Streamlit.
	    """
	    soup = BeautifulSoup(content, 'html.parser')
	    text = ' '.join(
	        elem.text
	        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    )

	    vectorizer = CountVectorizer()
	    try:
	        counts = vectorizer.fit_transform([text]).toarray()[0]
	    except ValueError:
	        # CountVectorizer raises ValueError when the text yields an empty
	        # vocabulary (e.g. empty page after a failed fetch).
	        st.write("Top words: (none found)")
	        return

	    # get_feature_names() was removed in scikit-learn 1.2; prefer its
	    # replacement and fall back only on releases that predate it.
	    if hasattr(vectorizer, 'get_feature_names_out'):
	        terms = vectorizer.get_feature_names_out()
	    else:
	        terms = vectorizer.get_feature_names()

	    # argsort is ascending: take the last ten indices, then reverse so the
	    # most frequent term comes first.
	    top_words = [terms[i] for i in counts.argsort()[-10:][::-1]]
	    st.write(f"Top words: {', '.join(top_words)}")

	def main():
	    """Build the Streamlit page: fetch each URL and show its analysis.

	    For every entry in ``urls`` the page shows the response time, the raw
	    HTML source, a word-frequency treemap and the top-ten term list.
	    """
	    st.set_page_config(layout='wide')
	    st.title("List of Articles on Health Care")

	    for url in urls:
	        st.write(f"Scraping {url}...")
	        scraped_data = scrape_wikipedia(url)
	        st.write(f"Response time: {scraped_data['response_time']}")
	        content = scraped_data['content']
	        st.write("Content: ")

	        # The body is bytes on success, but a failed fetch may hand back a
	        # str; only decode when needed, and never let a stray non-UTF-8
	        # byte abort the whole page.
	        if isinstance(content, bytes):
	            html = content.decode('utf-8', errors='replace')
	        else:
	            html = content
	        # st.code shows the markup verbatim; wrapping it in inline
	        # backticks via st.markdown breaks as soon as the HTML itself
	        # contains backticks or the fence is not on its own line.
	        st.code(html, language='html')

	        plot_word_frequencies(content)
	        display_top_words(content)

	# Call main() only when this file is the entry module, not on import.
	if __name__ == '__main__':
	    main()