Spaces:

awacke1
/

VizLib-BeautifulSoup

Runtime error

App Files Files Community

VizLib-BeautifulSoup / backup-app.py

awacke1

Update backup-app.py

fae8400 over 2 years ago

raw

history blame

2.18 kB

	import time
	from collections import Counter

	import matplotlib.pyplot as plt
	import requests
	import streamlit as st
	from bs4 import BeautifulSoup
	from sklearn.feature_extraction.text import CountVectorizer

	# Pages that main() downloads and analyses, in the order they are shown.
	urls = [
	    'https://en.wikipedia.org/wiki/Health_care',
	    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
	    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
	]

	def scrape_wikipedia(url, timeout=10):
	    """Download *url* and measure how long the request took.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled host cannot hang the app indefinitely.

	    Returns:
	        A dict with three keys:
	        ``'url'`` - the address that was requested;
	        ``'response_time'`` - elapsed seconds as a float, or ``None`` if
	        the request failed;
	        ``'content'`` - the raw response body as bytes (``b""`` if the
	        request failed).
	    """
	    start_time = time.perf_counter()
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        # Best effort: report the failure through the result instead of
	        # raising. The empty body is bytes, matching the success path, so
	        # callers can treat 'content' uniformly (e.g. call .decode()).
	        return {'url': url, 'response_time': None, 'content': b""}
	    elapsed = time.perf_counter() - start_time
	    return {'url': url, 'response_time': elapsed, 'content': response.content}

	def plot_word_frequencies(content):
	    """Render a bar chart of the ten most frequent words in an HTML page.

	    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements of
	    *content* and split on whitespace, so counting is case-sensitive and
	    punctuation attached to a word is kept as part of it.

	    Args:
	        content: HTML markup (str or bytes) to analyse.

	    Returns:
	        None. The chart is sent to the Streamlit page.
	    """
	    soup = BeautifulSoup(content, 'html.parser')
	    text = ' '.join(
	        elem.text
	        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    )
	    top = Counter(text.split()).most_common(10)
	    labels = [word for word, _ in top]
	    counts = [count for _, count in top]

	    # Use a dedicated figure per call: drawing on pyplot's implicit global
	    # figure would stack the bars of every previously plotted page.
	    fig, ax = plt.subplots()
	    ax.bar(labels, counts)
	    ax.tick_params(axis='x', labelrotation=45)
	    st.pyplot(fig)
	    # Release the figure once Streamlit has rendered it.
	    plt.close(fig)

	def display_top_words(content):
	    """Write the ten most frequent terms of an HTML page to Streamlit.

	    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements of
	    *content* and tokenised with a default ``CountVectorizer``.

	    Args:
	        content: HTML markup (str or bytes) to analyse.

	    Returns:
	        None. A single "Top words: ..." line is written to the page; the
	        list is empty when the page yields no countable terms.
	    """
	    soup = BeautifulSoup(content, 'html.parser')
	    text = ' '.join(
	        elem.text
	        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    )
	    vectorizer = CountVectorizer()
	    try:
	        X = vectorizer.fit_transform([text])
	    except ValueError:
	        # CountVectorizer raises ValueError when the vocabulary is empty
	        # (no text extracted, or nothing survives tokenisation).
	        st.write("Top words: ")
	        return

	    # get_feature_names() was removed in scikit-learn 1.2; prefer the
	    # replacement and fall back only on releases that predate it.
	    if hasattr(vectorizer, 'get_feature_names_out'):
	        terms = vectorizer.get_feature_names_out()
	    else:
	        terms = vectorizer.get_feature_names()

	    word_freq = X.toarray()[0]
	    # argsort is ascending: take the last ten indices, then reverse them
	    # so the most frequent term comes first.
	    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
	    st.write(f"Top words: {', '.join(top_words)}")

	def main():
	    """Build the Streamlit page: fetch each URL and show its analysis.

	    For every entry in ``urls`` the page shows the response time, the
	    fetched markup, a word-frequency bar chart and the list of top terms.
	    URLs that return no content are reported and skipped.
	    """
	    st.title("List of Articles on Health Care")

	    for url in urls:
	        st.write(f"Scraping {url}...")
	        scraped_data = scrape_wikipedia(url)
	        st.write(f"Response time: {scraped_data['response_time']}")
	        content = scraped_data['content']

	        if not content:
	            # Nothing was retrieved (request failed or empty body); the
	            # rendering and analysis steps below have nothing to work on.
	            st.warning(f"No content retrieved from {url}; skipping.")
	            continue

	        st.write("Content: ")
	        # The body is normally bytes; tolerate str and undecodable bytes
	        # so one odd page cannot abort the whole run.
	        if isinstance(content, bytes):
	            html = content.decode(errors='replace')
	        else:
	            html = content
	        # NOTE(security): this renders third-party HTML unescaped inside
	        # the app. Kept as in the original, but remote markup is untrusted
	        # input - review whether raw rendering is really wanted.
	        st.write(html, unsafe_allow_html=True)

	        plot_word_frequencies(content)
	        display_top_words(content)

	# Build the page only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()