Spaces:
Runtime error
Runtime error
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages fetched and analysed by main(), in display order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Fetch ``url`` with HTTP GET and report how long the request took.

    Despite the name, no Wikipedia-specific handling is done; any URL is
    requested as-is.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        A dict with three keys:
        ``'url'`` - the URL that was requested;
        ``'response_time'`` - elapsed seconds as a float, or ``None`` when
        the request failed;
        ``'content'`` - the raw response body as ``bytes`` (``b""`` when the
        request failed, so callers always receive the same type).
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: network-level failures are reported as an
        # empty result instead of aborting the caller. Unrelated errors
        # (e.g. KeyboardInterrupt, programming mistakes) still propagate.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Draw a treemap of word frequencies for an HTML document.

    Text is collected from the ``<p>`` and ``<h1>``-``<h6>`` elements, split
    on whitespace and counted. The counts are rendered through Streamlit as
    a Plotly treemap whose first level is the word length and whose tiles
    are sized by frequency.

    Args:
        content: HTML markup handed to ``BeautifulSoup``.

    Returns:
        None. The chart (or a short notice when no words were found) is
        written to the Streamlit page.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common() orders by descending count and keeps first-seen order
    # among ties, matching a stable sort on the frequency.
    ranked = Counter(text.split()).most_common()
    if not ranked:
        # Nothing to chart (empty page or failed download).
        st.write("No words found to plot.")
        return

    df = pd.DataFrame({
        'word': [word for word, _ in ranked],
        'freq': [freq for _, freq in ranked],
        'len': [len(word) for word, _ in ranked],
    })
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent tokens of an HTML document to the page.

    Text is collected from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised with a default ``CountVectorizer``; the ten tokens with the
    highest counts are shown, most frequent first.

    Args:
        content: HTML markup handed to ``BeautifulSoup``.

    Returns:
        None. The result is written to the Streamlit page.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # CountVectorizer raises ValueError when the text produces no tokens
        # (empty vocabulary), e.g. for an empty or failed download.
        st.write("Top words: (none found)")
        return

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back only on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # argsort is ascending: take the last ten indices and reverse them.
    top_words = [terms[i] for i in counts.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
def main():
    """Build the Streamlit page.

    For every entry in ``urls`` the page shows the response time, the raw
    page source, a word-frequency treemap and the top words. A static
    press-release text about Beautiful Soup is rendered at the end.
    """
    # set_page_config is issued before any other Streamlit call.
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        st.write("Content: ")

        if not content:
            # A failed fetch yields empty content; there is nothing to
            # display or analyse for this URL.
            st.warning(f"No content could be retrieved from {url}.")
            continue

        # The content may arrive as bytes (response body) or as str; decode
        # only bytes, and never fail on pages that are not valid UTF-8.
        if isinstance(content, bytes):
            page_source = content.decode('utf-8', errors='replace')
        else:
            page_source = content
        st.markdown(f"```{page_source}```")

        plot_word_frequencies(content)
        display_top_words(content)

    # NOTE(review): indentation was lost in the source this was recovered
    # from; rendering the press release once, after the loop, is assumed.
    # NOTE(review): the symbols right after "#"/"##" in the headings look
    # like mis-decoded emoji - restore them from the original file if
    # available. The text is kept exactly as found.
    st.markdown("""
# π’ Press Release: Beautiful Soup - Your Ultimate Tool to Treat Internet as Your Dataset
Mound, MN - In today's digital age, the internet has become the primary source of information, and analyzing online content has become a critical aspect of business, research, and academic activities. To make it possible, Beautiful Soup - a Python library - is becoming increasingly popular among data scientists, researchers, and business professionals.
## π€ What is Beautiful Soup?
Beautiful Soup is a Python library used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree for parsed pages that can be used to extract data from HTML. The library provides methods that can be used to navigate, search, and modify the parse tree.
## π Powerful Features of Beautiful Soup
The Beautiful Soup library offers an array of features that make it the ultimate tool for web scraping and data extraction, including:
- Ability to extract data from HTML/XML files
- Powerful search capabilities to navigate the parse tree and locate specific tags and elements
- Ability to handle badly formatted HTML/XML
- Ability to convert XML to a tree-based structure
- Wide range of output formats
- Supports common web parsing libraries like html5lib and lxml
- Free and open-source library
## πΌ Applications of Beautiful Soup
Beautiful Soup has a wide range of applications, including:
- Data mining
- Web scraping
- Information extraction
- Research and analysis
- Content management
- Data journalism
- Competitive intelligence
## π€ Program Demonstrating the Power of Beautiful Soup
The recently developed Python program demonstrates how Beautiful Soup can be used to analyze content from Wikipedia pages and WHO's official website on Coronavirus.
The program uses various Beautiful Soup functions to scrape data from these websites and generate insights.
## π₯ Why Choose Beautiful Soup?
Beautiful Soup is a user-friendly library that offers unmatched capabilities to treat the internet as a dataset.
Its powerful search capabilities, ability to handle badly formatted HTML/XML, and support for multiple output formats make it the go-to tool for web scraping and data extraction.
## π About the Developers
The program was developed by a team of data scientists and web developers who specialize in web scraping and data analysis and augmented using AI.
They are passionate about using technology to make data analysis more accessible and easy for everyone.
""")
# Build the page only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()