# awacke1's picture
# Update app.py
# a853745
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
# Pages the app downloads and analyses; main() walks them in this order.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]
def scrape_wikipedia(url, timeout=10):
    """Download *url* and report how long the request took.

    Despite the name, nothing here is Wikipedia-specific: the URL is passed
    straight to ``requests.get``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        A dict with three keys:

        * ``'url'`` -- the URL that was requested.
        * ``'response_time'`` -- elapsed seconds as a float, or ``None`` if
          the request failed.
        * ``'content'`` -- the raw response body as ``bytes``; ``b""`` if the
          request failed, so callers can always treat it as bytes.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort scrape: network/HTTP-layer failures are reported as an
        # empty result instead of aborting the whole app. Only requests'
        # own errors are caught, so programming errors and KeyboardInterrupt
        # still propagate.
        return {'url': url, 'response_time': None, 'content': b""}
    elapsed = time.perf_counter() - start_time
    return {'url': url, 'response_time': elapsed, 'content': response.content}
def plot_word_frequencies(content):
    """Render a treemap of word frequencies for an HTML document.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements, split on
    whitespace (case and punctuation are kept as-is), and counted. The
    treemap groups words by their length, sizes each tile by the word's
    frequency and colours it by length. The chart is drawn into the current
    Streamlit app; nothing is returned.

    Args:
        content: HTML markup (``str`` or ``bytes``) to analyse.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    # most_common() sorts by descending count and keeps first-seen order
    # among ties, matching a stable sort over the insertion-ordered counts.
    ranked = Counter(text.split()).most_common()
    if not ranked:
        # Nothing to chart (empty page or failed download); skip the plot
        # rather than handing plotly an empty frame.
        st.write("No text found to plot.")
        return

    df = pd.DataFrame(ranked, columns=['word', 'freq'])
    df['len'] = df['word'].str.len()

    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)
def display_top_words(content):
    """Write the ten most frequent tokens of an HTML document to the app.

    Text is taken from the ``<p>`` and ``<h1>``-``<h6>`` elements and
    tokenised with a default ``CountVectorizer``. The result is written to
    the current Streamlit app as a single comma-separated line; nothing is
    returned.

    Args:
        content: HTML markup (``str`` or ``bytes``) to analyse.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(
        elem.text
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the text yields no tokens
        # (e.g. an empty page after a failed download).
        st.write("Top words: (none found)")
        return

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    word_freq = counts.toarray()[0]
    # argsort is ascending: take the last ten indices, then reverse them so
    # the most frequent token comes first.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
# Markdown shown at the bottom of the page, after the per-URL analysis.
_PRESS_RELEASE = """
# 📢 Press Release: Beautiful Soup - Your Ultimate Tool to Treat Internet as Your Dataset
Mound, MN - In today's digital age, the internet has become the primary source of information, and analyzing online content has become a critical aspect of business, research, and academic activities. To make it possible, Beautiful Soup - a Python library - is becoming increasingly popular among data scientists, researchers, and business professionals.
## 🤔 What is Beautiful Soup?
Beautiful Soup is a Python library used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree for parsed pages that can be used to extract data from HTML. The library provides methods that can be used to navigate, search, and modify the parse tree.
## 🚀 Powerful Features of Beautiful Soup
The Beautiful Soup library offers an array of features that make it the ultimate tool for web scraping and data extraction, including:
- Ability to extract data from HTML/XML files
- Powerful search capabilities to navigate the parse tree and locate specific tags and elements
- Ability to handle badly formatted HTML/XML
- Ability to convert XML to a tree-based structure
- Wide range of output formats
- Supports common web parsing libraries like html5lib and lxml
- Free and open-source library
## 💼 Applications of Beautiful Soup
Beautiful Soup has a wide range of applications, including:
- Data mining
- Web scraping
- Information extraction
- Research and analysis
- Content management
- Data journalism
- Competitive intelligence
## 🤖 Program Demonstrating the Power of Beautiful Soup
The recently developed Python program demonstrates how Beautiful Soup can be used to analyze content from Wikipedia pages and WHO's official website on Coronavirus.
The program uses various Beautiful Soup functions to scrape data from these websites and generate insights.
## 🔥 Why Choose Beautiful Soup?
Beautiful Soup is a user-friendly library that offers unmatched capabilities to treat the internet as a dataset.
Its powerful search capabilities, ability to handle badly formatted HTML/XML, and support for multiple output formats make it the go-to tool for web scraping and data extraction.
## 🚀 About the Developers
The program was developed by a team of data scientists and web developers who specialize in web scraping and data analysis and augmented using AI.
They are passionate about using technology to make data analysis more accessible and easy for everyone.
"""


def main():
    """Build the Streamlit page: scrape each URL, show stats, then the blurb.

    For every entry in ``urls`` the page shows the response time, the raw
    HTML, a word-frequency treemap and the top words. URLs that return no
    content are reported and skipped. The press-release markdown is rendered
    once after all URLs have been processed.
    """
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")

        content = scraped_data['content']
        if not content:
            # A failed scrape yields empty content; the analysis helpers
            # have nothing to work with, so move on to the next URL.
            st.write("No content could be retrieved from this URL.")
            continue

        # The body normally arrives as bytes, but tolerate str too, and never
        # let a page that is not valid UTF-8 abort the whole run.
        if isinstance(content, bytes):
            html = content.decode('utf-8', errors='replace')
        else:
            html = content

        st.write("Content: ")
        st.markdown(f"```{html}```")
        plot_word_frequencies(content)
        display_top_words(content)

    st.markdown(_PRESS_RELEASE)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()