import requests
from bs4 import BeautifulSoup
import streamlit as st
import time
import plotly.express as px
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Pages to fetch and analyze.
urls = ['https://en.wikipedia.org/wiki/Health_care',
        'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
        'https://www.who.int/health-topics/coronavirus#tab=tab_1']

def scrape_wikipedia(url):
    """Fetch a page and report how long the request took."""
    try:
        start_time = time.time()
        response = requests.get(url, timeout=10)
        end_time = time.time()
        return {'url': url, 'response_time': end_time - start_time, 'content': response.content}
    except requests.RequestException:
        # Return empty bytes so downstream .decode() and parsing still work.
        return {'url': url, 'response_time': None, 'content': b""}

def plot_word_frequencies(content):
    """Render a treemap of word frequencies, grouped by word length."""
    soup = BeautifulSoup(content, 'html.parser')
    # Keep only the visible text from paragraphs and headings.
    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
    words = text.split()
    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame({'word': [word for word, freq in sorted_word_freq],
                       'freq': [freq for word, freq in sorted_word_freq],
                       'len': [len(word) for word, freq in sorted_word_freq]})
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)

def display_top_words(content):
    """Show the ten most frequent terms from a simple bag-of-words count."""
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
    terms = vectorizer.get_feature_names_out()
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")

def main():
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        st.write(f"Content: ")
        st.markdown(f"```{content.decode()}```")

        plot_word_frequencies(content)
        display_top_words(content)

    st.markdown("""

# 📒 Press Release: Beautiful Soup - Your Ultimate Tool to Treat the Internet as Your Dataset

Mound, MN - In today's digital age, the internet has become a primary source of information, and analyzing online content is now a critical part of business, research, and academic work. To meet that need, Beautiful Soup - a Python library - has become increasingly popular among data scientists, researchers, and business professionals.

## 🤔 What is Beautiful Soup?

Beautiful Soup is a Python library for web scraping: it pulls data out of HTML and XML files. It builds a parse tree for each page it parses and provides methods to navigate, search, and modify that tree in order to extract the data you need.
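
As a quick, illustrative sketch (the HTML fragment below is made up for the example), a few lines are enough to build a parse tree, navigate it, and search it:

```python
from bs4 import BeautifulSoup

# A tiny, made-up HTML fragment standing in for a downloaded page.
html = "<html><body><h1>Health care</h1><p>First paragraph.</p><p>Second paragraph.</p></body></html>"

soup = BeautifulSoup(html, 'html.parser')    # build the parse tree
print(soup.h1.text)                          # navigate: "Health care"
print([p.text for p in soup.find_all('p')])  # search: ['First paragraph.', 'Second paragraph.']
```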

## 🚀 Powerful Features of Beautiful Soup

The Beautiful Soup library offers an array of features that make it the ultimate tool for web scraping and data extraction, including:

- Ability to extract data from HTML/XML files
- Powerful search capabilities to navigate the parse tree and locate specific tags and elements
- Ability to handle badly formatted HTML/XML (see the short sketch after this list)
- Ability to convert XML to a tree-based structure
- Wide range of output formats
- Supports common web parsing libraries like html5lib and lxml
- Free and open-source library
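
For instance, here is a small sketch of how the library repairs malformed markup; the broken fragment is invented for illustration, and lxml and html5lib are optional third-party parsers that can be installed and passed in place of 'html.parser':

```python
from bs4 import BeautifulSoup

# Deliberately malformed markup: none of the tags are closed.
broken = "<div><p>Health data is <b>everywhere"

soup = BeautifulSoup(broken, 'html.parser')
print(soup.b.text)       # 'everywhere' -- the unclosed tags still end up in the tree
print(soup.prettify())   # the repaired document, with closing tags filled in

# If installed, 'lxml' or 'html5lib' can be passed instead of 'html.parser'
# for faster parsing or stricter, browser-like error recovery.
```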

## 💼 Applications of Beautiful Soup

Beautiful Soup has a wide range of applications, including:

- Data mining
- Web scraping
- Information extraction
- Research and analysis
- Content management
- Data journalism
- Competitive intelligence

## 🤖 Program Demonstrating the Power of Beautiful Soup

The recently developed Python program demonstrates how Beautiful Soup can be used to analyze content from Wikipedia pages and WHO's official website on Coronavirus. 
The program uses various Beautiful Soup functions to scrape data from these websites and generate insights.

## 🔥 Why Choose Beautiful Soup?

Beautiful Soup is a user-friendly library for treating the internet as a dataset. 
Its powerful search features, tolerance of badly formatted HTML/XML, and support for multiple parsers and output formats make it a go-to tool for web scraping and data extraction.

## 🚀 About the Developers

The program was developed by a team of data scientists and web developers who specialize in web scraping and data analysis, with AI-assisted augmentation. 
They are passionate about using technology to make data analysis more accessible to everyone.
    
    """)

if __name__ == '__main__':
    main()