File size: 2,434 Bytes
91710f3
fae8400
91710f3
 
cf20733
 
fae8400
91710f3
 
 
 
 
 
 
 
 
 
95fcb96
91710f3
95fcb96
91710f3
fae8400
 
 
 
 
 
 
 
cf20733
 
 
 
 
 
fae8400
 
 
 
 
 
 
 
 
 
 
91710f3
cf20733
91710f3
 
 
 
 
 
fae8400
95fcb96
cf20733
fae8400
 
 
91710f3
 
cf20733
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

# Pages to scrape and analyze. Despite the helper name `scrape_wikipedia`,
# the last entry is a WHO page — the scraper is generic HTTP, not Wikipedia-specific.
urls = ['https://en.wikipedia.org/wiki/Health_care',
        'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
        'https://www.who.int/health-topics/coronavirus#tab=tab_1']

def scrape_wikipedia(url):
    """Fetch *url* and measure how long the request took.

    Returns a dict with keys:
        'url'           -- the requested URL (echoed back)
        'response_time' -- elapsed seconds for the GET, or None on failure
        'content'       -- raw response body as bytes; b"" on failure

    Network errors are reported via the None/b"" sentinel values rather
    than raised, so callers can keep iterating over a URL list.
    """
    try:
        start_time = time.time()
        # timeout prevents the app hanging forever on an unresponsive host
        response = requests.get(url, timeout=10)
        end_time = time.time()
        return {'url': url, 'response_time': end_time - start_time, 'content': response.content}
    except requests.RequestException:
        # b"" (not "") so the failure path matches the bytes type of the
        # success path -- callers do content.decode() / BeautifulSoup(content).
        return {'url': url, 'response_time': None, 'content': b""}

def plot_word_frequencies(content):
    """Render a Plotly treemap of word frequencies in *content* to Streamlit.

    *content* is raw HTML (bytes or str). Text is taken from <p> and
    <h1>-<h6> elements only; "words" are whitespace-separated tokens
    (no lowercasing or punctuation stripping). The treemap is grouped
    by word length, sized by frequency.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    # Counter.most_common() replaces the hand-rolled dict + sorted() pass;
    # same (word, freq) pairs in descending-frequency order.
    word_counts = Counter(text.split()).most_common()
    df = pd.DataFrame({'word': [word for word, _ in word_counts],
                       'freq': [freq for _, freq in word_counts],
                       'len': [len(word) for word, _ in word_counts]})
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)

def display_top_words(content):
    """Write the 10 most frequent terms in *content* to Streamlit.

    *content* is raw HTML (bytes or str). Terms come from CountVectorizer's
    default tokenization (lowercased, stop-like single chars dropped) over
    the text of <p> and <h1>-<h6> elements.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    # CountVectorizer raises ValueError ("empty vocabulary") on blank input,
    # e.g. when the scrape failed and content is b"".
    if not text.strip():
        st.write("Top words: ")
        return
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement (>= 1.0).
    terms = vectorizer.get_feature_names_out()
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")

def main():
    """Streamlit entry point: scrape each configured URL, then show its
    response time, raw content, a word-frequency treemap, and top terms."""
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for article_url in urls:
        st.write(f"Scraping {article_url}...")
        result = scrape_wikipedia(article_url)
        page_content = result['content']

        st.write(f"Response time: {result['response_time']}")
        st.write(f"Content: ")
        # Raw page shown inside a fenced block so markdown is not rendered.
        st.markdown(f"```{page_content.decode()}```")

        plot_word_frequencies(page_content)
        display_top_words(page_content)

# Run the Streamlit app only when executed as a script, not on import.
if __name__ == '__main__':
    main()