File size: 2,488 Bytes
e7120e9
4508b3b
e7120e9
8ddf073
42b3f33
 
612c7fd
e7120e9
bee76fe
 
 
 
f025397
 
8ddf073
f025397
8ddf073
c485d4d
f025397
c485d4d
bee76fe
612c7fd
 
 
 
 
 
 
 
42b3f33
 
 
 
 
 
612c7fd
 
 
 
 
 
e6e9141
 
 
 
 
 
 
612c7fd
b4e1b44
42b3f33
bee76fe
 
f025397
8ddf073
 
 
4508b3b
c485d4d
42b3f33
612c7fd
 
f733d00
e7120e9
b4e1b44
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

# Pages to scrape and visualize. NOTE(review): the third entry is a WHO page,
# not Wikipedia, despite scrape_wikipedia's name — the scraper is generic HTTP.
urls = ['https://en.wikipedia.org/wiki/Health_care',
        'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
        'https://www.who.int/health-topics/coronavirus#tab=tab_1']

def scrape_wikipedia(url):
    """Fetch *url* and measure the request's wall-clock time.

    Args:
        url: The page URL to download.

    Returns:
        A dict with keys:
        - ``'url'``: the URL that was requested,
        - ``'response_time'``: elapsed seconds for the GET, or ``None`` on failure,
        - ``'content'``: raw response body as ``bytes`` (``b""`` on failure, so
          callers can uniformly call ``.decode()``).
    """
    try:
        start_time = time.time()
        # A timeout keeps one dead host from hanging the whole Streamlit page.
        response = requests.get(url, timeout=10)
        end_time = time.time()
        return {'url': url, 'response_time': end_time - start_time, 'content': response.content}
    except requests.RequestException:
        # Narrowed from a bare except; return empty *bytes* (the original
        # returned a str "", which crashed main()'s content.decode()).
        return {'url': url, 'response_time': None, 'content': b""}

def plot_word_frequencies(content):
    """Render a Plotly treemap of word frequencies in *content* (HTML bytes).

    Words are extracted from paragraph and heading tags, grouped first by word
    length and then by word, sized by raw occurrence count.

    Args:
        content: HTML document as bytes (or str) to parse with BeautifulSoup.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    # Counter replaces the hand-rolled dict-increment loop; most_common() yields
    # the same (word, freq) pairs sorted by descending count with a stable sort,
    # matching the original sorted(..., reverse=True) output exactly.
    sorted_word_freq = Counter(text.split()).most_common()
    df = pd.DataFrame({'word': [word for word, freq in sorted_word_freq],
                       'freq': [freq for word, freq in sorted_word_freq],
                       'len': [len(word) for word, freq in sorted_word_freq]})
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)

def display_top_words(content):
    """Write the 10 most frequent CountVectorizer terms of *content* to the app.

    Args:
        content: HTML document as bytes (or str) to parse with BeautifulSoup.

    Silently does nothing if the page yields an empty vocabulary.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # Empty vocabulary (page had no parseable text) — nothing to show.
        return
    try:
        # get_feature_names() was removed in scikit-learn 1.2; under the old
        # bare except this AttributeError made the function silently a no-op.
        terms = vectorizer.get_feature_names_out()
    except AttributeError:
        terms = vectorizer.get_feature_names()  # legacy scikit-learn < 1.0
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")

def main():
    """Streamlit entry point: scrape each URL, dump its body, plot word frequencies."""
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")

    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        st.write(f"Content: ")
        # content is bytes on success but may be str ("") if the fetch failed;
        # decode defensively (errors='replace') so a failed URL or a non-UTF-8
        # body doesn't crash the whole page with Attribute/UnicodeDecodeError.
        text = content.decode(errors='replace') if isinstance(content, bytes) else content
        st.markdown(f"```{text}```")

        plot_word_frequencies(content)
        # display_top_words(content)

if __name__ == '__main__':
    main()