awacke1 committed
Commit 612c7fd · 1 Parent(s): 4508b3b

Update app.py

Files changed (1)
  1. app.py +29 -4
app.py CHANGED
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import streamlit as st
 import time
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import CountVectorizer
 
 urls = ['https://en.wikipedia.org/wiki/Health_care',
         'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
@@ -16,6 +18,29 @@ def scrape_wikipedia(url):
     except:
         return {'url': url, 'response_time': None, 'content': ""}
 
+def plot_word_frequencies(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+    words = text.split()
+    word_freq = {}
+    for word in words:
+        word_freq[word] = word_freq.get(word, 0) + 1
+    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+    top_words = [word for word, freq in sorted_word_freq[:10]]
+    plt.bar(top_words, [word_freq[word] for word in top_words])
+    plt.xticks(rotation=45)
+    st.pyplot()
+
+def display_top_words(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+    vectorizer = CountVectorizer()
+    X = vectorizer.fit_transform([text])
+    terms = vectorizer.get_feature_names()
+    word_freq = X.toarray()[0]
+    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
+    st.write(f"Top words: {', '.join(top_words)}")
+
 def main():
     st.title("List of Articles on Health Care")
 
@@ -24,11 +49,11 @@ def main():
         scraped_data = scrape_wikipedia(url)
         st.write(f"Response time: {scraped_data['response_time']}")
         content = scraped_data['content']
-        if b'\r\n' not in content:
-            soup = BeautifulSoup(content, 'html.parser')
-            content = soup.prettify().encode()
         st.write(f"Content: ")
-        st.text_area("", content.decode(), height=200, max_chars=None, key=None, help=None, return_streamlit=False, value=None, on_change=None, args=None, kwargs=None)
+        st.write(content.decode(), unsafe_allow_html=True)
+
+        plot_word_frequencies(content)
+        display_top_words(content)
 
 if __name__ == '__main__':
     main()
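
One compatibility note on the helpers this commit adds: the diff does not pin package versions, and two of the calls it relies on have changed upstream. CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out(), and calling st.pyplot() with no figure argument is deprecated in recent Streamlit releases. Below is a minimal sketch of version-tolerant equivalents, assuming newer library releases; the function names and the paragraph/heading text extraction follow the commit, while the html_to_text helper and the explicit Matplotlib figure are illustrative additions, not part of the committed app.py.

# Minimal sketch, not part of the commit: version-tolerant variants of the new helpers.
# Assumes scikit-learn >= 1.2 and a recent Streamlit release.
import matplotlib.pyplot as plt
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def html_to_text(content):
    # Same extraction as the commit: join paragraph and heading text.
    soup = BeautifulSoup(content, 'html.parser')
    return ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))

def plot_word_frequencies(content):
    words = html_to_text(content).split()
    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    top_words = sorted(word_freq, key=word_freq.get, reverse=True)[:10]
    fig, ax = plt.subplots()  # draw on an explicit figure instead of pyplot's global state
    ax.bar(top_words, [word_freq[w] for w in top_words])
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)  # passing the figure avoids the bare st.pyplot() deprecation warning

def display_top_words(content):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([html_to_text(content)])
    terms = vectorizer.get_feature_names_out()  # replaces get_feature_names(), removed in scikit-learn 1.2
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")

If the Space pins older releases that still provide get_feature_names(), the committed code runs as written and this sketch is unnecessary.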