awacke1 committed on
Commit
fae8400
·
1 Parent(s): db910f7

Update backup-app.py

Browse files
Files changed (1) hide show
  1. backup-app.py +32 -2
backup-app.py CHANGED
@@ -1,6 +1,9 @@
1
  import requests
 
2
  import streamlit as st
3
  import time
 
 
4
 
5
  urls = ['https://en.wikipedia.org/wiki/Health_care',
6
  'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
@@ -15,6 +18,29 @@ def scrape_wikipedia(url):
15
  except:
16
  return {'url': url, 'response_time': None, 'content': ""}
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def main():
19
  st.title("List of Articles on Health Care")
20
 
@@ -22,8 +48,12 @@ def main():
22
  st.write(f"Scraping {url}...")
23
  scraped_data = scrape_wikipedia(url)
24
  st.write(f"Response time: {scraped_data['response_time']}")
 
25
  st.write(f"Content: ")
26
- st.text(scraped_data['content'])
 
 
 
27
 
28
  if __name__ == '__main__':
29
- main()
 
1
  import requests
2
+ from bs4 import BeautifulSoup
3
  import streamlit as st
4
  import time
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
 
8
  urls = ['https://en.wikipedia.org/wiki/Health_care',
9
  'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
 
18
  except:
19
  return {'url': url, 'response_time': None, 'content': ""}
20
 
21
def plot_word_frequencies(content):
    """Render a bar chart of the 10 most frequent words in an HTML document.

    Parameters
    ----------
    content : str or bytes
        Raw HTML markup; visible text is taken from the paragraph and
        heading tags (``p``, ``h1``-``h6``).

    The chart is drawn on its own figure and handed to Streamlit explicitly.
    The original drew on matplotlib's implicit global figure and called
    ``st.pyplot()`` with no argument (deprecated in Streamlit), so each
    per-URL call stacked new bars onto the same shared figure.
    """
    from collections import Counter  # stdlib; one-pass frequency count

    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    # Counter.most_common(10) returns (word, count) pairs already sorted
    # by descending count — same result as the manual dict + sorted().
    top = Counter(text.split()).most_common(10)
    top_words = [word for word, _ in top]
    counts = [count for _, count in top]
    # Fresh figure per call so repeated invocations don't accumulate bars.
    fig, ax = plt.subplots()
    ax.bar(top_words, counts)
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)
    plt.close(fig)  # release the figure so memory doesn't grow per URL
33
+
34
def display_top_words(content):
    """Write the 10 highest-frequency terms of an HTML document to Streamlit.

    Parameters
    ----------
    content : str or bytes
        Raw HTML markup; visible text is taken from the paragraph and
        heading tags (``p``, ``h1``-``h6``).
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; get_feature_names_out() is the supported replacement.
    terms = vectorizer.get_feature_names_out()
    word_freq = X.toarray()[0]
    # argsort is ascending: take the last 10 indices, reverse for desc order.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
43
+
44
  def main():
45
  st.title("List of Articles on Health Care")
46
 
 
48
  st.write(f"Scraping {url}...")
49
  scraped_data = scrape_wikipedia(url)
50
  st.write(f"Response time: {scraped_data['response_time']}")
51
+ content = scraped_data['content']
52
  st.write(f"Content: ")
53
+ st.write(content.decode(), unsafe_allow_html=True)
54
+
55
+ plot_word_frequencies(content)
56
+ display_top_words(content)
57
 
58
  if __name__ == '__main__':
59
+ main()