awacke1 committed on
Commit
fae8400
·
1 Parent(s): db910f7

Update backup-app.py

Browse files
Files changed (1) hide show
  1. backup-app.py +32 -2
backup-app.py CHANGED
@@ -1,6 +1,9 @@
1
  import requests
 
2
  import streamlit as st
3
  import time
 
 
4
 
5
  urls = ['https://en.wikipedia.org/wiki/Health_care',
6
  'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
@@ -15,6 +18,29 @@ def scrape_wikipedia(url):
15
  except:
16
  return {'url': url, 'response_time': None, 'content': ""}
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def main():
19
  st.title("List of Articles on Health Care")
20
 
@@ -22,8 +48,12 @@ def main():
22
  st.write(f"Scraping {url}...")
23
  scraped_data = scrape_wikipedia(url)
24
  st.write(f"Response time: {scraped_data['response_time']}")
 
25
  st.write(f"Content: ")
26
- st.text(scraped_data['content'])
 
 
 
27
 
28
  if __name__ == '__main__':
29
- main()
 
1
  import requests
2
+ from bs4 import BeautifulSoup
3
  import streamlit as st
4
  import time
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
 
8
  urls = ['https://en.wikipedia.org/wiki/Health_care',
9
  'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
 
18
  except:
19
  return {'url': url, 'response_time': None, 'content': ""}
20
 
21
def plot_word_frequencies(content):
    """Render a bar chart of the 10 most frequent words in an HTML document.

    Parameters
    ----------
    content : str or bytes
        Raw HTML markup; visible text is taken from the paragraph and
        heading tags (``p``, ``h1``-``h6``).

    The chart is drawn on its own figure and handed to Streamlit explicitly.
    The original drew on matplotlib's implicit global figure and called
    ``st.pyplot()`` with no argument (deprecated in Streamlit), so each
    per-URL call stacked new bars onto the same shared figure.
    """
    from collections import Counter  # stdlib; one-pass frequency count

    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    # Counter.most_common(10) returns (word, count) pairs already sorted
    # by descending count — same result as the manual dict + sorted().
    top = Counter(text.split()).most_common(10)
    top_words = [word for word, _ in top]
    counts = [count for _, count in top]
    # Fresh figure per call so repeated invocations don't accumulate bars.
    fig, ax = plt.subplots()
    ax.bar(top_words, counts)
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)
    plt.close(fig)  # release the figure so memory doesn't grow per URL
33
+
34
def display_top_words(content):
    """Write the 10 highest-frequency terms of an HTML document to Streamlit.

    Parameters
    ----------
    content : str or bytes
        Raw HTML markup; visible text is taken from the paragraph and
        heading tags (``p``, ``h1``-``h6``).
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = ' '.join(elem.text for elem in soup.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; get_feature_names_out() is the supported replacement.
    terms = vectorizer.get_feature_names_out()
    word_freq = X.toarray()[0]
    # argsort is ascending: take the last 10 indices, reverse for desc order.
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")
43
+
44
  def main():
45
  st.title("List of Articles on Health Care")
46
 
 
48
  st.write(f"Scraping {url}...")
49
  scraped_data = scrape_wikipedia(url)
50
  st.write(f"Response time: {scraped_data['response_time']}")
51
+ content = scraped_data['content']
52
  st.write(f"Content: ")
53
+ st.write(content.decode(), unsafe_allow_html=True)
54
+
55
+ plot_word_frequencies(content)
56
+ display_top_words(content)
57
 
58
  if __name__ == '__main__':
59
+ main()