awacke1 committed
Commit 612c7fd · 1 Parent(s): 4508b3b

Update app.py

Files changed (1)
  1. app.py +29 -4
app.py CHANGED
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import streamlit as st
 import time
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import CountVectorizer
 
 urls = ['https://en.wikipedia.org/wiki/Health_care',
         'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
@@ -16,6 +18,29 @@ def scrape_wikipedia(url):
     except:
         return {'url': url, 'response_time': None, 'content': ""}
 
+def plot_word_frequencies(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+    words = text.split()
+    word_freq = {}
+    for word in words:
+        word_freq[word] = word_freq.get(word, 0) + 1
+    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+    top_words = [word for word, freq in sorted_word_freq[:10]]
+    plt.bar(top_words, [word_freq[word] for word in top_words])
+    plt.xticks(rotation=45)
+    st.pyplot()
+
+def display_top_words(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    text = ' '.join([elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+    vectorizer = CountVectorizer()
+    X = vectorizer.fit_transform([text])
+    terms = vectorizer.get_feature_names()
+    word_freq = X.toarray()[0]
+    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
+    st.write(f"Top words: {', '.join(top_words)}")
+
 def main():
     st.title("List of Articles on Health Care")
 
@@ -24,11 +49,11 @@ def main():
         scraped_data = scrape_wikipedia(url)
         st.write(f"Response time: {scraped_data['response_time']}")
         content = scraped_data['content']
-        if b'\r\n' not in content:
-            soup = BeautifulSoup(content, 'html.parser')
-            content = soup.prettify().encode()
         st.write(f"Content: ")
-        st.text_area("", content.decode(), height=200, max_chars=None, key=None, help=None, return_streamlit=False, value=None, on_change=None, args=None, kwargs=None)
+        st.write(content.decode(), unsafe_allow_html=True)
+
+        plot_word_frequencies(content)
+        display_top_words(content)
 
 if __name__ == '__main__':
     main()
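
One compatibility note on the helpers this commit adds: the diff does not pin package versions, and two of the calls it relies on have changed upstream. CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out(), and calling st.pyplot() with no figure argument is deprecated in recent Streamlit releases. Below is a minimal sketch of version-tolerant equivalents, assuming newer library releases; the function names and the paragraph/heading text extraction follow the commit, while the html_to_text helper and the explicit Matplotlib figure are illustrative additions, not part of the committed app.py.

# Minimal sketch, not part of the commit: version-tolerant variants of the new helpers.
# Assumes scikit-learn >= 1.2 and a recent Streamlit release.
import matplotlib.pyplot as plt
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def html_to_text(content):
    # Same extraction as the commit: join paragraph and heading text.
    soup = BeautifulSoup(content, 'html.parser')
    return ' '.join(elem.text for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))

def plot_word_frequencies(content):
    words = html_to_text(content).split()
    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    top_words = sorted(word_freq, key=word_freq.get, reverse=True)[:10]
    fig, ax = plt.subplots()  # draw on an explicit figure instead of pyplot's global state
    ax.bar(top_words, [word_freq[w] for w in top_words])
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)  # passing the figure avoids the bare st.pyplot() deprecation warning

def display_top_words(content):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([html_to_text(content)])
    terms = vectorizer.get_feature_names_out()  # replaces get_feature_names(), removed in scikit-learn 1.2
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")

If the Space pins older releases that still provide get_feature_names(), the committed code runs as written and this sketch is unnecessary.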