"""Streamlit app: scrape health-related pages and visualize word frequencies."""

import time
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

# Pages to scrape and analyze.
urls = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]


def scrape_wikipedia(url):
    """Fetch *url* and return a dict with 'url', 'response_time' (seconds), 'content' (bytes).

    On any request failure 'response_time' is None and 'content' is b"" —
    bytes, not str, so downstream `.decode()` / parsing still works.
    """
    try:
        start_time = time.time()
        # Timeout so a dead host can't hang the app forever.
        response = requests.get(url, timeout=30)
        end_time = time.time()
        return {
            'url': url,
            'response_time': end_time - start_time,
            'content': response.content,
        }
    except requests.RequestException:
        # Was a bare `except:` returning content="" (str) — that crashed
        # main()'s content.decode() on any failed fetch.
        return {'url': url, 'response_time': None, 'content': b""}


def _extract_text(content):
    """Return the visible text of all paragraphs and headings in *content* (HTML bytes)."""
    soup = BeautifulSoup(content, 'html.parser')
    elems = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    return ' '.join(elem.text for elem in elems)


def plot_word_frequencies(content):
    """Render a treemap of raw whitespace-token frequencies, grouped by word length."""
    text = _extract_text(content)
    # Counter.most_common() replaces the hand-rolled dict + sorted() pair.
    sorted_word_freq = Counter(text.split()).most_common()
    df = pd.DataFrame({
        'word': [word for word, _ in sorted_word_freq],
        'freq': [freq for _, freq in sorted_word_freq],
        'len': [len(word) for word, _ in sorted_word_freq],
    })
    fig = px.treemap(df, path=['len', 'word'], values='freq', color='len')
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    st.plotly_chart(fig)


def display_top_words(content):
    """Write the 10 most frequent CountVectorizer tokens to the page."""
    text = _extract_text(content)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new API
    # but keep a fallback for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    word_freq = X.toarray()[0]
    top_words = [terms[i] for i in word_freq.argsort()[-10:][::-1]]
    st.write(f"Top words: {', '.join(top_words)}")


def main():
    """Scrape each URL, show its raw content, and plot word-frequency insights."""
    st.set_page_config(layout='wide')
    st.title("List of Articles on Health Care")
    for url in urls:
        st.write(f"Scraping {url}...")
        scraped_data = scrape_wikipedia(url)
        st.write(f"Response time: {scraped_data['response_time']}")
        content = scraped_data['content']
        st.write("Content: ")
        # errors='replace' so a non-UTF-8 page can't crash the app.
        st.markdown(f"```{content.decode(errors='replace')}```")
        plot_word_frequencies(content)
        display_top_words(content)

    st.markdown("""
# 📢 Press Release: Beautiful Soup - Your Ultimate Tool to Treat Internet as Your Dataset

Mound, MN - In today's digital age, the internet has become the primary source of information, and analyzing online content has become a critical aspect of business, research, and academic activities. To make it possible, Beautiful Soup - a Python library - is becoming increasingly popular among data scientists, researchers, and business professionals.

## 🤔 What is Beautiful Soup?

Beautiful Soup is a Python library used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree for parsed pages that can be used to extract data from HTML. The library provides methods that can be used to navigate, search, and modify the parse tree.

## 🚀 Powerful Features of Beautiful Soup

The Beautiful Soup library offers an array of features that make it the ultimate tool for web scraping and data extraction, including:

- Ability to extract data from HTML/XML files
- Powerful search capabilities to navigate the parse tree and locate specific tags and elements
- Ability to handle badly formatted HTML/XML
- Ability to convert XML to a tree-based structure
- Wide range of output formats
- Supports common web parsing libraries like html5lib and lxml
- Free and open-source library

## 💼 Applications of Beautiful Soup

Beautiful Soup has a wide range of applications, including:

- Data mining
- Web scraping
- Information extraction
- Research and analysis
- Content management
- Data journalism
- Competitive intelligence

## 🤖 Program Demonstrating the Power of Beautiful Soup

The recently developed Python program demonstrates how Beautiful Soup can be used to analyze content from Wikipedia pages and WHO's official website on Coronavirus.

The program uses various Beautiful Soup functions to scrape data from these websites and generate insights.

## 🔥 Why Choose Beautiful Soup?

Beautiful Soup is a user-friendly library that offers unmatched capabilities to treat the internet as a dataset. Its powerful search capabilities, ability to handle badly formatted HTML/XML, and support for multiple output formats make it the go-to tool for web scraping and data extraction.

## 🚀 About the Developers

The program was developed by a team of data scientists and web developers who specialize in web scraping and data analysis and augmented using AI. They are passionate about using technology to make data analysis more accessible and easy for everyone.
""")


if __name__ == '__main__':
    main()