import asyncio
import concurrent.futures

import requests
import streamlit as st
from bs4 import BeautifulSoup

URLS = [
    'https://en.wikipedia.org/wiki/Health_care',
    'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
    'https://www.who.int/health-topics/coronavirus#tab=tab_1',
]


def scrape_page(url):
    """Fetch a page and return the <li> items from its article-list div, if any."""
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Wikipedia renders multi-column article lists in this div; pages without
    # it (e.g. the WHO page) simply yield an empty list.
    div_element = soup.find('div', {'class': 'div-col columns column-width'})
    if div_element is None:
        return []
    return div_element.find_all('li')


async def scrape_urls(urls):
    """Run the blocking scrapes concurrently in a thread pool and gather the results."""
    # requests is blocking, so the fetches must run in worker threads rather
    # than directly inside coroutines; run_in_executor bridges them into asyncio.
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = [loop.run_in_executor(executor, scrape_page, url) for url in urls]
        return await asyncio.gather(*futures)


def main():
    st.title("List of Articles on Health Care")
    results = asyncio.run(scrape_urls(URLS))

    # Pair each scraped <li> with the URL it came from.
    data = []
    for url, articles in zip(URLS, results):
        for article in articles:
            data.append({'url': url, 'article': article.get_text(strip=True)})

    st.write('## Dataset')
    st.dataframe(data)

    st.write('## Grid')
    st.write('url', 'article')
    for d in data:
        st.write(d['url'], d['article'])


if __name__ == '__main__':
    main()