awacke1 committed
Commit f025397 · 1 Parent(s): c36fabe

Update app.py

Files changed (1)
  1. app.py +17 -27
app.py CHANGED
@@ -1,43 +1,33 @@
  import requests
  from bs4 import BeautifulSoup
  import streamlit as st
- import asyncio
- import concurrent.futures

  urls = ['https://en.wikipedia.org/wiki/Health_care',
          'https://en.wikipedia.org/wiki/Health_information_on_the_Internet',
          'https://www.who.int/health-topics/coronavirus#tab=tab_1']

- async def scrape_wikipedia(url):
-     response = requests.get(url)
-     soup = BeautifulSoup(response.content, 'html.parser')
-     div_element = soup.find('div', {'class': 'div-col columns column-width'})
-     if div_element is not None:
-         articles_list = div_element.find_all('li')
-     else:
-         articles_list = []
-     return articles_list
-
- async def scrape_urls(urls):
-     tasks = []
-     loop = asyncio.new_event_loop()
-     asyncio.set_event_loop(loop)
-     async with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
-         for url in urls:
-             tasks.append(loop.run_in_executor(executor, scrape_wikipedia, url))
-         await asyncio.gather(*tasks)
-     return tasks
+ def scrape_wikipedia(url):
+     try:
+         response = requests.get(url)
+         soup = BeautifulSoup(response.content, 'html.parser')
+         div_element = soup.find('div', {'class': 'div-col columns column-width'})
+         if div_element is not None:
+             articles_list = div_element.find_all('li')
+         else:
+             articles_list = []
+         return articles_list
+     except:
+         st.write(f"Error scraping {url}")
+         return []

  def main():
      st.title("List of Articles on Health Care")

-     loop = asyncio.get_event_loop()
-     tasks = loop.run_until_complete(scrape_urls(urls))
-
      data = []
-     for task in tasks:
-         for article in task.result():
-             data.append({'url': task.result().index(article), 'article': article.text})
+     for url in urls:
+         articles_list = scrape_wikipedia(url)
+         for article in articles_list:
+             data.append({'url': urls.index(url), 'article': article.text})

      st.write('## Dataset')
      st.dataframe(data)
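
Note that on both sides of the diff the file defines main() but never calls it in the lines shown, so the Streamlit page renders nothing as committed. A minimal sketch of the entry point one would typically append when running the app with `streamlit run app.py`; the guard below is an assumption, not part of commit f025397:

# Hypothetical addition, not in commit f025397: invoke main() when the
# script is executed (e.g. via `streamlit run app.py`, where __name__ is "__main__").
if __name__ == "__main__":
    main()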