AIRider committed on
Commit
eb256b7
·
verified ·
1 Parent(s): e1604fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -23
app.py CHANGED
@@ -1,16 +1,9 @@
1
  import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from requests.adapters import HTTPAdapter
5
- from requests.packages.urllib3.util.retry import Retry
6
  import re
7
- from selenium import webdriver
8
- import os
9
 
10
  def setup_session():
11
- session = requests.Session()
12
- retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
13
- session.mount('https://', HTTPAdapter(max_retries=retries))
14
  return session
15
 
16
  def generate_naver_search_url(query):
@@ -23,15 +16,15 @@ def generate_naver_search_url(query):
23
  def crawl_naver_search_results(url):
24
  session = setup_session()
25
  response = session.get(url)
26
- soup = BeautifulSoup(response.text, "html.parser")
27
  results = []
28
  i = 1
29
- for li in soup.find_all("li", class_=re.compile("bx.*")):
30
- for div in li.find_all("div", class_="detail_box"):
31
- for div2 in div.find_all("div", class_="title_area"):
32
  title = div2.text.strip()
33
- for a in div2.find_all("a", href=True):
34
- link = a["href"]
35
  results.append({"번호": i, "제목": title, "링크": link})
36
  i += 1
37
  html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
@@ -41,15 +34,12 @@ def crawl_naver_search_results(url):
41
  return html_table
42
 
43
  def get_blog_content(link):
44
- options = webdriver.ChromeOptions()
45
- options.add_argument("--disable-javascript")
46
- os.environ["CHROMEDRIVER_PATH"] = "/usr/local/bin/chromedriver"
47
- driver = webdriver.Chrome(options=options)
48
- driver.get(link)
49
- soup = BeautifulSoup(driver.page_source, "html.parser")
50
  title = ""
51
- for component in soup.find_all("div", class_="se-component se-text se-l-default"):
52
- for paragraph in component.find_all("p", class_="se-text-paragraph"):
53
  title += paragraph.text.strip() + "\n"
54
  return title
55
 
 
1
  import gradio as gr
2
+ from requests_html import HTMLSession
 
 
 
3
  import re
 
 
4
 
5
def setup_session():
    """Build the HTTP session used by the crawler.

    Returns a fresh ``requests_html.HTMLSession`` so each crawl gets its
    own connection pool and JS-rendering capability.
    """
    return HTMLSession()
8
 
9
  def generate_naver_search_url(query):
 
16
  def crawl_naver_search_results(url):
17
  session = setup_session()
18
  response = session.get(url)
19
+ response.html.render()
20
  results = []
21
  i = 1
22
+ for li in response.html.find("li.bx"):
23
+ for div in li.find("div.detail_box"):
24
+ for div2 in div.find("div.title_area"):
25
  title = div2.text.strip()
26
+ for a in div2.find("a", href=True):
27
+ link = a.attrs["href"]
28
  results.append({"번호": i, "제목": title, "링크": link})
29
  i += 1
30
  html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
 
34
  return html_table
35
 
36
  def get_blog_content(link):
37
+ session = setup_session()
38
+ response = session.get(link)
39
+ response.html.render()
 
 
 
40
  title = ""
41
+ for component in response.html.find("div.se-component.se-text.se-l-default"):
42
+ for paragraph in component.find("p.se-text-paragraph"):
43
  title += paragraph.text.strip() + "\n"
44
  return title
45