Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,16 +1,9 @@
|
|
1 |
import gradio as gr
|
2 |
-
import requests
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
from requests.adapters import HTTPAdapter
|
5 |
-
from requests.packages.urllib3.util.retry import Retry
|
6 |
import re
|
7 |
-
from selenium import webdriver
|
8 |
-
import os
|
9 |
|
10 |
def setup_session():
|
11 |
-
session = requests.Session()
|
12 |
-
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
|
13 |
-
session.mount('https://', HTTPAdapter(max_retries=retries))
|
14 |
return session
|
15 |
|
16 |
def generate_naver_search_url(query):
|
@@ -23,15 +16,15 @@ def generate_naver_search_url(query):
|
|
23 |
def crawl_naver_search_results(url):
|
24 |
session = setup_session()
|
25 |
response = session.get(url)
|
26 |
-
|
27 |
results = []
|
28 |
i = 1
|
29 |
-
for li in
|
30 |
-
for div in li.
|
31 |
-
for div2 in div.
|
32 |
title = div2.text.strip()
|
33 |
-
for a in div2.
|
34 |
-
link = a["href"]
|
35 |
results.append({"번호": i, "제목": title, "링크": link})
|
36 |
i += 1
|
37 |
html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
|
@@ -41,15 +34,12 @@ def crawl_naver_search_results(url):
|
|
41 |
return html_table
|
42 |
|
43 |
def get_blog_content(link):
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
driver = webdriver.Chrome(options=options)
|
48 |
-
driver.get(link)
|
49 |
-
soup = BeautifulSoup(driver.page_source, "html.parser")
|
50 |
title = ""
|
51 |
-
for component in
|
52 |
-
for paragraph in component.
|
53 |
title += paragraph.text.strip() + "\n"
|
54 |
return title
|
55 |
|
|
|
1 |
import gradio as gr
|
2 |
+
from requests_html import HTMLSession
|
|
|
|
|
|
|
3 |
import re
|
|
|
|
|
4 |
|
5 |
def setup_session():
|
6 |
+
session = HTMLSession()
|
|
|
|
|
7 |
return session
|
8 |
|
9 |
def generate_naver_search_url(query):
|
|
|
16 |
def crawl_naver_search_results(url):
|
17 |
session = setup_session()
|
18 |
response = session.get(url)
|
19 |
+
response.html.render()
|
20 |
results = []
|
21 |
i = 1
|
22 |
+
for li in response.html.find("li.bx"):
|
23 |
+
for div in li.find("div.detail_box"):
|
24 |
+
for div2 in div.find("div.title_area"):
|
25 |
title = div2.text.strip()
|
26 |
+
for a in div2.find("a", href=True):
|
27 |
+
link = a.attrs["href"]
|
28 |
results.append({"번호": i, "제목": title, "링크": link})
|
29 |
i += 1
|
30 |
html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
|
|
|
34 |
return html_table
|
35 |
|
36 |
def get_blog_content(link):
|
37 |
+
session = setup_session()
|
38 |
+
response = session.get(link)
|
39 |
+
response.html.render()
|
|
|
|
|
|
|
40 |
title = ""
|
41 |
+
for component in response.html.find("div.se-component.se-text.se-l-default"):
|
42 |
+
for paragraph in component.find("p.se-text-paragraph"):
|
43 |
title += paragraph.text.strip() + "\n"
|
44 |
return title
|
45 |
|