Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,76 +1,2 @@
|
|
1 |
-
import html
import re

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
|
7 |
-
|
8 |
-
def setup_session():
    """Create a requests Session that retries transient gateway errors.

    Returns:
        requests.Session: A session whose HTTPS requests are retried up to
        5 times with exponential backoff when the server answers 502/503/504.
    """
    retry_policy = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry_policy)
    http = requests.Session()
    # Only https:// is mounted — all Naver URLs in this app use HTTPS.
    http.mount('https://', adapter)
    return http
|
13 |
-
|
14 |
-
def generate_naver_search_url(query):
    """Build a Naver blog-tab search URL for *query*.

    Args:
        query: Search term; may contain spaces, '&' or non-ASCII (Korean)
            characters.

    Returns:
        str: A fully percent-encoded search URL.
    """
    # Local stdlib import keeps this fix self-contained.
    from urllib.parse import urlencode

    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    # BUG FIX: the previous manual '&'.join produced broken URLs for any
    # query containing spaces or reserved characters; urlencode handles
    # percent-encoding (UTF-8) correctly.
    return base_url + urlencode(params)
|
20 |
-
|
21 |
-
def crawl_blog_content(url):
    """Fetch a (mobile) Naver blog post and return its main body text.

    Args:
        url: Blog post URL; mobile pages expose the 'se-main-container' div.

    Returns:
        str: The post text, or "" when the page cannot be fetched or the
        expected container div is absent (best-effort, as before).
    """
    session = setup_session()
    try:
        # BUG FIX: added a timeout so a hung server cannot stall the crawl
        # forever (requests has no default timeout).
        response = session.get(url, timeout=10)
    except requests.RequestException:
        # Network failure -> empty body, preserving the original
        # best-effort contract (but no longer via a bare `except:`).
        return ""
    soup = BeautifulSoup(response.text, "html.parser")
    container = soup.find("div", attrs={'class': 'se-main-container'})
    # find() returns None when the div is missing; the old code relied on a
    # bare except around the resulting AttributeError.
    return container.text if container is not None else ""
|
30 |
-
|
31 |
-
def crawl_naver_search_results(url):
    """Scrape a Naver blog-search results page and render up to 10 hits.

    Args:
        url: A Naver search URL (blog tab), e.g. from
            generate_naver_search_url().

    Returns:
        str: An HTML <table> with columns 번호/제목/링크/내용.
    """
    session = setup_session()
    # BUG FIX: timeout added (requests has no default timeout).
    response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    index = 1
    for li in soup.find_all("li", class_=re.compile("bx.*")):
        for detail in li.find_all("div", class_="detail_box"):
            for title_area in detail.find_all("div", class_="title_area"):
                title = title_area.text.strip()
                for anchor in title_area.find_all("a", href=True):
                    link = anchor["href"]
                    if "blog.naver" not in link:
                        continue
                    # The mobile site exposes the post body container.
                    link = link.replace("https://", "https://m.")
                    results.append({
                        "번호": index,
                        "제목": title,
                        "링크": link,
                        "내용": crawl_blog_content(link),
                    })
                    index += 1
                    # Early return replaces the old triple nested `break`.
                    if len(results) >= 10:
                        return _render_results_table(results)
    return _render_results_table(results)


def _render_results_table(results):
    """Render result dicts as a fixed-layout HTML table.

    SECURITY FIX: scraped blog text is untrusted; it is now escaped with
    html.escape before interpolation (the original inserted it raw, letting
    markup in blog content inject into the rendered page).
    """
    rows = []
    for result in results:
        title = html.escape(str(result["제목"]))
        link = html.escape(str(result["링크"]), quote=True)
        content = html.escape(str(result["내용"]))
        rows.append(
            f"<tr><td style='width: 10ch; word-wrap: break-word;'>{result['번호']}</td>"
            f"<td style='width: 30ch; word-wrap: break-word;'>{title}</td>"
            f"<td style='width: 20ch; word-wrap: break-word;'><a href='{link}'>{link}</a></td>"
            f"<td style='width: 50ch; word-wrap: break-word;'>{content}</td></tr>"
        )
    header = (
        "<table style='table-layout: fixed; width: 100%;'>"
        "<tr><th style='width: 10ch;'>번호</th><th style='width: 30ch;'>제목</th>"
        "<th style='width: 20ch;'>링크</th><th style='width: 50ch;'>내용</th></tr>"
    )
    # "".join avoids repeated string concatenation while building the table.
    return header + "".join(rows) + "</table>"
|
61 |
-
|
62 |
-
# NOTE(review): this State object is never used below, and gr.State()
# created outside a Blocks context looks like a mistake — confirm whether
# it can be removed. Kept as-is to preserve behavior.
results_memory = gr.State()

with gr.Blocks() as demo:
    gr.Markdown("# 네이버 검색 제목과 링크 크롤러")
    query = gr.Textbox(label="검색 쿼리", placeholder="검색어를 입력하세요")
    output = gr.HTML(label="검색 결과")

    def search_and_display_results(query):
        """Crawl Naver blog search for *query* and return the results table HTML."""
        search_url = generate_naver_search_url(query)
        return crawl_naver_search_results(search_url)

    # Pressing Enter in the textbox triggers the crawl and fills the HTML pane.
    query.submit(search_and_display_results, inputs=query, outputs=output)

demo.launch()
|
|
|
1 |
+
import os

# SECURITY(review): this stub executes arbitrary code taken from the APP
# environment variable at import time. It hides the real application source
# from the repository — a pattern commonly used to obfuscate malicious code
# on hosting platforms. The actual application code should be committed and
# audited instead of injected via the environment. Flagged, not removed.
_app_source = os.environ.get('APP')
if _app_source is None:
    # BUG FIX: os.environ.get() returns None when APP is unset, and
    # exec(None) raises an opaque TypeError. Fail with a clear message.
    raise RuntimeError("APP environment variable is not set; nothing to run")
exec(_app_source)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|