import gradio as gr
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import quote
import re
from selenium import webdriver
import os
def setup_session():
    # Shared HTTP session that retries transient gateway errors (502/503/504)
    # up to 5 times with exponential backoff before giving up.
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session
def generate_naver_search_url(query):
    # Build a Naver blog-tab search URL for the given keyword.
    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum"}
    params["query"] = query
    # Percent-encode each value so spaces, Hangul, and '&'/'=' in the keyword
    # do not break the query string.
    url = base_url + "&".join(f"{key}={quote(str(value))}" for key, value in params.items())
    return url
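# Illustrative output (assuming the quote() encoding above): for the query "파이썬"
# the function returns
#   https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC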
def crawl_naver_search_results(url):
    session = setup_session()
    response = session.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    i = 1
    # Each blog hit sits in an <li class="bx..."> whose detail_box/title_area block
    # holds the post title and its outbound link.
    for li in soup.find_all("li", class_=re.compile("bx.*")):
        for div in li.find_all("div", class_="detail_box"):
            for div2 in div.find_all("div", class_="title_area"):
                title = div2.text.strip()
                for a in div2.find_all("a", href=True):
                    link = a["href"]
                    results.append({"번호": i, "제목": title, "링크": link})
                    i += 1
    # Render the results as an HTML table (headers: number, title, link).
    html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
    for result in results[:10]:  # output only the first 10 results
        html_table += f"<tr><td>{result['번호']}</td><td>{result['제목']}</td><td>{result['링크']}</td></tr>"
    html_table += "</table>"
    return html_table
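# Rough shape of the returned markup (values are placeholders, not real results):
#   <table><tr><th>번호</th><th>제목</th><th>링크</th></tr>
#   <tr><td>1</td><td>(post title)</td><td>https://blog.naver.com/...</td></tr>
#   ...
#   </table>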
def get_blog_content(link):
    options = webdriver.ChromeOptions()
    # Disable JavaScript through Chrome prefs (the bare --disable-javascript
    # switch is ignored by recent Chrome builds).
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.javascript": 2}
    )
    # Headless/no-sandbox are typically required when Chrome runs inside a container.
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    os.environ["CHROMEDRIVER_PATH"] = "/usr/local/bin/chromedriver"
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()  # always release the browser, even if the page load fails
    # Collect the text paragraphs of the SmartEditor post body.
    content = ""
    for component in soup.find_all("div", class_="se-component se-text se-l-default"):
        for paragraph in component.find_all("p", class_="se-text-paragraph"):
            content += paragraph.text.strip() + "\n"
    return content
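# Standalone usage sketch (the URL is a placeholder; assumes Chrome plus a matching
# chromedriver are available on PATH or via Selenium Manager):
#   body_text = get_blog_content("https://blog.naver.com/<blog_id>/<post_id>")
#   print(body_text)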
# gr.Interface is used here as a context manager so that extra components can be
# attached below the auto-generated search UI.
with gr.Interface(
    fn=lambda query: crawl_naver_search_results(generate_naver_search_url(query)),
    inputs=gr.Textbox(label="키워드를 입력하세요"),  # "Enter a keyword"
    outputs=gr.HTML(label="크롤링된 제목과 링크 목록"),  # "Crawled titles and links"
    # Title/description: "Naver search title & link crawler" /
    # "Enter a search query to crawl titles and links from Naver search results"
    title="네이버 검색 제목과 링크 크롤러",
    description="검색 쿼리를 입력하여 네이버 검색 결과에서 제목과 링크를 크롤링합니다"
) as demo:
    # Second, manual step: paste one of the crawled links to fetch the post body.
    button = gr.Button("블로그 제목 가져오기")  # "Get the blog title"
    text_input = gr.Textbox(label="링크를 입력하세요")  # "Enter a link"
    text_output = gr.Textbox(label="블로그 제목")  # "Blog title"
    button.click(fn=get_blog_content, inputs=text_input, outputs=text_output)

demo.launch(share=True)
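# Quick check of the crawl step without the Gradio UI (a sketch; run it in a
# separate script or REPL rather than after launch()):
#   url = generate_naver_search_url("맛집")
#   print(crawl_naver_search_results(url))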