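"""Gradio app for a Hugging Face Space: crawls post titles and links from
Naver blog search results, and fetches a post's body text with Selenium."""
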
import os
import re
import urllib.parse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib3.util.retry import Retry  # requests.packages.urllib3 is a deprecated alias

def setup_session():
    """Create a requests session that retries transient 5xx server errors."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retries))
    return session

def generate_naver_search_url(query):
    """Build a Naver blog-tab search URL for the given keyword."""
    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    # urlencode percent-encodes the keyword so non-ASCII queries survive intact
    return base_url + urllib.parse.urlencode(params)

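# Example: generate_naver_search_url("파이썬") yields
# https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC
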
def crawl_naver_search_results(url):
    """Scrape result titles and links from a Naver search results page."""
    session = setup_session()
    response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    i = 1
    # Each result sits in an <li class="bx..."> containing a detail_box/title_area block
    for li in soup.find_all("li", class_=re.compile("bx.*")):
        for div in li.find_all("div", class_="detail_box"):
            for div2 in div.find_all("div", class_="title_area"):
                title = div2.text.strip()
                for a in div2.find_all("a", href=True):
                    link = a["href"]
                    results.append({"번호": i, "제목": title, "링크": link})
                    i += 1
    html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
    for result in results[:10]:  # render only the first 10 results
        html_table += f"<tr><td>{result['번호']}</td><td>{result['제목']}</td><td>{result['링크']}</td></tr>"
    html_table += "</table>"
    return html_table

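# The string returned above is raw HTML; the gr.HTML output component below renders it as a table.
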
def get_blog_content(link):
    """Open a blog post with Selenium and collect its SmartEditor paragraph text."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # no display on a server; "--disable-javascript" is not a real Chrome switch
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Setting the env var alone does nothing; pass the driver path through a Service object
    service = Service(os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver"))
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(link)
        # Naver blog posts are usually rendered inside an iframe named "mainFrame"
        try:
            driver.switch_to.frame("mainFrame")
        except Exception:
            pass
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = ""
        for component in soup.find_all("div", class_="se-component se-text se-l-default"):
            for paragraph in component.find_all("p", class_="se-text-paragraph"):
                content += paragraph.text.strip() + "\n"
        return content
    finally:
        driver.quit()  # always release the browser, even if parsing fails

with gr.Interface(
    fn=lambda query: crawl_naver_search_results(generate_naver_search_url(query)),
    inputs=gr.Textbox(label="키워드를 입력하세요"),
    outputs=gr.HTML(label="크롤링된 제목과 링크 목록"),
    title="네이버 검색 제목과 링크 크롤러",
    description="검색 쿼리를 입력하여 네이버 검색 결과에서 제목과 링크를 크롤링합니다",
) as demo:
    # Extra widgets appended below the Interface: paste a result link to fetch the post body.
    # (The function returns the post's body text, so the labels say 본문 rather than 제목.)
    text_input = gr.Textbox(label="링크를 입력하세요")
    text_output = gr.Textbox(label="블로그 본문")
    button = gr.Button("블로그 본문 가져오기")
    button.click(fn=get_blog_content, inputs=text_input, outputs=text_output)

demo.launch(share=True)
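# share=True also prints a temporary public *.gradio.live URL; drop the flag to serve locally only.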