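"""Gradio app for a Hugging Face Space: crawls post titles and links from
Naver blog search results, and fetches a post's body text with Selenium."""
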
import os
import re
import urllib.parse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib3.util.retry import Retry  # requests.packages.urllib3 is a deprecated alias

def setup_session():
    """Create a requests session that retries transient 5xx server errors."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retries))
    return session

def generate_naver_search_url(query):
    """Build a Naver blog-tab search URL for the given keyword."""
    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    # urlencode percent-encodes the keyword so non-ASCII queries survive intact
    return base_url + urllib.parse.urlencode(params)

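# Example: generate_naver_search_url("파이썬") yields
# https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC
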
def crawl_naver_search_results(url):
    """Scrape result titles and links from a Naver search results page."""
    session = setup_session()
    response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    i = 1
    # Each result sits in an <li class="bx..."> containing a detail_box/title_area block
    for li in soup.find_all("li", class_=re.compile("bx.*")):
        for div in li.find_all("div", class_="detail_box"):
            for div2 in div.find_all("div", class_="title_area"):
                title = div2.text.strip()
                for a in div2.find_all("a", href=True):
                    link = a["href"]
                    results.append({"번호": i, "제목": title, "링크": link})
                    i += 1
    html_table = "<table><tr><th>번호</th><th>제목</th><th>링크</th></tr>"
    for result in results[:10]:  # render only the first 10 results
        html_table += f"<tr><td>{result['번호']}</td><td>{result['제목']}</td><td>{result['링크']}</td></tr>"
    html_table += "</table>"
    return html_table

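# The string returned above is raw HTML; the gr.HTML output component below renders it as a table.
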
def get_blog_content(link):
    """Open a blog post with Selenium and collect its SmartEditor paragraph text."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # no display on a server; "--disable-javascript" is not a real Chrome switch
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Setting the env var alone does nothing; pass the driver path through a Service object
    service = Service(os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver"))
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(link)
        # Naver blog posts are usually rendered inside an iframe named "mainFrame"
        try:
            driver.switch_to.frame("mainFrame")
        except Exception:
            pass
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = ""
        for component in soup.find_all("div", class_="se-component se-text se-l-default"):
            for paragraph in component.find_all("p", class_="se-text-paragraph"):
                content += paragraph.text.strip() + "\n"
        return content
    finally:
        driver.quit()  # always release the browser, even if parsing fails

with gr.Interface(
    fn=lambda query: crawl_naver_search_results(generate_naver_search_url(query)),
    inputs=gr.Textbox(label="키워드를 입력하세요"),
    outputs=gr.HTML(label="크롤링된 제목과 링크 목록"),
    title="네이버 검색 제목과 링크 크롤러",
    description="검색 쿼리를 입력하여 네이버 검색 결과에서 제목과 링크를 크롤링합니다",
) as demo:
    # Extra widgets appended below the Interface: paste a result link to fetch the post body.
    # (The function returns the post's body text, so the labels say 본문 rather than 제목.)
    text_input = gr.Textbox(label="링크를 입력하세요")
    text_output = gr.Textbox(label="블로그 본문")
    button = gr.Button("블로그 본문 가져오기")
    button.click(fn=get_blog_content, inputs=text_input, outputs=text_output)

demo.launch(share=True)
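# share=True also prints a temporary public *.gradio.live URL; drop the flag to serve locally only.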