# Hugging Face Spaces page residue (app status "Sleeping") — kept as a comment
# so the file remains valid Python.
# Standard library imports.
import logging
import random
import time

# Third-party imports (declared in the Space's requirements).
import gradio as gr
import requests
from bs4 import BeautifulSoup
# Debug logging configuration (output goes to the console, not the UI).
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Pool of User-Agent strings for several browsers/devices; one is chosen at
# random per request so traffic looks less bot-like.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 10; SM-G973N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36",
]
def _to_mobile_url(url):
    """Return the mobile (m.blog.naver.com) form of a Naver blog URL.

    URLs that already contain "m.blog.naver.com" are returned unchanged;
    URLs without "blog.naver.com" at all pass through untouched as well.
    """
    if "m.blog.naver.com" in url:
        return url
    mobile_url = url.replace("blog.naver.com", "m.blog.naver.com")
    logging.debug("URL을 모바일 버전으로 변환: %s", mobile_url)
    return mobile_url


def _extract_post(soup):
    """Extract (title, body) text from a parsed mobile Naver blog page.

    Both values fall back to "" when the expected SmartEditor markup
    (se-title-text / se-main-container) is absent.
    """
    title_text = ""
    title_div = soup.find("div", class_="se-module se-module-text se-title-text")
    if title_div:
        title_tag = title_div.find(["p", "span"])
        if title_tag:
            title_text = title_tag.get_text(strip=True)

    body_div = soup.find("div", class_="se-main-container")
    # Newline separator keeps each text element of the post on its own line.
    body_text = body_div.get_text(separator="\n", strip=True) if body_div else ""
    return title_text, body_text


def scrape_blog(url):
    """Scrape the title and body of a Naver blog post.

    Parameters
    ----------
    url : str
        Naver blog post URL, desktop or mobile form.

    Returns
    -------
    str
        "제목: <title>\\n\\n본문:\\n<body>" on success, or a Korean error
        message describing the HTTP failure.
    """
    logging.debug("스크래핑 시작: %s", url)
    url = _to_mobile_url(url)

    # Random 1-3 s delay before the request to reduce server load and
    # avoid bot detection.
    delay = random.uniform(1, 3)
    logging.debug("요청 전 랜덤 딜레이: %.2f초", delay)
    time.sleep(delay)

    # Random User-Agent plus mobile Referer / Accept-Language headers so the
    # request looks like a regular mobile-browser visit (anti-blocking).
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "https://m.blog.naver.com",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    }
    logging.debug("설정된 HTTP 헤더: %s", headers)

    try:
        response = requests.get(url, headers=headers, timeout=10)
        logging.debug("HTTP 요청 완료 - 상태 코드: %s", response.status_code)
        if response.status_code != 200:
            logging.error("HTTP 요청 실패: 상태 코드 %s", response.status_code)
            return f"HTTP 요청 실패: 상태 코드 {response.status_code}"
    except requests.exceptions.RequestException as e:
        # Narrowed from a bare ``except Exception``: only network/HTTP
        # errors are expected from requests.get here.
        logging.exception("HTTP 요청 중 예외 발생")
        return f"HTTP 요청 중 예외 발생: {e}"

    soup = BeautifulSoup(response.text, "html.parser")
    logging.debug("HTML 파싱 완료")

    title_text, body_text = _extract_post(soup)
    logging.debug("추출된 제목: %s", title_text)
    logging.debug("추출된 본문 길이: %d", len(body_text))

    result = f"제목: {title_text}\n\n본문:\n{body_text}"
    logging.debug("최종 결과 생성 완료")
    return result
# Gradio interface definition (current Gradio component API).
iface = gr.Interface(
    fn=scrape_blog,
    inputs=gr.Textbox(label="네이버 블로그 링크를 입력하세요."),
    outputs=gr.Textbox(label="스크래핑 결과"),
    title="네이버 블로그 스크래핑",
    description="네이버 블로그 링크를 입력하면 제목과 본문 내용을 스크래핑하여 출력합니다.",
)
# Script entry point: launch the Gradio app only when run directly.
if __name__ == "__main__":
    logging.debug("Gradio 인터페이스 실행 준비")
    iface.launch()