# 5-3_N-blog / app.py — Naver blog scraper (Hugging Face Space; revision e37ebe5)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
# ๋””๋ฒ„๊น… ๋กœ๊ทธ ์„ค์ • (UI์— ์ถœ๋ ฅ๋˜์ง€ ์•Š๊ณ  ์ฝ˜์†”์— ์ถœ๋ ฅ)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# ๋‹ค์–‘ํ•œ ๋ธŒ๋ผ์šฐ์ €์˜ User-Agent ๋ฌธ์ž์—ด ๋ฆฌ์ŠคํŠธ
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Linux; Android 10; SM-G973N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36"
]
def scrape_blog(url):
logging.debug("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: %s", url)
# URL์ด ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์ด ์•„๋‹ ๊ฒฝ์šฐ ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์œผ๋กœ ๋ณ€ํ™˜
if "m.blog.naver.com" not in url:
new_url = url.replace("blog.naver.com", "m.blog.naver.com")
logging.debug("URL์„ ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์œผ๋กœ ๋ณ€ํ™˜: %s", new_url)
url = new_url
# ์š”์ฒญ ์ „์— 1~3์ดˆ ๋žœ๋ค ๋”œ๋ ˆ์ด ์ ์šฉ (์„œ๋ฒ„ ๋ถ€ํ•˜ ๋ฐ ๋ด‡ ๊ฐ์ง€ ํšŒํ”ผ)
delay = random.uniform(1, 3)
logging.debug("์š”์ฒญ ์ „ ๋žœ๋ค ๋”œ๋ ˆ์ด: %.2f์ดˆ", delay)
time.sleep(delay)
# ๋žœ๋ค User-Agent ์„ ํƒ ๋ฐ Referer, Accept-Language ์„ค์ • (์›น์ฐจ๋‹จ๋ฐฉ์ง€ ๊ธฐ๋Šฅ)
user_agent = random.choice(USER_AGENTS)
headers = {
"User-Agent": user_agent,
"Referer": "https://m.blog.naver.com", # ๋ชจ๋ฐ”์ผ ํŽ˜์ด์ง€์—์„œ ์˜จ ๊ฒƒ์ฒ˜๋Ÿผ ์„ค์ •
"Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
}
logging.debug("์„ค์ •๋œ HTTP ํ—ค๋”: %s", headers)
try:
response = requests.get(url, headers=headers, timeout=10)
logging.debug("HTTP ์š”์ฒญ ์™„๋ฃŒ - ์ƒํƒœ ์ฝ”๋“œ: %s", response.status_code)
if response.status_code != 200:
logging.error("HTTP ์š”์ฒญ ์‹คํŒจ: ์ƒํƒœ ์ฝ”๋“œ %s", response.status_code)
return f"HTTP ์š”์ฒญ ์‹คํŒจ: ์ƒํƒœ ์ฝ”๋“œ {response.status_code}"
except Exception as e:
logging.exception("HTTP ์š”์ฒญ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ")
return f"HTTP ์š”์ฒญ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ: {e}"
# HTML ํŒŒ์‹ฑ
soup = BeautifulSoup(response.text, "html.parser")
logging.debug("HTML ํŒŒ์‹ฑ ์™„๋ฃŒ")
# ์ œ๋ชฉ ์ถ”์ถœ (๋ชจ๋ฐ”์ผ ๋ฒ„์ „ HTML ๊ตฌ์กฐ ์‚ฌ์šฉ)
title_div = soup.find("div", class_="se-module se-module-text se-title-text")
if title_div:
title_tag = title_div.find(["p", "span"])
if title_tag:
title_text = title_tag.get_text(strip=True)
else:
title_text = ""
else:
title_text = ""
logging.debug("์ถ”์ถœ๋œ ์ œ๋ชฉ: %s", title_text)
# ๋ณธ๋ฌธ ์ถ”์ถœ (๋ชจ๋ฐ”์ผ ๋ฒ„์ „ HTML ๊ตฌ์กฐ ์‚ฌ์šฉ)
body_div = soup.find("div", class_="se-main-container")
if body_div:
# ๋ณธ๋ฌธ ๋‚ด ๋ชจ๋“  ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๋ฉฐ ๊ฐ ์š”์†Œ ์‚ฌ์ด์— ์ค„๋ฐ”๊ฟˆ ์ ์šฉ
body_text = body_div.get_text(separator="\n", strip=True)
else:
body_text = ""
logging.debug("์ถ”์ถœ๋œ ๋ณธ๋ฌธ ๊ธธ์ด: %d", len(body_text))
# ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ํ•˜๋‚˜์˜ ํ…์ŠคํŠธ๋กœ ๊ฒฐํ•ฉํ•˜์—ฌ ๋ฐ˜ํ™˜
result = f"์ œ๋ชฉ: {title_text}\n\n๋ณธ๋ฌธ:\n{body_text}"
logging.debug("์ตœ์ข… ๊ฒฐ๊ณผ ์ƒ์„ฑ ์™„๋ฃŒ")
return result
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ (์ตœ์‹  Gradio API ์‚ฌ์šฉ)
iface = gr.Interface(
fn=scrape_blog,
inputs=gr.Textbox(label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”."),
outputs=gr.Textbox(label="์Šคํฌ๋ž˜ํ•‘ ๊ฒฐ๊ณผ"),
title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํ•‘",
description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ ๋‚ด์šฉ์„ ์Šคํฌ๋ž˜ํ•‘ํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค."
)
if __name__ == "__main__":
logging.debug("Gradio ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰ ์ค€๋น„")
iface.launch()