Spaces:
Sleeping
Sleeping
File size: 4,005 Bytes
9c3152a 16e9240 9c3152a 16e9240 e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a e37ebe5 9c3152a 16e9240 9c3152a 16e9240 9c3152a e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a d479e4e 16e9240 d479e4e 16e9240 9c3152a 16e9240 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
# Debug logging setup (goes to the console, not to the Gradio UI).
# NOTE: the original comment here was split across two physical lines by a
# copy/paste mangle, which made the file a SyntaxError; rejoined as one comment.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Pool of User-Agent strings covering several browsers/devices.
# One is chosen at random per request to make traffic look less bot-like.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 10; SM-G973N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36",
]
def scrape_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Desktop URLs are rewritten to the mobile host (``m.blog.naver.com``)
    because the mobile page uses the simpler ``se-*`` HTML structure this
    parser targets.

    Args:
        url: A Naver blog post URL (desktop or mobile form).

    Returns:
        A formatted string with the extracted title and body on success,
        or a human-readable error-message string if the HTTP request
        fails or raises.
    """
    logging.debug("์คํฌ๋ํ ์์: %s", url)

    # Rewrite a desktop URL to the mobile version of the page.
    if "m.blog.naver.com" not in url:
        new_url = url.replace("blog.naver.com", "m.blog.naver.com")
        logging.debug("URL์ ๋ชจ๋ฐ์ผ ๋ฒ์ ์ผ๋ก ๋ณํ: %s", new_url)
        url = new_url

    # Random 1-3 s delay before the request: reduces server load and
    # makes the traffic pattern look less like a bot.
    delay = random.uniform(1, 3)
    logging.debug("์์ฒญ ์ ๋๋ค ๋๋ ์ด: %.2f์ด", delay)
    time.sleep(delay)

    # Random User-Agent plus Referer / Accept-Language headers so the
    # request resembles a normal mobile-browser visit (basic anti-blocking).
    user_agent = random.choice(USER_AGENTS)
    headers = {
        "User-Agent": user_agent,
        "Referer": "https://m.blog.naver.com",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
    }
    logging.debug("์ค์ ๋ HTTP ํค๋: %s", headers)

    try:
        response = requests.get(url, headers=headers, timeout=10)
        logging.debug("HTTP ์์ฒญ ์๋ฃ - ์ํ ์ฝ๋: %s", response.status_code)
        if response.status_code != 200:
            logging.error("HTTP ์์ฒญ ์คํจ: ์ํ ์ฝ๋ %s", response.status_code)
            return f"HTTP ์์ฒญ ์คํจ: ์ํ ์ฝ๋ {response.status_code}"
    except Exception as e:
        # Deliberate best-effort: any request failure is reported to the UI
        # as an error string rather than crashing the Gradio app.
        logging.exception("HTTP ์์ฒญ ์ค ์์ธ ๋ฐ์")
        return f"HTTP ์์ฒญ ์ค ์์ธ ๋ฐ์: {e}"

    # Parse the returned HTML.
    soup = BeautifulSoup(response.text, "html.parser")
    logging.debug("HTML ํ์ฑ ์๋ฃ")

    # Title extraction (mobile-page HTML structure): the title lives in a
    # <p> or <span> inside the se-title-text module. Missing pieces fall
    # back to an empty string instead of raising.
    title_text = ""
    title_div = soup.find("div", class_="se-module se-module-text se-title-text")
    if title_div:
        title_tag = title_div.find(["p", "span"])
        if title_tag:
            title_text = title_tag.get_text(strip=True)
    logging.debug("์ถ์ถ๋ ์ ๋ชฉ: %s", title_text)

    # Body extraction (mobile-page HTML structure). A "\n" separator keeps
    # a line break between sibling elements of the container.
    body_div = soup.find("div", class_="se-main-container")
    body_text = body_div.get_text(separator="\n", strip=True) if body_div else ""
    logging.debug("์ถ์ถ๋ ๋ณธ๋ฌธ ๊ธธ์ด: %d", len(body_text))

    # Combine title and body into the single result string returned to the UI.
    result = f"์ ๋ชฉ: {title_text}\n\n๋ณธ๋ฌธ:\n{body_text}"
    # NOTE(review): the debug string below was split mid-character by the
    # paste mangle; rejoined on one line so the file parses.
    logging.debug("์ต์ข ๊ฒฐ๊ณผ ์์ฑ ์๋ฃ")
    return result
# Gradio interface wiring (modern Gradio component API).
# NOTE(review): the two label/description string literals below had been
# split mid-character across physical lines (a SyntaxError); each is
# rejoined onto a single line so the file parses.
iface = gr.Interface(
    fn=scrape_blog,
    inputs=gr.Textbox(label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์ ๋ ฅํ์ธ์."),
    outputs=gr.Textbox(label="์คํฌ๋ํ ๊ฒฐ๊ณผ"),
    title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ",
    description="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์ ๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ๋ด์ฉ์ ์คํฌ๋ํํ์ฌ ์ถ๋ ฅํฉ๋๋ค.",
)
# Script entry point: launch the Gradio app only when run directly.
# NOTE(review): a stray "|" residue character after this guard (left over
# from the web-page scrape this file came through) was removed — it was not
# valid Python.
if __name__ == "__main__":
    logging.debug("Gradio ์ธํฐํ์ด์ค ์คํ ์ค๋น")
    iface.launch()