File size: 4,005 Bytes
9c3152a
 
 
 
16e9240
 
 
 
 
9c3152a
16e9240
 
 
 
 
 
 
 
 
 
 
e37ebe5
 
 
 
 
 
16e9240
9c3152a
16e9240
9c3152a
 
16e9240
 
9c3152a
16e9240
e37ebe5
16e9240
9c3152a
16e9240
9c3152a
16e9240
 
 
 
 
 
 
 
 
9c3152a
16e9240
9c3152a
16e9240
9c3152a
e37ebe5
9c3152a
 
16e9240
 
 
 
 
9c3152a
16e9240
 
9c3152a
e37ebe5
16e9240
 
 
 
9c3152a
16e9240
 
9c3152a
16e9240
 
 
9c3152a
 
d479e4e
16e9240
 
d479e4e
 
16e9240
 
9c3152a
 
 
16e9240
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import requests
from bs4 import BeautifulSoup
import time
import random
import logging

# Debug logging setup: records are written to the console rather than shown
# in the UI. Each record carries a timestamp, its level name and the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# User-Agent strings from a variety of browsers; one of them is picked at
# random for each outgoing request.
USER_AGENTS = [
    # Chrome 115 on Windows 10 (desktop)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36",
    # Safari 15.1 on macOS 10.15.7 (desktop)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/15.1 Safari/605.1.15",
    # Safari 15.0 on iPhone, iOS 15.0 (mobile)
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/15.0 Mobile/15E148 Safari/604.1",
    # Chrome 115 on Android 10, SM-G973N (mobile)
    "Mozilla/5.0 (Linux; Android 10; SM-G973N) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Mobile Safari/537.36",
]

def scrape_blog(url):
    """Scrape the title and body text of a Naver blog post.

    The link is rewritten to its mobile (``m.blog.naver.com``) form when it is
    not one already, requested after a random 1-3 second pause with a User-Agent
    picked at random from ``USER_AGENTS``, and parsed with BeautifulSoup.

    Args:
        url: Link to the blog post. Surrounding whitespace is ignored and
            ``None`` is treated as an empty string.

    Returns:
        One string containing the extracted title followed by the body text.
        If the request raises, or the response status is not 200, a string
        describing the failure is returned instead. A title or body that is
        not found in the page is rendered as an empty string.
    """
    # Links pasted into a text box often carry stray spaces or a trailing
    # newline; normalise them so they are not sent as part of the request.
    url = (url or "").strip()
    logging.debug("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: %s", url)

    # Convert the link to the mobile version when it is not one already.
    if "m.blog.naver.com" not in url:
        # Rewrite only the first occurrence so that a "blog.naver.com" that
        # appears again later in the link (e.g. in a query parameter) is kept.
        new_url = url.replace("blog.naver.com", "m.blog.naver.com", 1)
        logging.debug("URL์„ ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์œผ๋กœ ๋ณ€ํ™˜: %s", new_url)
        url = new_url

    # Pause 1-3 seconds before the request (limits server load and makes the
    # traffic look less like a bot).
    delay = random.uniform(1, 3)
    logging.debug("์š”์ฒญ ์ „ ๋žœ๋ค ๋”œ๋ ˆ์ด: %.2f์ดˆ", delay)
    time.sleep(delay)

    # Random User-Agent plus Referer / Accept-Language headers, to reduce the
    # chance of the request being blocked.
    user_agent = random.choice(USER_AGENTS)
    headers = {
        "User-Agent": user_agent,
        "Referer": "https://m.blog.naver.com",  # look like navigation from the mobile site
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
    }
    logging.debug("์„ค์ •๋œ HTTP ํ—ค๋”: %s", headers)

    # This function feeds the UI directly, so every failure is logged and
    # reported back as text instead of being propagated.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        logging.debug("HTTP ์š”์ฒญ ์™„๋ฃŒ - ์ƒํƒœ ์ฝ”๋“œ: %s", response.status_code)
        if response.status_code != 200:
            logging.error("HTTP ์š”์ฒญ ์‹คํŒจ: ์ƒํƒœ ์ฝ”๋“œ %s", response.status_code)
            return f"HTTP ์š”์ฒญ ์‹คํŒจ: ์ƒํƒœ ์ฝ”๋“œ {response.status_code}"
    except Exception as e:
        logging.exception("HTTP ์š”์ฒญ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ")
        return f"HTTP ์š”์ฒญ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ: {e}"

    # Parse the HTML.
    soup = BeautifulSoup(response.text, "html.parser")
    logging.debug("HTML ํŒŒ์‹ฑ ์™„๋ฃŒ")

    # Title extraction (mobile page structure). A CSS selector matches the
    # three classes in any order and alongside extra classes, whereas a
    # multi-class string passed to ``class_`` only matches the exact
    # attribute value.
    title_text = ""
    title_div = soup.select_one("div.se-module.se-module-text.se-title-text")
    if title_div is not None:
        title_tag = title_div.find(["p", "span"])
        if title_tag is not None:
            title_text = title_tag.get_text(strip=True)
    logging.debug("์ถ”์ถœ๋œ ์ œ๋ชฉ: %s", title_text)

    # Body extraction (mobile page structure): collect all text inside the
    # main container, one line per text fragment.
    body_text = ""
    body_div = soup.find("div", class_="se-main-container")
    if body_div is not None:
        body_text = body_div.get_text(separator="\n", strip=True)
    logging.debug("์ถ”์ถœ๋œ ๋ณธ๋ฌธ ๊ธธ์ด: %d", len(body_text))

    # Combine title and body into a single text and return it.
    result = f"์ œ๋ชฉ: {title_text}\n\n๋ณธ๋ฌธ:\n{body_text}"
    logging.debug("์ตœ์ข… ๊ฒฐ๊ณผ ์ƒ์„ฑ ์™„๋ฃŒ")
    return result

# Gradio interface: one text box takes the blog link, scrape_blog processes
# it, and a second text box shows the scraped result.
iface = gr.Interface(
    title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํ•‘",
    description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ ๋‚ด์šฉ์„ ์Šคํฌ๋ž˜ํ•‘ํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.",
    fn=scrape_blog,
    inputs=gr.Textbox(label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”."),
    outputs=gr.Textbox(label="์Šคํฌ๋ž˜ํ•‘ ๊ฒฐ๊ณผ"),
)

def main():
    """Log that startup is beginning, then launch the Gradio interface."""
    logging.debug("Gradio ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰ ์ค€๋น„")
    iface.launch()


# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()