Spaces:
Sleeping
Sleeping
File size: 4,005 Bytes
9c3152a 16e9240 9c3152a 16e9240 e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a e37ebe5 9c3152a 16e9240 9c3152a 16e9240 9c3152a e37ebe5 16e9240 9c3152a 16e9240 9c3152a 16e9240 9c3152a d479e4e 16e9240 d479e4e 16e9240 9c3152a 16e9240 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
# Debug logging setup (goes to the console, not to the Gradio UI).
# NOTE: the original comment here was split across two physical lines by a
# copy/paste mangle, which made the file a SyntaxError; rejoined as one comment.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Pool of User-Agent strings covering several browsers/devices.
# One is chosen at random per request to make traffic look less bot-like.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 10; SM-G973N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36",
]
def scrape_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Desktop URLs are rewritten to the mobile host (``m.blog.naver.com``)
    because the mobile page uses the simpler ``se-*`` HTML structure this
    parser targets.

    Args:
        url: A Naver blog post URL (desktop or mobile form).

    Returns:
        A formatted string with the extracted title and body on success,
        or a human-readable error-message string if the HTTP request
        fails or raises.
    """
    logging.debug("์คํฌ๋ํ ์์: %s", url)

    # Rewrite a desktop URL to the mobile version of the page.
    if "m.blog.naver.com" not in url:
        new_url = url.replace("blog.naver.com", "m.blog.naver.com")
        logging.debug("URL์ ๋ชจ๋ฐ์ผ ๋ฒ์ ์ผ๋ก ๋ณํ: %s", new_url)
        url = new_url

    # Random 1-3 s delay before the request: reduces server load and
    # makes the traffic pattern look less like a bot.
    delay = random.uniform(1, 3)
    logging.debug("์์ฒญ ์ ๋๋ค ๋๋ ์ด: %.2f์ด", delay)
    time.sleep(delay)

    # Random User-Agent plus Referer / Accept-Language headers so the
    # request resembles a normal mobile-browser visit (basic anti-blocking).
    user_agent = random.choice(USER_AGENTS)
    headers = {
        "User-Agent": user_agent,
        "Referer": "https://m.blog.naver.com",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
    }
    logging.debug("์ค์ ๋ HTTP ํค๋: %s", headers)

    try:
        response = requests.get(url, headers=headers, timeout=10)
        logging.debug("HTTP ์์ฒญ ์๋ฃ - ์ํ ์ฝ๋: %s", response.status_code)
        if response.status_code != 200:
            logging.error("HTTP ์์ฒญ ์คํจ: ์ํ ์ฝ๋ %s", response.status_code)
            return f"HTTP ์์ฒญ ์คํจ: ์ํ ์ฝ๋ {response.status_code}"
    except Exception as e:
        # Deliberate best-effort: any request failure is reported to the UI
        # as an error string rather than crashing the Gradio app.
        logging.exception("HTTP ์์ฒญ ์ค ์์ธ ๋ฐ์")
        return f"HTTP ์์ฒญ ์ค ์์ธ ๋ฐ์: {e}"

    # Parse the returned HTML.
    soup = BeautifulSoup(response.text, "html.parser")
    logging.debug("HTML ํ์ฑ ์๋ฃ")

    # Title extraction (mobile-page HTML structure): the title lives in a
    # <p> or <span> inside the se-title-text module. Missing pieces fall
    # back to an empty string instead of raising.
    title_text = ""
    title_div = soup.find("div", class_="se-module se-module-text se-title-text")
    if title_div:
        title_tag = title_div.find(["p", "span"])
        if title_tag:
            title_text = title_tag.get_text(strip=True)
    logging.debug("์ถ์ถ๋ ์ ๋ชฉ: %s", title_text)

    # Body extraction (mobile-page HTML structure). A "\n" separator keeps
    # a line break between sibling elements of the container.
    body_div = soup.find("div", class_="se-main-container")
    body_text = body_div.get_text(separator="\n", strip=True) if body_div else ""
    logging.debug("์ถ์ถ๋ ๋ณธ๋ฌธ ๊ธธ์ด: %d", len(body_text))

    # Combine title and body into the single result string returned to the UI.
    result = f"์ ๋ชฉ: {title_text}\n\n๋ณธ๋ฌธ:\n{body_text}"
    # NOTE(review): the debug string below was split mid-character by the
    # paste mangle; rejoined on one line so the file parses.
    logging.debug("์ต์ข ๊ฒฐ๊ณผ ์์ฑ ์๋ฃ")
    return result
# Gradio interface wiring (modern Gradio component API).
# NOTE(review): the two label/description string literals below had been
# split mid-character across physical lines (a SyntaxError); each is
# rejoined onto a single line so the file parses.
iface = gr.Interface(
    fn=scrape_blog,
    inputs=gr.Textbox(label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์ ๋ ฅํ์ธ์."),
    outputs=gr.Textbox(label="์คํฌ๋ํ ๊ฒฐ๊ณผ"),
    title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ",
    description="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์ ๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ๋ด์ฉ์ ์คํฌ๋ํํ์ฌ ์ถ๋ ฅํฉ๋๋ค.",
)
# Script entry point: launch the Gradio app only when run directly.
# NOTE(review): a stray "|" residue character after this guard (left over
# from the web-page scrape this file came through) was removed — it was not
# valid Python.
if __name__ == "__main__":
    logging.debug("Gradio ์ธํฐํ์ด์ค ์คํ ์ค๋น")
    iface.launch()