"""Gradio app that crawls Naver Cafe board listings into Excel and an HTML table."""

import os
import html
import logging
import random
import time
import datetime
from datetime import datetime  # NOTE: intentionally rebinds `datetime` to the class (kept after `import datetime`)
from urllib.parse import urljoin

import gradio as gr
import pandas as pd
import pytz
import requests
import xlsxwriter
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Time zone used for both the output filename and "today" in the date column.
KST = pytz.timezone("Asia/Seoul")

# Prefix used to turn article hrefs into absolute links.
CAFE_ORIGIN = "https://cafe.naver.com"

# Seconds to wait for each board page before giving up (connect + read).
REQUEST_TIMEOUT = 15

# Radio choice that switches the app to the user-supplied URL.
CUSTOM_BOARD = "직접입력"

# Sentinel returned by get_base_url() for unknown boards (kept for backward compatibility).
INVALID_BOARD = "Invalid board selected"

# A legacy URL that is rewritten to the list-style URL the parser understands.
LEGACY_URL_PREFIX = "https://cafe.naver.com/f-e/cafes/25729954/menus/186"
LEGACY_URL_REPLACEMENT = "https://cafe.naver.com/ArticleList.nhn?search.clubid=25729954&search.menuid=751&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=25729954"

BOARD_URLS = {
    "맘이베베": "https://cafe.naver.com/ArticleList.nhn?search.clubid=29434212&search.menuid=2&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=29434212",
    "맘스홀릭": "https://cafe.naver.com/ArticleList.nhn?search.clubid=10094499&search.menuid=599&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=10094499",
    "광주맘": "https://cafe.naver.com/ArticleList.nhn?search.clubid=26025763&search.menuid=508&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=26025763",
    "쇼핑지름신": "https://cafe.naver.com/ArticleList.nhn?search.clubid=25729954&search.menuid=751&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=25729954",
    "부산맘": "https://cafe.naver.com/ArticleList.nhn?search.clubid=28707025&search.menuid=282&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=28707025",
    "진희맘": "https://cafe.naver.com/ArticleList.nhn?search.clubid=21442290&search.menuid=476&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=21442290",
}

# NOTE(review): the table/error markup below was reconstructed; the tags were
# missing from the reviewed source. Confirm against the deployed version.
HTML_TABLE_HEAD = """
<table>
<thead>
<tr><th>제목</th><th>작성일</th><th>조회수</th><th>좋아요</th><th>댓글수</th></tr>
</thead>
<tbody>
"""

HTML_TABLE_TAIL = """
</tbody>
</table>
"""


class ProxyConfig:
    """Build ``requests`` proxy settings from environment variables."""

    def __init__(self):
        # Proxy credentials and endpoints are read from the environment.
        self.proxy_base = {
            "username": os.environ.get("PROXY_USERNAME"),
            "password": os.environ.get("PROXY_PASSWORD"),
            "host": os.environ.get("PROXY_HOST"),
            "ports": {
                "http": os.environ.get("PROXY_HTTP_PORT"),
                "socks5": os.environ.get("PROXY_SOCKS5_PORT")
            }
        }

    def get_proxy_config(self, use_socks=False):
        """Return a ``requests`` proxies mapping, or ``None`` if unavailable.

        Args:
            use_socks: Use the SOCKS5 port/protocol instead of plain HTTP.

        Returns:
            A dict mapping the *target* URL schemes (``http`` and ``https``)
            to the proxy URL, or ``None`` when any required environment
            variable is missing.
        """
        try:
            protocol = "socks5" if use_socks else "http"
            username = self.proxy_base["username"]
            password = self.proxy_base["password"]
            host = self.proxy_base["host"]
            port = self.proxy_base["ports"][protocol]
        except (KeyError, TypeError) as e:
            logger.error("[PROXY] Configuration failed: %s", e)
            return None

        # Without this guard an unset variable would yield a bogus URL such as
        # "http://None__cr.kr:None@None:None".
        if not all((username, password, host, port)):
            logger.warning("[PROXY] Incomplete proxy environment variables; proxy disabled")
            return None

        proxy_auth = f"{username}__cr.kr"
        proxy_url = f"{protocol}://{proxy_auth}:{password}@{host}:{port}"
        logger.info("[PROXY] Configuration created: %s://%s:%s", protocol, host, port)

        # requests selects a proxy by the scheme of the URL being fetched, so
        # both schemes must be listed for HTTPS pages to go through the proxy.
        return {"http": proxy_url, "https": proxy_url}


def setup_session():
    """Create a ``requests.Session`` with proxy, browser headers, and retries.

    If a proxy is configured, the outgoing IP is looked up once and logged;
    a failure of that lookup is logged and otherwise ignored.

    Returns:
        The configured ``requests.Session``.
    """
    session = requests.Session()

    # Proxy setup
    proxies = ProxyConfig().get_proxy_config(use_socks=False)
    if proxies:
        session.proxies.update(proxies)
        try:
            # Log the outgoing IP so proxy usage can be verified.
            ip_response = session.get('https://api.ipify.org?format=json', timeout=10)
            if ip_response.status_code == 200:
                logger.info("[PROXY] Current IP: %s", ip_response.json().get('ip'))
            else:
                logger.warning("[PROXY] Failed to get IP. Status code: %s", ip_response.status_code)
        except Exception as e:
            logger.error("[PROXY] IP check failed: %s", e)
    else:
        logger.warning("[PROXY] No proxy configuration available")

    # Retry policy
    retries = Retry(
        total=5,  # total number of retries
        backoff_factor=1,  # backoff multiplier between retries
        status_forcelist=[500, 502, 503, 504],  # HTTP status codes that trigger a retry
        allowed_methods=["GET", "HEAD", "OPTIONS"]  # HTTP methods that may be retried
    )

    # Default headers
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
        'DNT': '1'  # Do Not Track request
    })

    # One adapter shared by both schemes
    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=100,  # connection pool size
        pool_maxsize=100  # maximum connections per pool
    )
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    return session


def get_base_url(board_select):
    """Return the list URL for a known board name.

    Returns:
        The board URL, or the string ``"Invalid board selected"`` when the
        name is not one of the predefined boards.
    """
    selected_url = BOARD_URLS.get(board_select)
    if not selected_url:
        logger.warning("Invalid board selected: %s", board_select)
        return INVALID_BOARD
    return selected_url


def convert_views(view_string):
    """Convert a view-count label such as ``"1,234"`` or ``"1.2만"`` to an int.

    ``만`` means ten thousand. The product is rounded rather than truncated so
    that binary floating-point error cannot drop the count by one.

    Raises:
        ValueError: If the text is not numeric.
    """
    cleaned = view_string.strip().replace(",", "")
    if '만' in cleaned:
        number_part = cleaned.replace('만', '')
        return int(round(float(number_part) * 10000))
    return int(cleaned)


def validate_row_data(row_data):
    """Return True when the row contains the view, likes, and date cells."""
    required_fields = ('td_view', 'td_likes', 'td_date')
    return all(row_data.find('td', class_=field) is not None for field in required_fields)


def _error_html(message):
    """Wrap a user-facing error message in an escaped HTML fragment."""
    return f"<div>\n<p>{html.escape(str(message))}</p>\n</div>"


def _parse_article_row(row_data, current_date):
    """Extract one article from a listing ``<tr>``.

    Args:
        row_data: A BeautifulSoup ``tr`` tag from the article board.
        current_date: Date string substituted when the date cell holds a
            clock time (contains ``:``) instead of a date.

    Returns:
        A dict with ``title``, ``link``, ``date``, ``views``, ``likes`` and
        ``comments``; or ``None`` if the row is not an article row.

    Raises:
        AttributeError, ValueError: If a cell is missing or not numeric.
    """
    if not validate_row_data(row_data):
        return None

    a_tag = row_data.find('a', class_='article')
    if a_tag is None:
        return None
    href = a_tag.get('href')
    if not href:
        return None

    date = row_data.find('td', class_='td_date').get_text(strip=True)
    if ":" in date:
        date = current_date

    comments = 0
    comment_tag = row_data.find('a', class_='cmt')
    if comment_tag is not None:
        em_tag = comment_tag.find('em')
        if em_tag is not None:
            comments = int(em_tag.get_text(strip=True).replace(",", ""))

    return {
        "title": a_tag.get_text(strip=True),
        # urljoin copes with both relative and already-absolute hrefs.
        "link": urljoin(CAFE_ORIGIN + "/", href),
        "date": date,
        "views": convert_views(row_data.find('td', class_='td_view').get_text(strip=True)),
        "likes": int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", "")),
        "comments": comments,
    }


def _article_html_row(article):
    """Render one parsed article as an escaped HTML table row."""
    return (
        "<tr>"
        f"<td><a href=\"{html.escape(article['link'], quote=True)}\" target=\"_blank\">"
        f"{html.escape(article['title'])}</a></td>"
        f"<td>{html.escape(article['date'])}</td>"
        f"<td>{article['views']:,}</td>"
        f"<td>{article['likes']:,}</td>"
        f"<td>{article['comments']:,}</td>"
        "</tr>\n"
    )


def extract_data_to_excel_and_html(page, board_select, custom_url=""):
    """Crawl ``page`` listing pages and export them to Excel and HTML.

    Args:
        page: Number of pages to crawl (1-50). A float is accepted and
            truncated to an int.
        board_select: A predefined board name, or ``"직접입력"`` to use
            ``custom_url``.
        custom_url: Listing URL used when ``board_select`` is ``"직접입력"``.

    Returns:
        ``(filename, html_table)`` on success, or ``(None, error_html)`` when
        the input is invalid or the crawl fails as a whole. Pages or rows
        that fail individually are logged and skipped.
    """
    try:
        if not isinstance(page, (int, float)) or page < 1 or page > 50:
            return None, _error_html("페이지 수는 1-50 사이여야 합니다.")
        page = int(page)  # range() below rejects floats

        now_kst = datetime.now(KST)
        timestamp = now_kst.strftime("%Y%m%d_%H%M%S")
        # Use KST for "today" too, matching the filename timestamp.
        current_date = now_kst.strftime("%Y.%m.%d")

        url_input = (custom_url or "").strip()
        if board_select == CUSTOM_BOARD and url_input:
            # Rewrite the known legacy URL to the currently supported one.
            if url_input.startswith(LEGACY_URL_PREFIX):
                url_input = LEGACY_URL_REPLACEMENT
            base_url = url_input
            filename = f'custom_{timestamp}.xlsx'
        else:
            base_url = get_base_url(board_select)
            if base_url == INVALID_BOARD:
                # Return None as the file so the caller does not mistake the
                # message for a file path.
                return None, _error_html(INVALID_BOARD)
            filename = f'{board_select}_{timestamp}.xlsx'

        # A custom URL may lack a query string; choose the separator accordingly.
        separator = "&" if "?" in base_url else "?"
        html_rows = []

        # Context managers close the session and write the workbook on every path.
        with setup_session() as session, xlsxwriter.Workbook(filename) as workbook:
            worksheet = workbook.add_worksheet()

            # Excel cell styles
            header_format = workbook.add_format({
                'bold': True,
                'align': 'center',
                'valign': 'vcenter',
                'bg_color': '#f8f9fa',
                'border': 1
            })
            link_format = workbook.add_format({
                'align': 'left',
                'valign': 'vcenter',
                'border': 1,
                'font_color': '#0066cc',
                'underline': True
            })
            date_format = workbook.add_format({
                'align': 'center',
                'valign': 'vcenter',
                'border': 1
            })
            number_format = workbook.add_format({
                'align': 'center',
                'valign': 'vcenter',
                'border': 1,
                'num_format': '#,##0'
            })

            # Header row
            headers = ['제목', '작성일', '조회수', '좋아요', '댓글수']
            for col, header in enumerate(headers):
                worksheet.write(0, col, header, header_format)
            worksheet.autofilter(0, 0, 0, len(headers) - 1)

            row = 1
            for p in range(1, page + 1):
                try:
                    url = f"{base_url}{separator}search.page={p}"
                    logger.info("[CRAWL] Fetching page %s: %s", p, url)

                    response = session.get(url, timeout=REQUEST_TIMEOUT)
                    # Small random delay between page requests.
                    time.sleep(random.uniform(0.5, 1.0))

                    if response.status_code != 200:
                        logger.error("[CRAWL] Failed to fetch page %s. Status code: %s", p, response.status_code)
                        continue

                    soup = BeautifulSoup(response.text, 'html.parser')
                    article_boards = soup.find_all('div', class_='article-board m-tcol-c')
                    # The second matching board is the one that gets parsed.
                    if len(article_boards) < 2:
                        logger.warning("[CRAWL] No article boards found on page %s", p)
                        continue

                    rows = article_boards[1].find_all('tr')
                    logger.info("[CRAWL] Found %s rows on page %s", len(rows), p)

                    for row_data in rows:
                        try:
                            article = _parse_article_row(row_data, current_date)
                        except (AttributeError, KeyError, ValueError) as e:
                            # One malformed row must not abort the rest of the page.
                            logger.warning("[CRAWL] Row parsing error: %s", e)
                            continue
                        if article is None:
                            continue

                        worksheet.write_url(row, 0, article["link"], link_format, article["title"])
                        worksheet.write(row, 1, article["date"], date_format)
                        worksheet.write_number(row, 2, article["views"], number_format)
                        worksheet.write_number(row, 3, article["likes"], number_format)
                        worksheet.write_number(row, 4, article["comments"], number_format)

                        html_rows.append(_article_html_row(article))
                        row += 1

                except Exception as e:
                    # Best effort: log the failed page and continue with the next.
                    logger.error("[CRAWL] Page %s crawling error: %s", p, e)
                    continue

            worksheet.set_column(0, 0, 50)
            worksheet.set_column(1, 1, 12)
            worksheet.set_column(2, 2, 10)
            worksheet.set_column(3, 3, 10)
            worksheet.set_column(4, 4, 10)

        html_output = HTML_TABLE_HEAD + "".join(html_rows) + HTML_TABLE_TAIL
        return filename, html_output

    except Exception as e:
        error_message = f"데이터 수집 중 오류가 발생했습니다. 잠시 후 다시 시도해 주세요. (에러: {str(e)})"
        logger.exception("[CRAWL] 전체 크롤링 실패: %s", e)
        return None, _error_html(error_message)


def crawl_with_progress(board, pages, custom_url):
    """Gradio click handler: run the crawl and report a status string.

    Returns:
        ``(excel_path_or_None, html, status_text)``. On failure the error
        HTML produced by the crawler is passed through for display.
    """
    try:
        excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
        if excel_file:
            return excel_file, html_output, "수집 완료"
        return None, html_output or "", "수집 실패"
    except Exception as e:
        return None, "", f"오류 발생: {str(e)}"


def update_custom_url_visibility(selected):
    """Show the custom-URL textbox only when "직접입력" is selected."""
    return gr.update(visible=(selected == CUSTOM_BOARD))


css = """
/* 전체 컨테이너 스타일링 */
.gradio-container {
    font-family: 'Pretendard', -apple-system, BlinkMacSystemFont, system-ui, Roboto, sans-serif !important;
    max-width: 1000px !important;
    margin: 2rem auto !important;
    padding: 2rem !important;
    background-color: #ffffff !important;
    box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24) !important;
    border-radius: 12px !important;
}
/* 제목 스타일링 */
h1 {
    font-size: 2.2rem !important;
    font-weight: 700 !important;
    color: #000000 !important;
    text-align: center !important;
    margin-bottom: 2rem !important;
    padding-bottom: 1.5rem !important;
    border-bottom: 2px solid #000000 !important;
}
/* 설명 텍스트 스타일링 */
.gr-markdown {
    text-align: center !important;
    color: #666666 !important;
    font-size: 1rem !important;
    margin-bottom: 2rem !important;
}
/* 라디오 버튼 그룹 스타일링 */
.gr-form {
    background-color: #f8f8f8 !important;
    padding: 1.5rem !important;
    border-radius: 8px !important;
    margin-bottom: 1.5rem !important;
}
.gr-radio-row {
    display: grid !important;
    grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)) !important;
    gap: 1rem !important;
    padding: 1rem !important;
}
.gr-radio {
    border: 2px solid #000000 !important;
    padding: 0.8rem !important;
    border-radius: 6px !important;
    transition: all 0.3s ease !important;
}
.gr-radio:checked {
    background-color: #000000 !important;
    color: #ffffff !important;
}
/* 숫자 입력 필드 스타일링 */
.gr-number-input {
    border: 2px solid #000000 !important;
    border-radius: 6px !important;
    padding: 0.8rem !important;
    font-size: 1rem !important;
    width: 100% !important;
    max-width: 300px !important;
    margin: 0 auto !important;
}
/* 상태 텍스트박스 스타일링 */
.gr-textbox {
    background-color: #f8f8f8 !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 6px !important;
    padding: 1rem !important;
    margin: 1rem 0 !important;
    font-size: 0.95rem !important;
}
/* 수집 버튼 스타일링 */
.gr-button {
    background-color: #000000 !important;
    color: #ffffff !important;
    padding: 1rem 2rem !important;
    border-radius: 6px !important;
    font-weight: 600 !important;
    font-size: 1.1rem !important;
    border: none !important;
    width: 100% !important;
    max-width: 300px !important;
    margin: 1.5rem auto !important;
    display: block !important;
    transition: all 0.3s ease !important;
}
.gr-button:hover {
    background-color: #333333 !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
}
/* 파일 다운로드 영역 스타일링 */
.gr-file {
    border: 2px dashed #000000 !important;
    border-radius: 8px !important;
    padding: 2rem !important;
    text-align: center !important;
    background-color: #f8f8f8 !important;
    margin-top: 2rem !important;
}
/* HTML 결과 테이블 스타일링 */
table {
    width: 100% !important;
    border-collapse: collapse !important;
    margin-top: 1.5rem !important;
    border-radius: 8px !important;
    overflow: hidden !important;
    box-shadow: 0 1px 3px rgba(0,0,0,0.12) !important;
}
th {
    background-color: #000000 !important;
    color: #ffffff !important;
    padding: 1rem !important;
    text-align: center !important;
    font-weight: 600 !important;
}
td {
    padding: 0.8rem !important;
    border-bottom: 1px solid #e0e0e0 !important;
    color: #333333 !important;
}
tr:hover {
    background-color: #f5f5f5 !important;
}
/* 반응형 디자인 */
@media (max-width: 768px) {
    .gradio-container {
        padding: 1rem !important;
        margin: 1rem !important;
    }
    h1 {
        font-size: 1.8rem !important;
    }
    .gr-radio-row {
        grid-template-columns: 1fr !important;
    }
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# N사 Cafe 핫딜 게시판 크롤링")
    gr.Markdown("""
    페이지 수를 입력하거나, 게시판 선택 시 '직접입력'을 선택하면 직접 URL을 입력할 수 있습니다.
    (최대 페이지수는 50페이지 입니다.)
    """)

    with gr.Row():
        board_select = gr.Radio(
            choices=["맘이베베", "맘스홀릭", "광주맘", "쇼핑지름신", "부산맘", "진희맘", "직접입력"],
            label="게시판을 선택하세요",
            container=True
        )

    with gr.Row():
        inp = gr.Number(
            label="수집할 페이지 수 (최대 50페이지)",
            value=1,
            minimum=1,
            maximum=50,
            container=True
        )

    with gr.Row():
        custom_url = gr.Textbox(
            label="직접 링크 입력 (옵션)",
            placeholder="예: https://cafe.naver.com/ArticleList.nhn?...",
            visible=False,  # hidden by default
            container=True
        )

    # Toggle the custom URL textbox whenever the board selection changes.
    board_select.change(fn=update_custom_url_visibility, inputs=board_select, outputs=custom_url)

    status = gr.Textbox(
        label="상태",
        value="대기 중...",
        container=True
    )

    btn = gr.Button("수집하기", variant="primary")
    output_file = gr.File(label="엑셀파일 다운로드")
    output_html = gr.HTML()

    btn.click(
        fn=crawl_with_progress,
        inputs=[board_select, inp, custom_url],
        outputs=[output_file, output_html, status]
    )

if __name__ == "__main__":
    demo.launch()