import os
import html
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import gradio as gr
import xlsxwriter
import logging
import time
import random
from datetime import datetime
import pytz
# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ProxyConfig:
def __init__(self):
# ν™˜κ²½ λ³€μˆ˜μ—μ„œ ν”„λ‘μ‹œ 정보λ₯Ό κ°€μ Έμ˜΅λ‹ˆλ‹€.
self.proxy_base = {
"username": os.environ.get("PROXY_USERNAME"),
"password": os.environ.get("PROXY_PASSWORD"),
"host": os.environ.get("PROXY_HOST"),
"ports": {
"http": os.environ.get("PROXY_HTTP_PORT"),
"socks5": os.environ.get("PROXY_SOCKS5_PORT")
}
}
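    # Expected environment variables (per the lookups above):
    #   PROXY_USERNAME, PROXY_PASSWORD, PROXY_HOST, PROXY_HTTP_PORT, PROXY_SOCKS5_PORT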
def get_proxy_config(self, use_socks=False):
"""ν”„λ‘μ‹œ μ„€μ • 생성"""
try:
username = self.proxy_base["username"]
password = self.proxy_base["password"]
host = self.proxy_base["host"]
port = self.proxy_base["ports"]["socks5" if use_socks else "http"]
            # Provider-specific auth format; the "__cr.kr" suffix looks like a KR routing flag
            proxy_auth = f"{username}__cr.kr"
protocol = "socks5" if use_socks else "http"
proxy_url = f"{protocol}://{proxy_auth}:{password}@{host}:{port}"
logger.info(f"[PROXY] Configuration created: {protocol}://{host}:{port}")
return {
protocol: proxy_url
}
except Exception as e:
logger.error(f"[PROXY] Configuration failed: {str(e)}")
return None
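# Illustrative sketch (assumed env values): with PROXY_USERNAME="user",
# PROXY_PASSWORD="pass", PROXY_HOST="proxy.example.com" and PROXY_HTTP_PORT="8080",
# ProxyConfig().get_proxy_config() returns:
#   {"http": "http://user__cr.kr:pass@proxy.example.com:8080"}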
def setup_session():
"""더 κ°•ν™”λœ μ„Έμ…˜ μ„€μ •"""
session = requests.Session()
# ν”„λ‘μ‹œ μ„€μ •
proxy_config = ProxyConfig()
proxies = proxy_config.get_proxy_config(use_socks=False)
if proxies:
session.proxies.update(proxies)
try:
# ν”„λ‘μ‹œ IP 확인
ip_response = session.get('https://api.ipify.org?format=json', timeout=10)
if ip_response.status_code == 200:
logger.info(f"[PROXY] Current IP: {ip_response.json().get('ip')}")
else:
logger.warning(f"[PROXY] Failed to get IP. Status code: {ip_response.status_code}")
except Exception as e:
logger.error(f"[PROXY] IP check failed: {str(e)}")
else:
logger.warning("[PROXY] No proxy configuration available")
# μž¬μ‹œλ„ μ„€μ •
retries = Retry(
total=5, # 총 μž¬μ‹œλ„ 횟수
backoff_factor=1, # μž¬μ‹œλ„ κ°„ λŒ€κΈ° μ‹œκ°„ κ³„μˆ˜
status_forcelist=[500, 502, 503, 504], # μž¬μ‹œλ„ν•  HTTP μƒνƒœ μ½”λ“œ
allowed_methods=["GET", "HEAD", "OPTIONS"] # μž¬μ‹œλ„ν•  HTTP λ©”μ„œλ“œ
)
    # Default browser-like headers
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0',
        'DNT': '1'  # Do Not Track
})
    # Mount a retrying adapter for both HTTP and HTTPS
    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=100,  # connection pool size
        pool_maxsize=100       # maximum number of pooled connections
    )
session.mount('https://', adapter)
session.mount('http://', adapter)
return session
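# Backoff sketch: with backoff_factor=1 the waits between attempts grow roughly
# exponentially (on the order of 1s, 2s, 4s, ...); the exact first-retry behavior
# varies across urllib3 versions and is capped by urllib3's internal maximum.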
def get_base_url(board_select):
urls = {
"맘이베베": "https://cafe.naver.com/ArticleList.nhn?search.clubid=29434212&search.menuid=2&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=29434212",
"λ§˜μŠ€ν™€λ¦­": "https://cafe.naver.com/ArticleList.nhn?search.clubid=10094499&search.menuid=599&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=10094499",
"κ΄‘μ£Όλ§˜": "https://cafe.naver.com/ArticleList.nhn?search.clubid=26025763&search.menuid=508&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=26025763",
"쇼핑지름신": "https://cafe.naver.com/ArticleList.nhn?search.clubid=25729954&search.menuid=751&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=25729954",
"λΆ€μ‚°λ§˜": "https://cafe.naver.com/ArticleList.nhn?search.clubid=28707025&search.menuid=282&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=28707025",
"μ§„ν¬λ§˜": "https://cafe.naver.com/ArticleList.nhn?search.clubid=21442290&search.menuid=476&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=21442290"
}
selected_url = urls.get(board_select)
    if not selected_url:
        logger.warning(f"Invalid board selected: {board_select}")
        return None
return selected_url
def convert_views(view_string):
if '만' in view_string:
number_part = view_string.replace('만', '')
return int(float(number_part) * 10000)
return int(view_string.replace(",", ""))
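# Examples: convert_views("1.2만") == 12000, convert_views("1,234") == 1234
# ('만' is the Korean counting unit for 10,000).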
def validate_row_data(row_data):
"""ν–‰ λ°μ΄ν„°μ˜ μœ νš¨μ„± 검사"""
required_fields = ['td_view', 'td_likes', 'td_date']
for field in required_fields:
if not row_data.find('td', class_=field):
return False
return True
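# Rows lacking these cells (e.g. board header or notice rows) are skipped by the crawler.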
def extract_data_to_excel_and_html(page, board_select, custom_url=""):
try:
        if not isinstance(page, (int, float)) or not 1 <= page <= 50:
            return None, "<p>The page count must be between 1 and 50.</p>"
        page = int(page)  # Gradio's Number component delivers a float; range() needs an int
session = setup_session()
        # When 'μ§μ ‘μž…λ ₯' (direct input) is selected, use the user-supplied URL
        if board_select == "μ§μ ‘μž…λ ₯" and custom_url.strip():
            url_input = custom_url.strip()
            # If a legacy URL was entered, rewrite it to the currently supported form
            if url_input.startswith("https://cafe.naver.com/f-e/cafes/25729954/menus/186"):
url_input = "https://cafe.naver.com/ArticleList.nhn?search.clubid=25729954&search.menuid=751&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=25729954"
base_url = url_input
filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
else:
base_url = get_base_url(board_select)
            if not base_url:
                return None, "<p>Invalid board selected.</p>"
filename = f'{board_select}_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
workbook = xlsxwriter.Workbook(filename)
worksheet = workbook.add_worksheet()
# Excel μŠ€νƒ€μΌ μ •μ˜
header_format = workbook.add_format({
'bold': True,
'align': 'center',
'valign': 'vcenter',
'bg_color': '#f8f9fa',
'border': 1
})
link_format = workbook.add_format({
'align': 'left',
'valign': 'vcenter',
'border': 1,
'font_color': '#0066cc',
'underline': True
})
date_format = workbook.add_format({
'align': 'center',
'valign': 'vcenter',
'border': 1
})
number_format = workbook.add_format({
'align': 'center',
'valign': 'vcenter',
'border': 1,
'num_format': '#,##0'
})
        # Write the header row
        headers = ['Title', 'Date', 'Views', 'Likes', 'Comments']
for col, header in enumerate(headers):
worksheet.write(0, col, header, header_format)
worksheet.autofilter(0, 0, 0, len(headers) - 1)
# HTML ν…Œμ΄λΈ” μ‹œμž‘
html_output = """
<style>
.crawl-table {
width: 100%;
border-collapse: collapse;
margin: 10px 0;
font-family: 'Pretendard', -apple-system, BlinkMacSystemFont, system-ui, Roboto, sans-serif;
}
.crawl-table thead th,
.crawl-table tr:first-child th {
background-color: #000000;
color: #ffffff;
border: 1px solid #dee2e6;
padding: 12px 8px;
font-weight: 600;
vertical-align: middle;
text-align: center !important;
}
.crawl-table td {
border: 1px solid #dee2e6;
padding: 10px 8px;
line-height: 1.4;
}
.crawl-table td:first-child {
text-align: left;
}
.crawl-table td:nth-child(2),
.crawl-table td:nth-child(3),
.crawl-table td:nth-child(4),
.crawl-table td:nth-child(5) {
text-align: right;
}
.crawl-table td:first-child a {
text-decoration: none;
color: #0066cc;
}
.crawl-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.crawl-table tr:hover {
background-color: #f0f0f0;
}
@media (max-width: 768px) {
.crawl-table {
font-size: 14px;
}
.crawl-table th,
.crawl-table td {
padding: 8px 4px;
}
}
</style>
<table class="crawl-table">
<thead>
<tr>
                    <th>Title</th>
                    <th>Date</th>
                    <th>Views</th>
                    <th>Likes</th>
                    <th>Comments</th>
</tr>
</thead>
<tbody>
"""
        row = 1  # Excel row cursor (row 0 holds the header)
        # Use Seoul time so "today" matches the cafe's timezone
        current_date = datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y.%m.%d")
for p in range(1, page + 1):
try:
url = f"{base_url}&search.page={p}"
logger.info(f"[CRAWL] Fetching page {p}: {url}")
                response = session.get(url, timeout=15)
                # Polite random delay between page fetches
                time.sleep(random.uniform(0.5, 1.0))
if response.status_code != 200:
logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
continue
soup = BeautifulSoup(response.text, 'html.parser')
article_boards = soup.find_all('div', class_='article-board m-tcol-c')
if len(article_boards) < 2:
logger.warning(f"[CRAWL] No article boards found on page {p}")
continue
                # The second board block holds the post list (the first is typically the notice area)
                article_board = article_boards[1]
rows = article_board.find_all('tr')
logger.info(f"[CRAWL] Found {len(rows)} rows on page {p}")
for row_data in rows:
try:
if not validate_row_data(row_data):
continue
a_tag = row_data.find('a', class_='article')
if not a_tag:
continue
link = a_tag['href']
title = a_tag.get_text(strip=True)
full_link = f"https://cafe.naver.com{link}"
views = convert_views(row_data.find('td', class_='td_view').get_text(strip=True))
likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
date = row_data.find('td', class_='td_date').get_text(strip=True)
comment_tag = row_data.find('a', class_='cmt')
                        comments = 0
                        if comment_tag and comment_tag.find('em'):
                            comments = int(comment_tag.find('em').get_text(strip=True).replace(",", ""))
if ":" in date:
date = current_date
worksheet.write_url(row, 0, full_link, link_format, title)
worksheet.write(row, 1, date, date_format)
worksheet.write_number(row, 2, views, number_format)
worksheet.write_number(row, 3, likes, number_format)
worksheet.write_number(row, 4, comments, number_format)
                        # Escape the title so markup in post titles cannot break the table
                        html_output += f"""                <tr>
                            <td><a href='{full_link}' target='_blank'>{html.escape(title)}</a></td>
<td>{date}</td>
<td>{views:,}</td>
<td>{likes:,}</td>
<td>{comments:,}</td>
</tr>
"""
row += 1
except AttributeError as e:
logger.warning(f"[CRAWL] Row parsing error: {str(e)}")
continue
except Exception as e:
logger.error(f"[CRAWL] Page {p} crawling error: {str(e)}")
continue
        # Column widths: title wide, date and numeric columns narrow
        worksheet.set_column(0, 0, 50)
worksheet.set_column(1, 1, 12)
worksheet.set_column(2, 2, 10)
worksheet.set_column(3, 3, 10)
worksheet.set_column(4, 4, 10)
workbook.close()
html_output += """ </tbody>
</table>"""
return filename, html_output
except Exception as e:
        error_message = f"An error occurred while collecting data. Please try again shortly. (error: {str(e)})"
        logger.error(f"[CRAWL] Crawl failed: {str(e)}")
return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
def crawl_with_progress(board, pages, custom_url):
try:
excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
if excel_file:
            return excel_file, html_output, "Collection complete"
        else:
            return None, "", "Collection failed"
    except Exception as e:
        return None, "", f"Error: {str(e)}"
def update_custom_url_visibility(selected):
# "μ§μ ‘μž…λ ₯" 선택 μ‹œ 직접 링크 μž…λ ₯ ν…μŠ€νŠΈλ°•μŠ€λ₯Ό 보이도둝 함
if selected == "μ§μ ‘μž…λ ₯":
return gr.update(visible=True)
else:
return gr.update(visible=False)
css = """
/* 전체 μ»¨ν…Œμ΄λ„ˆ μŠ€νƒ€μΌλ§ */
.gradio-container {
font-family: 'Pretendard', -apple-system, BlinkMacSystemFont, system-ui, Roboto, sans-serif !important;
max-width: 1000px !important;
margin: 2rem auto !important;
padding: 2rem !important;
background-color: #ffffff !important;
box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24) !important;
border-radius: 12px !important;
}
/* Title styling */
h1 {
font-size: 2.2rem !important;
font-weight: 700 !important;
color: #000000 !important;
text-align: center !important;
margin-bottom: 2rem !important;
padding-bottom: 1.5rem !important;
border-bottom: 2px solid #000000 !important;
}
/* Description text styling */
.gr-markdown {
text-align: center !important;
color: #666666 !important;
font-size: 1rem !important;
margin-bottom: 2rem !important;
}
/* λΌλ””μ˜€ λ²„νŠΌ κ·Έλ£Ή μŠ€νƒ€μΌλ§ */
.gr-form {
background-color: #f8f8f8 !important;
padding: 1.5rem !important;
border-radius: 8px !important;
margin-bottom: 1.5rem !important;
}
.gr-radio-row {
display: grid !important;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)) !important;
gap: 1rem !important;
padding: 1rem !important;
}
.gr-radio {
border: 2px solid #000000 !important;
padding: 0.8rem !important;
border-radius: 6px !important;
transition: all 0.3s ease !important;
}
.gr-radio:checked {
background-color: #000000 !important;
color: #ffffff !important;
}
/* Number input field styling */
.gr-number-input {
border: 2px solid #000000 !important;
border-radius: 6px !important;
padding: 0.8rem !important;
font-size: 1rem !important;
width: 100% !important;
max-width: 300px !important;
margin: 0 auto !important;
}
/* μƒνƒœ ν…μŠ€νŠΈλ°•μŠ€ μŠ€νƒ€μΌλ§ */
.gr-textbox {
background-color: #f8f8f8 !important;
border: 1px solid #e0e0e0 !important;
border-radius: 6px !important;
padding: 1rem !important;
margin: 1rem 0 !important;
font-size: 0.95rem !important;
}
/* Collect button styling */
.gr-button {
background-color: #000000 !important;
color: #ffffff !important;
padding: 1rem 2rem !important;
border-radius: 6px !important;
font-weight: 600 !important;
font-size: 1.1rem !important;
border: none !important;
width: 100% !important;
max-width: 300px !important;
margin: 1.5rem auto !important;
display: block !important;
transition: all 0.3s ease !important;
}
.gr-button:hover {
background-color: #333333 !important;
transform: translateY(-2px) !important;
box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
}
/* File download area styling */
.gr-file {
border: 2px dashed #000000 !important;
border-radius: 8px !important;
padding: 2rem !important;
text-align: center !important;
background-color: #f8f8f8 !important;
margin-top: 2rem !important;
}
/* HTML result table styling */
table {
width: 100% !important;
border-collapse: collapse !important;
margin-top: 1.5rem !important;
border-radius: 8px !important;
overflow: hidden !important;
box-shadow: 0 1px 3px rgba(0,0,0,0.12) !important;
}
th {
background-color: #000000 !important;
color: #ffffff !important;
padding: 1rem !important;
text-align: center !important;
font-weight: 600 !important;
}
td {
padding: 0.8rem !important;
border-bottom: 1px solid #e0e0e0 !important;
color: #333333 !important;
}
tr:hover {
background-color: #f5f5f5 !important;
}
/* λ°˜μ‘ν˜• λ””μžμΈ */
@media (max-width: 768px) {
.gradio-container {
padding: 1rem !important;
margin: 1rem !important;
}
h1 {
font-size: 1.8rem !important;
}
.gr-radio-row {
grid-template-columns: 1fr !important;
}
}
"""
with gr.Blocks(css=css) as demo:
gr.Markdown("# N사 Cafe ν•«λ”œ κ²Œμ‹œνŒ 크둀링")
gr.Markdown("""
νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜, κ²Œμ‹œνŒ 선택 μ‹œ 'μ§μ ‘μž…λ ₯'을 μ„ νƒν•˜λ©΄ 직접 URL을 μž…λ ₯ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
(μ΅œλŒ€ νŽ˜μ΄μ§€μˆ˜λŠ” 50νŽ˜μ΄μ§€ μž…λ‹ˆλ‹€.)
""")
with gr.Row():
board_select = gr.Radio(
choices=["맘이베베", "λ§˜μŠ€ν™€λ¦­", "κ΄‘μ£Όλ§˜", "쇼핑지름신", "λΆ€μ‚°λ§˜", "μ§„ν¬λ§˜", "μ§μ ‘μž…λ ₯"],
label="κ²Œμ‹œνŒμ„ μ„ νƒν•˜μ„Έμš”",
container=True
)
with gr.Row():
inp = gr.Number(
label="μˆ˜μ§‘ν•  νŽ˜μ΄μ§€ 수 (μ΅œλŒ€ 50νŽ˜μ΄μ§€)",
value=1,
minimum=1,
maximum=50,
container=True
)
with gr.Row():
custom_url = gr.Textbox(
label="직접 링크 μž…λ ₯ (μ˜΅μ…˜)",
placeholder="예: https://cafe.naver.com/ArticleList.nhn?...",
visible=False, # 기본은 μˆ¨κΉ€
container=True
)
    # Toggle the custom_url textbox whenever board_select changes
board_select.change(fn=update_custom_url_visibility, inputs=board_select, outputs=custom_url)
status = gr.Textbox(
label="μƒνƒœ",
value="λŒ€κΈ° 쀑...",
container=True
)
    btn = gr.Button("Collect", variant="primary")
    output_file = gr.File(label="Download Excel file")
output_html = gr.HTML()
btn.click(
fn=crawl_with_progress,
inputs=[board_select, inp, custom_url],
outputs=[output_file, output_html, status]
)
if __name__ == "__main__":
demo.launch()
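    # For hosting beyond localhost (e.g. a Space), these standard Gradio launch
    # options are a common tweak (values here are illustrative):
    # demo.launch(server_name="0.0.0.0", server_port=7860)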