Update app.py
Browse files
app.py
CHANGED
@@ -72,13 +72,12 @@ def setup_session():
|
|
72 |
|
73 |
# 재시도 설정
|
74 |
retries = Retry(
|
75 |
-
total=5,
|
76 |
-
backoff_factor=1,
|
77 |
-
status_forcelist=[500, 502, 503, 504],
|
78 |
-
allowed_methods=["GET", "HEAD", "OPTIONS"]
|
79 |
)
|
80 |
|
81 |
-
# 기본 헤더 설정
|
82 |
session.headers.update({
|
83 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
84 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
@@ -94,14 +93,13 @@ def setup_session():
|
|
94 |
'Sec-Fetch-User': '?1',
|
95 |
'Upgrade-Insecure-Requests': '1',
|
96 |
'Cache-Control': 'max-age=0',
|
97 |
-
'DNT': '1'
|
98 |
})
|
99 |
|
100 |
-
# HTTPS 어댑터 설정
|
101 |
adapter = HTTPAdapter(
|
102 |
max_retries=retries,
|
103 |
-
pool_connections=100,
|
104 |
-
pool_maxsize=100
|
105 |
)
|
106 |
session.mount('https://', adapter)
|
107 |
session.mount('http://', adapter)
|
@@ -119,7 +117,7 @@ def get_base_url(board_select):
|
|
119 |
}
|
120 |
selected_url = urls.get(board_select)
|
121 |
if not selected_url:
|
122 |
-
|
123 |
return "Invalid board selected"
|
124 |
return selected_url
|
125 |
|
@@ -143,8 +141,8 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
143 |
return None, "<p>νμ΄μ§ μλ 1-50 μ¬μ΄μ¬μΌ ν©λλ€.</p>"
|
144 |
|
145 |
session = setup_session()
|
146 |
-
#
|
147 |
-
if custom_url.strip():
|
148 |
base_url = custom_url.strip()
|
149 |
filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
|
150 |
else:
|
@@ -165,17 +163,11 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
165 |
'border': 1
|
166 |
})
|
167 |
|
168 |
-
data_format = workbook.add_format({
|
169 |
-
'align': 'left',
|
170 |
-
'valign': 'vcenter',
|
171 |
-
'border': 1
|
172 |
-
})
|
173 |
-
|
174 |
link_format = workbook.add_format({
|
175 |
'align': 'left',
|
176 |
'valign': 'vcenter',
|
177 |
'border': 1,
|
178 |
-
'font_color': '#0066cc',
|
179 |
'underline': True
|
180 |
})
|
181 |
|
@@ -197,7 +189,6 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
197 |
for col, header in enumerate(headers):
|
198 |
worksheet.write(0, col, header, header_format)
|
199 |
|
200 |
-
# 필터 추가
|
201 |
worksheet.autofilter(0, 0, 0, len(headers) - 1)
|
202 |
|
203 |
# HTML 테이블 시작
|
@@ -271,12 +262,10 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
271 |
|
272 |
for p in range(1, page + 1):
|
273 |
try:
|
274 |
-
# 기본 URL에 페이지 번호 파라미터 추가
|
275 |
url = f"{base_url}&search.page={p}"
|
276 |
logger.info(f"[CRAWL] Fetching page {p}: {url}")
|
277 |
response = session.get(url)
|
278 |
-
|
279 |
-
time.sleep(delay)
|
280 |
|
281 |
if response.status_code != 200:
|
282 |
logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
|
@@ -310,7 +299,6 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
310 |
likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
|
311 |
date = row_data.find('td', class_='td_date').get_text(strip=True)
|
312 |
|
313 |
-
# 댓글수 추출
|
314 |
comment_tag = row_data.find('a', class_='cmt')
|
315 |
comments = 0
|
316 |
if comment_tag and comment_tag.find('em'):
|
@@ -319,14 +307,12 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
|
319 |
if ":" in date:
|
320 |
date = current_date
|
321 |
|
322 |
-
# Excel 데이터 작성
|
323 |
worksheet.write_url(row, 0, full_link, link_format, title)
|
324 |
worksheet.write(row, 1, date, date_format)
|
325 |
worksheet.write_number(row, 2, views, number_format)
|
326 |
worksheet.write_number(row, 3, likes, number_format)
|
327 |
worksheet.write_number(row, 4, comments, number_format)
|
328 |
|
329 |
-
# HTML 테이블 데이터 추가
|
330 |
html_output += f""" <tr>
|
331 |
<td><a href='{full_link}' target='_blank'>{title}</a></td>
|
332 |
<td>{date}</td>
|
@@ -366,11 +352,18 @@ def crawl_with_progress(board, pages, custom_url):
|
|
366 |
try:
|
367 |
excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
|
368 |
if excel_file:
|
369 |
-
return excel_file, html_output, "μμ§ μλ£"
|
370 |
else:
|
371 |
-
return None, "", "μμ§ μ€ν¨"
|
372 |
except Exception as e:
|
373 |
-
return None, "", f"μ€λ₯ λ°μ: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
css = """
|
376 |
/* μ 체 컨ν
μ΄λ μ€νμΌλ§ */
|
@@ -515,21 +508,20 @@ tr:hover {
|
|
515 |
with gr.Blocks(css=css) as demo:
|
516 |
gr.Markdown("# Nμ¬ Cafe ν«λ κ²μν ν¬λ‘€λ§")
|
517 |
gr.Markdown("""
|
518 |
-
νμ΄μ§ μλ₯Ό μ
λ ₯νκ±°λ, μ§μ URLμ
|
519 |
-
(
|
520 |
-
μ΅λ νμ΄μ§μλ 50νμ΄μ§ μ
λλ€.
|
521 |
""")
|
522 |
|
523 |
with gr.Row():
|
524 |
board_select = gr.Radio(
|
525 |
-
choices=["λ§μ΄λ² λ² ", "λ§μ€νλ¦", "κ΄μ£Όλ§", "μΌνμ§λ¦μ ", "λΆμ°λ§", "μ§ν¬λ§"],
|
526 |
-
label="κ²μνμ μ ννμΈμ
|
527 |
container=True
|
528 |
)
|
529 |
|
530 |
with gr.Row():
|
531 |
inp = gr.Number(
|
532 |
-
label="μμ§ν νμ΄μ§
|
533 |
value=1,
|
534 |
minimum=1,
|
535 |
maximum=50,
|
@@ -540,9 +532,13 @@ with gr.Blocks(css=css) as demo:
|
|
540 |
custom_url = gr.Textbox(
|
541 |
label="μ§μ λ§ν¬ μ
λ ₯ (μ΅μ
)",
|
542 |
placeholder="μ: https://cafe.naver.com/ArticleList.nhn?...",
|
|
|
543 |
container=True
|
544 |
)
|
545 |
|
|
|
|
|
|
|
546 |
status = gr.Textbox(
|
547 |
label="μν",
|
548 |
value="λκΈ° μ€...",
|
|
|
72 |
|
73 |
# 재시도 설정
|
74 |
retries = Retry(
|
75 |
+
total=5,
|
76 |
+
backoff_factor=1,
|
77 |
+
status_forcelist=[500, 502, 503, 504],
|
78 |
+
allowed_methods=["GET", "HEAD", "OPTIONS"]
|
79 |
)
|
80 |
|
|
|
81 |
session.headers.update({
|
82 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
83 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
|
93 |
'Sec-Fetch-User': '?1',
|
94 |
'Upgrade-Insecure-Requests': '1',
|
95 |
'Cache-Control': 'max-age=0',
|
96 |
+
'DNT': '1'
|
97 |
})
|
98 |
|
|
|
99 |
adapter = HTTPAdapter(
|
100 |
max_retries=retries,
|
101 |
+
pool_connections=100,
|
102 |
+
pool_maxsize=100
|
103 |
)
|
104 |
session.mount('https://', adapter)
|
105 |
session.mount('http://', adapter)
|
|
|
117 |
}
|
118 |
selected_url = urls.get(board_select)
|
119 |
if not selected_url:
|
120 |
+
logger.warning(f"Invalid board selected: {board_select}")
|
121 |
return "Invalid board selected"
|
122 |
return selected_url
|
123 |
|
|
|
141 |
return None, "<p>νμ΄μ§ μλ 1-50 μ¬μ΄μ¬μΌ ν©λλ€.</p>"
|
142 |
|
143 |
session = setup_session()
|
144 |
+
# 직접 입력을 선택한 경우 custom_url 사용
|
145 |
+
if board_select == "μ§μ μ
λ ₯" and custom_url.strip():
|
146 |
base_url = custom_url.strip()
|
147 |
filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
|
148 |
else:
|
|
|
163 |
'border': 1
|
164 |
})
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
link_format = workbook.add_format({
|
167 |
'align': 'left',
|
168 |
'valign': 'vcenter',
|
169 |
'border': 1,
|
170 |
+
'font_color': '#0066cc',
|
171 |
'underline': True
|
172 |
})
|
173 |
|
|
|
189 |
for col, header in enumerate(headers):
|
190 |
worksheet.write(0, col, header, header_format)
|
191 |
|
|
|
192 |
worksheet.autofilter(0, 0, 0, len(headers) - 1)
|
193 |
|
194 |
# HTML 테이블 시작
|
|
|
262 |
|
263 |
for p in range(1, page + 1):
|
264 |
try:
|
|
|
265 |
url = f"{base_url}&search.page={p}"
|
266 |
logger.info(f"[CRAWL] Fetching page {p}: {url}")
|
267 |
response = session.get(url)
|
268 |
+
time.sleep(random.uniform(0.5, 1.0))
|
|
|
269 |
|
270 |
if response.status_code != 200:
|
271 |
logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
|
|
|
299 |
likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
|
300 |
date = row_data.find('td', class_='td_date').get_text(strip=True)
|
301 |
|
|
|
302 |
comment_tag = row_data.find('a', class_='cmt')
|
303 |
comments = 0
|
304 |
if comment_tag and comment_tag.find('em'):
|
|
|
307 |
if ":" in date:
|
308 |
date = current_date
|
309 |
|
|
|
310 |
worksheet.write_url(row, 0, full_link, link_format, title)
|
311 |
worksheet.write(row, 1, date, date_format)
|
312 |
worksheet.write_number(row, 2, views, number_format)
|
313 |
worksheet.write_number(row, 3, likes, number_format)
|
314 |
worksheet.write_number(row, 4, comments, number_format)
|
315 |
|
|
|
316 |
html_output += f""" <tr>
|
317 |
<td><a href='{full_link}' target='_blank'>{title}</a></td>
|
318 |
<td>{date}</td>
|
|
|
352 |
try:
|
353 |
excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
|
354 |
if excel_file:
|
355 |
+
return excel_file, html_output, "μμ§ μλ£"
|
356 |
else:
|
357 |
+
return None, "", "μμ§ μ€ν¨"
|
358 |
except Exception as e:
|
359 |
+
return None, "", f"μ€λ₯ λ°μ: {str(e)}"
|
360 |
+
|
361 |
+
def update_custom_url_visibility(selected):
    """Show or hide the custom-URL textbox depending on the selected board.

    Args:
        selected: The value currently chosen in the board radio group.

    Returns:
        A ``gr.update`` payload with ``visible=True`` when the
        "직접 입력" (direct input) option is selected, and
        ``visible=False`` for every other value.
    """
    # The literal was mis-decoded and broken across two lines in the original,
    # which left an unterminated string. It is restored here to the Korean
    # "direct input" label and must match the radio choice text exactly.
    return gr.update(visible=(selected == "직접 입력"))
|
367 |
|
368 |
css = """
|
369 |
/* μ 체 컨ν
μ΄λ μ€νμΌλ§ */
|
|
|
508 |
with gr.Blocks(css=css) as demo:
|
509 |
gr.Markdown("# Nμ¬ Cafe ν«λ κ²μν ν¬λ‘€λ§")
|
510 |
gr.Markdown("""
|
511 |
+
νμ΄μ§ μλ₯Ό μ
λ ₯νκ±°λ, κ²μν μ ν μ 'μ§μ μ
λ ₯'μ μ ννλ©΄ μ§μ URLμ μ
λ ₯ν μ μμ΅λλ€.
|
512 |
+
(μ΅λ νμ΄μ§μλ 50νμ΄μ§ μ
λλ€.)
|
|
|
513 |
""")
|
514 |
|
515 |
with gr.Row():
|
516 |
board_select = gr.Radio(
|
517 |
+
choices=["λ§μ΄λ² λ² ", "λ§μ€νλ¦", "κ΄μ£Όλ§", "μΌνμ§λ¦μ ", "λΆμ°λ§", "μ§ν¬λ§", "μ§μ μ
λ ₯"],
|
518 |
+
label="κ²μνμ μ ννμΈμ",
|
519 |
container=True
|
520 |
)
|
521 |
|
522 |
with gr.Row():
|
523 |
inp = gr.Number(
|
524 |
+
label="μμ§ν νμ΄μ§ μ (μ΅λ 50νμ΄μ§)",
|
525 |
value=1,
|
526 |
minimum=1,
|
527 |
maximum=50,
|
|
|
532 |
custom_url = gr.Textbox(
|
533 |
label="μ§μ λ§ν¬ μ
λ ₯ (μ΅μ
)",
|
534 |
placeholder="μ: https://cafe.naver.com/ArticleList.nhn?...",
|
535 |
+
visible=False, # κΈ°λ³Έμ μ¨κΉ
|
536 |
container=True
|
537 |
)
|
538 |
|
539 |
+
# board_select 값이 변경될 때 직접 입력 선택시 custom_url 보이도록 업데이트
|
540 |
+
board_select.change(fn=update_custom_url_visibility, inputs=board_select, outputs=custom_url)
|
541 |
+
|
542 |
status = gr.Textbox(
|
543 |
label="μν",
|
544 |
value="λκΈ° μ€...",
|