Update app.py
Browse files
app.py
CHANGED
@@ -137,18 +137,21 @@ def validate_row_data(row_data):
|
|
137 |
return False
|
138 |
return True
|
139 |
|
140 |
-
def extract_data_to_excel_and_html(page, board_select):
|
141 |
try:
|
142 |
if not isinstance(page, (int, float)) or page < 1 or page > 50:
|
143 |
return None, "<p>νμ΄μ§ μλ 1-50 μ¬μ΄μ¬μΌ ν©λλ€.</p>"
|
144 |
|
145 |
session = setup_session()
|
146 |
-
base_url
|
147 |
-
if
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
152 |
|
153 |
workbook = xlsxwriter.Workbook(filename)
|
154 |
worksheet = workbook.add_worksheet()
|
@@ -268,6 +271,7 @@ def extract_data_to_excel_and_html(page, board_select):
|
|
268 |
|
269 |
for p in range(1, page + 1):
|
270 |
try:
|
|
|
271 |
url = f"{base_url}&search.page={p}"
|
272 |
logger.info(f"[CRAWL] Fetching page {p}: {url}")
|
273 |
response = session.get(url)
|
@@ -358,9 +362,9 @@ def extract_data_to_excel_and_html(page, board_select):
|
|
358 |
logger.error(f"[CRAWL] μ 체 ν¬λ‘€λ§ μ€ν¨: {str(e)}")
|
359 |
return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
|
360 |
|
361 |
-
def crawl_with_progress(board, pages):
|
362 |
try:
|
363 |
-
excel_file, html_output = extract_data_to_excel_and_html(pages, board)
|
364 |
if excel_file:
|
365 |
return excel_file, html_output, "μμ§ μλ£" # status λ©μμ§ μΆκ°
|
366 |
else:
|
@@ -511,26 +515,34 @@ tr:hover {
|
|
511 |
with gr.Blocks(css=css) as demo:
|
512 |
gr.Markdown("# Nμ¬ Cafe ν«λ κ²μν ν¬λ‘€λ§")
|
513 |
gr.Markdown("""
|
514 |
-
|
|
|
515 |
μ΅λ νμ΄μ§μλ 50νμ΄μ§ μ
λλ€.
|
516 |
""")
|
517 |
|
518 |
with gr.Row():
|
519 |
board_select = gr.Radio(
|
520 |
choices=["λ§μ΄λ² λ² ", "λ§μ€νλ¦", "κ΄μ£Όλ§", "μΌνμ§λ¦μ ", "λΆμ°λ§", "μ§ν¬λ§"],
|
521 |
-
label="κ²μνμ μ ννμΈμ",
|
522 |
container=True
|
523 |
)
|
524 |
|
525 |
with gr.Row():
|
526 |
inp = gr.Number(
|
527 |
-
label="
|
528 |
value=1,
|
529 |
minimum=1,
|
530 |
maximum=50,
|
531 |
container=True
|
532 |
)
|
533 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
status = gr.Textbox(
|
535 |
label="μν",
|
536 |
value="λκΈ° μ€...",
|
@@ -544,7 +556,7 @@ with gr.Blocks(css=css) as demo:
|
|
544 |
|
545 |
btn.click(
|
546 |
fn=crawl_with_progress,
|
547 |
-
inputs=[board_select, inp],
|
548 |
outputs=[output_file, output_html, status]
|
549 |
)
|
550 |
|
|
|
137 |
return False
|
138 |
return True
|
139 |
|
140 |
+
def extract_data_to_excel_and_html(page, board_select, custom_url=""):
|
141 |
try:
|
142 |
if not isinstance(page, (int, float)) or page < 1 or page > 50:
|
143 |
return None, "<p>νμ΄μ§ μλ 1-50 μ¬μ΄μ¬μΌ ν©λλ€.</p>"
|
144 |
|
145 |
session = setup_session()
|
146 |
+
# 사용자가 직접 링크를 입력한 경우, 해당 링크를 base_url로 사용
|
147 |
+
if custom_url.strip():
|
148 |
+
base_url = custom_url.strip()
|
149 |
+
filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
|
150 |
+
else:
|
151 |
+
base_url = get_base_url(board_select)
|
152 |
+
if base_url == "Invalid board selected":
|
153 |
+
return "Invalid board selected", ""
|
154 |
+
filename = f'{board_select}_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
|
155 |
|
156 |
workbook = xlsxwriter.Workbook(filename)
|
157 |
worksheet = workbook.add_worksheet()
|
|
|
271 |
|
272 |
for p in range(1, page + 1):
|
273 |
try:
|
274 |
+
# 기본 URL에 페이지 번호 파라미터 추가
|
275 |
url = f"{base_url}&search.page={p}"
|
276 |
logger.info(f"[CRAWL] Fetching page {p}: {url}")
|
277 |
response = session.get(url)
|
|
|
362 |
logger.error(f"[CRAWL] μ 체 ν¬λ‘€λ§ μ€ν¨: {str(e)}")
|
363 |
return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
|
364 |
|
365 |
+
def crawl_with_progress(board, pages, custom_url):
|
366 |
try:
|
367 |
+
excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
|
368 |
if excel_file:
|
369 |
return excel_file, html_output, "μμ§ μλ£" # status λ©μμ§ μΆκ°
|
370 |
else:
|
|
|
515 |
with gr.Blocks(css=css) as demo:
|
516 |
gr.Markdown("# Nμ¬ Cafe ν«λ κ²μν ν¬λ‘€λ§")
|
517 |
gr.Markdown("""
|
518 |
+
νμ΄μ§ μλ₯Ό μ
λ ₯νκ±°λ, μ§μ URLμ μ
λ ₯νμλ©΄ κ²°κ³Όλ₯Ό μΆλ ₯ν©λλ€.
|
519 |
+
(κ²μν μ νμ κΈ°λ³Έ URLμ μ¬μ©ν©λλ€. μ§μ URL μ
λ ₯μ ν΄λΉ λ§ν¬λ‘ μμ§ν©λλ€.)
|
520 |
μ΅λ νμ΄μ§μλ 50νμ΄μ§ μ
λλ€.
|
521 |
""")
|
522 |
|
523 |
with gr.Row():
|
524 |
board_select = gr.Radio(
|
525 |
choices=["λ§μ΄λ² λ² ", "λ§μ€νλ¦", "κ΄μ£Όλ§", "μΌνμ§λ¦μ ", "λΆμ°λ§", "μ§ν¬λ§"],
|
526 |
+
label="κ²μνμ μ ννμΈμ (μ§μ λ§ν¬ μ
λ ₯μ΄ μμ κ²½μ°)",
|
527 |
container=True
|
528 |
)
|
529 |
|
530 |
with gr.Row():
|
531 |
inp = gr.Number(
|
532 |
+
label="μμ§ν νμ΄μ§ μλ₯Ό μ
λ ₯νμΈμ (μ΅λ 50νμ΄μ§)",
|
533 |
value=1,
|
534 |
minimum=1,
|
535 |
maximum=50,
|
536 |
container=True
|
537 |
)
|
538 |
|
539 |
+
with gr.Row():
|
540 |
+
custom_url = gr.Textbox(
|
541 |
+
label="μ§μ λ§ν¬ μ
λ ₯ (μ΅μ
)",
|
542 |
+
placeholder="μ: https://cafe.naver.com/ArticleList.nhn?...",
|
543 |
+
container=True
|
544 |
+
)
|
545 |
+
|
546 |
status = gr.Textbox(
|
547 |
label="μν",
|
548 |
value="λκΈ° μ€...",
|
|
|
556 |
|
557 |
btn.click(
|
558 |
fn=crawl_with_progress,
|
559 |
+
inputs=[board_select, inp, custom_url],
|
560 |
outputs=[output_file, output_html, status]
|
561 |
)
|
562 |
|