Kims12 committed on
Commit
b92f102
·
verified ·
1 Parent(s): 981b110

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -13
app.py CHANGED
@@ -137,18 +137,21 @@ def validate_row_data(row_data):
137
  return False
138
  return True
139
 
140
- def extract_data_to_excel_and_html(page, board_select):
141
  try:
142
  if not isinstance(page, (int, float)) or page < 1 or page > 50:
143
  return None, "<p>νŽ˜μ΄μ§€ μˆ˜λŠ” 1-50 사이여야 ν•©λ‹ˆλ‹€.</p>"
144
 
145
  session = setup_session()
146
- base_url = get_base_url(board_select)
147
- if base_url == "Invalid board selected":
148
- return "Invalid board selected", ""
149
-
150
- korea_time = datetime.now(pytz.timezone('Asia/Seoul'))
151
- filename = f'{board_select}_{korea_time.strftime("%Y%m%d_%H%M%S")}.xlsx'
 
 
 
152
 
153
  workbook = xlsxwriter.Workbook(filename)
154
  worksheet = workbook.add_worksheet()
@@ -268,6 +271,7 @@ def extract_data_to_excel_and_html(page, board_select):
268
 
269
  for p in range(1, page + 1):
270
  try:
 
271
  url = f"{base_url}&search.page={p}"
272
  logger.info(f"[CRAWL] Fetching page {p}: {url}")
273
  response = session.get(url)
@@ -358,9 +362,9 @@ def extract_data_to_excel_and_html(page, board_select):
358
  logger.error(f"[CRAWL] 전체 크둀링 μ‹€νŒ¨: {str(e)}")
359
  return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
360
 
361
- def crawl_with_progress(board, pages):
362
  try:
363
- excel_file, html_output = extract_data_to_excel_and_html(pages, board)
364
  if excel_file:
365
  return excel_file, html_output, "μˆ˜μ§‘ μ™„λ£Œ" # status λ©”μ‹œμ§€ μΆ”κ°€
366
  else:
@@ -511,26 +515,34 @@ tr:hover {
511
  with gr.Blocks(css=css) as demo:
512
  gr.Markdown("# N사 Cafe ν•«λ”œ κ²Œμ‹œνŒ 크둀링")
513
  gr.Markdown("""
514
- νŽ˜μ΄μ§€λ₯Ό μž…λ ₯ν•˜λ©΄ κ²°κ³Όλ₯Ό 좜λ ₯ν•©λ‹ˆλ‹€.
 
515
  μ΅œλŒ€ νŽ˜μ΄μ§€μˆ˜λŠ” 50νŽ˜μ΄μ§€ μž…λ‹ˆλ‹€.
516
  """)
517
 
518
  with gr.Row():
519
  board_select = gr.Radio(
520
  choices=["맘이베베", "λ§˜μŠ€ν™€λ¦­", "κ΄‘μ£Όλ§˜", "쇼핑지름신", "λΆ€μ‚°λ§˜", "μ§„ν¬λ§˜"],
521
- label="κ²Œμ‹œνŒμ„ μ„ νƒν•˜μ„Έμš”",
522
  container=True
523
  )
524
 
525
  with gr.Row():
526
  inp = gr.Number(
527
- label="μˆ˜μ§‘μ„ μ›ν•˜μ‹œλŠ” νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”(μ΅œλŒ€ 50νŽ˜μ΄μ§€)",
528
  value=1,
529
  minimum=1,
530
  maximum=50,
531
  container=True
532
  )
533
 
 
 
 
 
 
 
 
534
  status = gr.Textbox(
535
  label="μƒνƒœ",
536
  value="λŒ€κΈ° 쀑...",
@@ -544,7 +556,7 @@ with gr.Blocks(css=css) as demo:
544
 
545
  btn.click(
546
  fn=crawl_with_progress,
547
- inputs=[board_select, inp],
548
  outputs=[output_file, output_html, status]
549
  )
550
 
 
137
  return False
138
  return True
139
 
140
+ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
141
  try:
142
  if not isinstance(page, (int, float)) or page < 1 or page > 50:
143
  return None, "<p>νŽ˜μ΄μ§€ μˆ˜λŠ” 1-50 사이여야 ν•©λ‹ˆλ‹€.</p>"
144
 
145
  session = setup_session()
146
+ # μ‚¬μš©μžκ°€ 직접 링크λ₯Ό μž…λ ₯ν•œ 경우, ν•΄λ‹Ή 링크λ₯Ό base_url둜 μ‚¬μš©
147
+ if custom_url.strip():
148
+ base_url = custom_url.strip()
149
+ filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
150
+ else:
151
+ base_url = get_base_url(board_select)
152
+ if base_url == "Invalid board selected":
153
+ return "Invalid board selected", ""
154
+ filename = f'{board_select}_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
155
 
156
  workbook = xlsxwriter.Workbook(filename)
157
  worksheet = workbook.add_worksheet()
 
271
 
272
  for p in range(1, page + 1):
273
  try:
274
+ # κΈ°λ³Έ URL에 νŽ˜μ΄μ§€ 번호 νŒŒλΌλ―Έν„° μΆ”κ°€
275
  url = f"{base_url}&search.page={p}"
276
  logger.info(f"[CRAWL] Fetching page {p}: {url}")
277
  response = session.get(url)
 
362
  logger.error(f"[CRAWL] 전체 크둀링 μ‹€νŒ¨: {str(e)}")
363
  return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
364
 
365
+ def crawl_with_progress(board, pages, custom_url):
366
  try:
367
+ excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
368
  if excel_file:
369
  return excel_file, html_output, "μˆ˜μ§‘ μ™„λ£Œ" # status λ©”μ‹œμ§€ μΆ”κ°€
370
  else:
 
515
  with gr.Blocks(css=css) as demo:
516
  gr.Markdown("# N사 Cafe ν•«λ”œ κ²Œμ‹œνŒ 크둀링")
517
  gr.Markdown("""
518
+ νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜, 직접 URL을 μž…λ ₯ν•˜μ‹œλ©΄ κ²°κ³Όλ₯Ό 좜λ ₯ν•©λ‹ˆλ‹€.
519
+ (κ²Œμ‹œνŒ 선택은 κΈ°λ³Έ URL을 μ‚¬μš©ν•©λ‹ˆλ‹€. 직접 URL μž…λ ₯μ‹œ ν•΄λ‹Ή 링크둜 μˆ˜μ§‘ν•©λ‹ˆλ‹€.)
520
  μ΅œλŒ€ νŽ˜μ΄μ§€μˆ˜λŠ” 50νŽ˜μ΄μ§€ μž…λ‹ˆλ‹€.
521
  """)
522
 
523
  with gr.Row():
524
  board_select = gr.Radio(
525
  choices=["맘이베베", "λ§˜μŠ€ν™€λ¦­", "κ΄‘μ£Όλ§˜", "쇼핑지름신", "λΆ€μ‚°λ§˜", "μ§„ν¬λ§˜"],
526
+ label="κ²Œμ‹œνŒμ„ μ„ νƒν•˜μ„Έμš” (직접 링크 μž…λ ₯이 없을 경우)",
527
  container=True
528
  )
529
 
530
  with gr.Row():
531
  inp = gr.Number(
532
+ label="μˆ˜μ§‘ν•  νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ΅œλŒ€ 50νŽ˜μ΄μ§€)",
533
  value=1,
534
  minimum=1,
535
  maximum=50,
536
  container=True
537
  )
538
 
539
+ with gr.Row():
540
+ custom_url = gr.Textbox(
541
+ label="직접 링크 μž…λ ₯ (μ˜΅μ…˜)",
542
+ placeholder="예: https://cafe.naver.com/ArticleList.nhn?...",
543
+ container=True
544
+ )
545
+
546
  status = gr.Textbox(
547
  label="μƒνƒœ",
548
  value="λŒ€κΈ° 쀑...",
 
556
 
557
  btn.click(
558
  fn=crawl_with_progress,
559
+ inputs=[board_select, inp, custom_url],
560
  outputs=[output_file, output_html, status]
561
  )
562