Kims12 committed on
Commit
8fb1af8
·
verified ·
1 Parent(s): b92f102

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -35
app.py CHANGED
@@ -72,13 +72,12 @@ def setup_session():
72
 
73
  # 재시도 설정
74
  retries = Retry(
75
- total=5, # 총 재시도 횟수
76
- backoff_factor=1, # 재시도 간 대기 시간 계수
77
- status_forcelist=[500, 502, 503, 504], # 재시도할 HTTP 상태 코드
78
- allowed_methods=["GET", "HEAD", "OPTIONS"] # 재시도할 HTTP 메서드
79
  )
80
 
81
- # 기본 헤더 설정
82
  session.headers.update({
83
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
84
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -94,14 +93,13 @@ def setup_session():
94
  'Sec-Fetch-User': '?1',
95
  'Upgrade-Insecure-Requests': '1',
96
  'Cache-Control': 'max-age=0',
97
- 'DNT': '1' # Do Not Track 요청
98
  })
99
 
100
- # HTTPS 어댑터 설정
101
  adapter = HTTPAdapter(
102
  max_retries=retries,
103
- pool_connections=100, # 연결 풀 크기
104
- pool_maxsize=100 # 최대 연결 수
105
  )
106
  session.mount('https://', adapter)
107
  session.mount('http://', adapter)
@@ -119,7 +117,7 @@ def get_base_url(board_select):
119
  }
120
  selected_url = urls.get(board_select)
121
  if not selected_url:
122
- logging.warning(f"Invalid board selected: {board_select}")
123
  return "Invalid board selected"
124
  return selected_url
125
 
@@ -143,8 +141,8 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
143
  return None, "<p>νŽ˜μ΄μ§€ μˆ˜λŠ” 1-50 사이여야 ν•©λ‹ˆλ‹€.</p>"
144
 
145
  session = setup_session()
146
- # 사용자가 직접 링크를 입력한 경우, 해당 링크를 base_url로 사용
147
- if custom_url.strip():
148
  base_url = custom_url.strip()
149
  filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
150
  else:
@@ -165,17 +163,11 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
165
  'border': 1
166
  })
167
 
168
- data_format = workbook.add_format({
169
- 'align': 'left',
170
- 'valign': 'vcenter',
171
- 'border': 1
172
- })
173
-
174
  link_format = workbook.add_format({
175
  'align': 'left',
176
  'valign': 'vcenter',
177
  'border': 1,
178
- 'font_color': '#0066cc', # 수정: 'color' -> 'font_color'
179
  'underline': True
180
  })
181
 
@@ -197,7 +189,6 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
197
  for col, header in enumerate(headers):
198
  worksheet.write(0, col, header, header_format)
199
 
200
- # 필터 추가
201
  worksheet.autofilter(0, 0, 0, len(headers) - 1)
202
 
203
  # HTML 테이블 시작
@@ -271,12 +262,10 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
271
 
272
  for p in range(1, page + 1):
273
  try:
274
- # 기본 URL에 페이지 번호 파라미터 추가
275
  url = f"{base_url}&search.page={p}"
276
  logger.info(f"[CRAWL] Fetching page {p}: {url}")
277
  response = session.get(url)
278
- delay = random.uniform(0.5, 1.0)
279
- time.sleep(delay)
280
 
281
  if response.status_code != 200:
282
  logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
@@ -310,7 +299,6 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
310
  likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
311
  date = row_data.find('td', class_='td_date').get_text(strip=True)
312
 
313
- # 댓글수 추출
314
  comment_tag = row_data.find('a', class_='cmt')
315
  comments = 0
316
  if comment_tag and comment_tag.find('em'):
@@ -319,14 +307,12 @@ def extract_data_to_excel_and_html(page, board_select, custom_url=""):
319
  if ":" in date:
320
  date = current_date
321
 
322
- # Excel 데이터 작성
323
  worksheet.write_url(row, 0, full_link, link_format, title)
324
  worksheet.write(row, 1, date, date_format)
325
  worksheet.write_number(row, 2, views, number_format)
326
  worksheet.write_number(row, 3, likes, number_format)
327
  worksheet.write_number(row, 4, comments, number_format)
328
 
329
- # HTML 테이블 데이터 추가
330
  html_output += f""" <tr>
331
  <td><a href='{full_link}' target='_blank'>{title}</a></td>
332
  <td>{date}</td>
@@ -366,11 +352,18 @@ def crawl_with_progress(board, pages, custom_url):
366
  try:
367
  excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
368
  if excel_file:
369
- return excel_file, html_output, "μˆ˜μ§‘ μ™„λ£Œ" # status λ©”μ‹œμ§€ μΆ”κ°€
370
  else:
371
- return None, "", "μˆ˜μ§‘ μ‹€νŒ¨" # μ‹€νŒ¨μ‹œμ—λ„ 3개 κ°’ λ°˜ν™˜
372
  except Exception as e:
373
- return None, "", f"였λ₯˜ λ°œμƒ: {str(e)}" # μ˜ˆμ™Έ λ°œμƒμ‹œμ—λ„ 3개 κ°’ λ°˜ν™˜
 
 
 
 
 
 
 
374
 
375
  css = """
376
  /* 전체 컨테이너 스타일링 */
@@ -515,21 +508,20 @@ tr:hover {
515
  with gr.Blocks(css=css) as demo:
516
  gr.Markdown("# N사 Cafe ν•«λ”œ κ²Œμ‹œνŒ 크둀링")
517
  gr.Markdown("""
518
- νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜, 직접 URL을 μž…λ ₯ν•˜μ‹œλ©΄ κ²°κ³Όλ₯Ό 좜λ ₯ν•©λ‹ˆλ‹€.
519
- (κ²Œμ‹œνŒ 선택은 κΈ°λ³Έ URL을 μ‚¬μš©ν•©λ‹ˆλ‹€. 직접 URL μž…λ ₯μ‹œ ν•΄λ‹Ή 링크둜 μˆ˜μ§‘ν•©λ‹ˆλ‹€.)
520
- μ΅œλŒ€ νŽ˜μ΄μ§€μˆ˜λŠ” 50νŽ˜μ΄μ§€ μž…λ‹ˆλ‹€.
521
  """)
522
 
523
  with gr.Row():
524
  board_select = gr.Radio(
525
- choices=["맘이베베", "λ§˜μŠ€ν™€λ¦­", "κ΄‘μ£Όλ§˜", "쇼핑지름신", "λΆ€μ‚°λ§˜", "μ§„ν¬λ§˜"],
526
- label="κ²Œμ‹œνŒμ„ μ„ νƒν•˜μ„Έμš” (직접 링크 μž…λ ₯이 없을 경우)",
527
  container=True
528
  )
529
 
530
  with gr.Row():
531
  inp = gr.Number(
532
- label="μˆ˜μ§‘ν•  νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ΅œλŒ€ 50νŽ˜μ΄μ§€)",
533
  value=1,
534
  minimum=1,
535
  maximum=50,
@@ -540,9 +532,13 @@ with gr.Blocks(css=css) as demo:
540
  custom_url = gr.Textbox(
541
  label="직접 링크 μž…λ ₯ (μ˜΅μ…˜)",
542
  placeholder="예: https://cafe.naver.com/ArticleList.nhn?...",
 
543
  container=True
544
  )
545
 
 
 
 
546
  status = gr.Textbox(
547
  label="μƒνƒœ",
548
  value="λŒ€κΈ° 쀑...",
 
72
 
73
  # 재시도 설정
74
  retries = Retry(
75
+ total=5,
76
+ backoff_factor=1,
77
+ status_forcelist=[500, 502, 503, 504],
78
+ allowed_methods=["GET", "HEAD", "OPTIONS"]
79
  )
80
 
 
81
  session.headers.update({
82
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
83
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 
93
  'Sec-Fetch-User': '?1',
94
  'Upgrade-Insecure-Requests': '1',
95
  'Cache-Control': 'max-age=0',
96
+ 'DNT': '1'
97
  })
98
 
 
99
  adapter = HTTPAdapter(
100
  max_retries=retries,
101
+ pool_connections=100,
102
+ pool_maxsize=100
103
  )
104
  session.mount('https://', adapter)
105
  session.mount('http://', adapter)
 
117
  }
118
  selected_url = urls.get(board_select)
119
  if not selected_url:
120
+ logger.warning(f"Invalid board selected: {board_select}")
121
  return "Invalid board selected"
122
  return selected_url
123
 
 
141
  return None, "<p>νŽ˜μ΄μ§€ μˆ˜λŠ” 1-50 사이여야 ν•©λ‹ˆλ‹€.</p>"
142
 
143
  session = setup_session()
144
+ # 직접 입력을 선택한 경우 custom_url 사용
145
+ if board_select == "μ§μ ‘μž…λ ₯" and custom_url.strip():
146
  base_url = custom_url.strip()
147
  filename = f'custom_{datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d_%H%M%S")}.xlsx'
148
  else:
 
163
  'border': 1
164
  })
165
 
 
 
 
 
 
 
166
  link_format = workbook.add_format({
167
  'align': 'left',
168
  'valign': 'vcenter',
169
  'border': 1,
170
+ 'font_color': '#0066cc',
171
  'underline': True
172
  })
173
 
 
189
  for col, header in enumerate(headers):
190
  worksheet.write(0, col, header, header_format)
191
 
 
192
  worksheet.autofilter(0, 0, 0, len(headers) - 1)
193
 
194
  # HTML 테이블 시작
 
262
 
263
  for p in range(1, page + 1):
264
  try:
 
265
  url = f"{base_url}&search.page={p}"
266
  logger.info(f"[CRAWL] Fetching page {p}: {url}")
267
  response = session.get(url)
268
+ time.sleep(random.uniform(0.5, 1.0))
 
269
 
270
  if response.status_code != 200:
271
  logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
 
299
  likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
300
  date = row_data.find('td', class_='td_date').get_text(strip=True)
301
 
 
302
  comment_tag = row_data.find('a', class_='cmt')
303
  comments = 0
304
  if comment_tag and comment_tag.find('em'):
 
307
  if ":" in date:
308
  date = current_date
309
 
 
310
  worksheet.write_url(row, 0, full_link, link_format, title)
311
  worksheet.write(row, 1, date, date_format)
312
  worksheet.write_number(row, 2, views, number_format)
313
  worksheet.write_number(row, 3, likes, number_format)
314
  worksheet.write_number(row, 4, comments, number_format)
315
 
 
316
  html_output += f""" <tr>
317
  <td><a href='{full_link}' target='_blank'>{title}</a></td>
318
  <td>{date}</td>
 
352
  try:
353
  excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
354
  if excel_file:
355
+ return excel_file, html_output, "μˆ˜μ§‘ μ™„λ£Œ"
356
  else:
357
+ return None, "", "μˆ˜μ§‘ μ‹€νŒ¨"
358
  except Exception as e:
359
+ return None, "", f"였λ₯˜ λ°œμƒ: {str(e)}"
360
+
361
+ def update_custom_url_visibility(selected):
362
+ # "직접입력" 선택 시 직접 링크 입력 텍스트박스를 보이도록 함
363
+ if selected == "μ§μ ‘μž…λ ₯":
364
+ return gr.update(visible=True)
365
+ else:
366
+ return gr.update(visible=False)
367
 
368
  css = """
369
  /* 전체 컨테이너 스타일링 */
 
508
  with gr.Blocks(css=css) as demo:
509
  gr.Markdown("# N사 Cafe ν•«λ”œ κ²Œμ‹œνŒ 크둀링")
510
  gr.Markdown("""
511
+ νŽ˜μ΄μ§€ 수λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜, κ²Œμ‹œνŒ 선택 μ‹œ 'μ§μ ‘μž…λ ₯'을 μ„ νƒν•˜λ©΄ 직접 URL을 μž…λ ₯ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
512
+ (μ΅œλŒ€ νŽ˜μ΄μ§€μˆ˜λŠ” 50νŽ˜μ΄μ§€ μž…λ‹ˆλ‹€.)
 
513
  """)
514
 
515
  with gr.Row():
516
  board_select = gr.Radio(
517
+ choices=["맘이베베", "λ§˜μŠ€ν™€λ¦­", "κ΄‘μ£Όλ§˜", "쇼핑지름신", "λΆ€μ‚°λ§˜", "μ§„ν¬λ§˜", "μ§μ ‘μž…λ ₯"],
518
+ label="κ²Œμ‹œνŒμ„ μ„ νƒν•˜μ„Έμš”",
519
  container=True
520
  )
521
 
522
  with gr.Row():
523
  inp = gr.Number(
524
+ label="μˆ˜μ§‘ν•  νŽ˜μ΄μ§€ 수 (μ΅œλŒ€ 50νŽ˜μ΄μ§€)",
525
  value=1,
526
  minimum=1,
527
  maximum=50,
 
532
  custom_url = gr.Textbox(
533
  label="직접 링크 μž…λ ₯ (μ˜΅μ…˜)",
534
  placeholder="예: https://cafe.naver.com/ArticleList.nhn?...",
535
+ visible=False, # 기본은 숨김
536
  container=True
537
  )
538
 
539
+ # board_select 값이 변경될 때 직접입력 선택시 custom_url 보이도록 업데이트
540
+ board_select.change(fn=update_custom_url_visibility, inputs=board_select, outputs=custom_url)
541
+
542
  status = gr.Textbox(
543
  label="μƒνƒœ",
544
  value="λŒ€κΈ° 쀑...",