openfree committed on
Commit
706ad74
·
verified ·
1 Parent(s): 1bbaec5

Update content_utils.py

Browse files
Files changed (1) hide show
  1. content_utils.py +88 -0
content_utils.py CHANGED
@@ -308,6 +308,94 @@ def parse_slide_section(section: str, default_title: str) -> Dict:
308
  logger.error(f"์Šฌ๋ผ์ด๋“œ ์„น์…˜ ํŒŒ์‹ฑ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
309
  return None
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  def extract_speaker_notes_from_section(section: str) -> str:
312
  """์Šฌ๋ผ์ด๋“œ ์„น์…˜์—์„œ ๋ฐœํ‘œ์ž ๋…ธํŠธ ์ถ”์ถœ"""
313
  try:
 
308
  logger.error(f"์Šฌ๋ผ์ด๋“œ ์„น์…˜ ํŒŒ์‹ฑ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
309
  return None
310
 
311
# Line prefixes that introduce a subtitle; tried in order, first usable match wins.
# `[ \t]*` (not `\s*`) keeps an empty "Subtitle:" line from swallowing the next line.
_SUBTITLE_PATTERNS = (
    r'부제목:[ \t]*(.+?)(?=\n|$)',
    r'Subtitle:[ \t]*(.+?)(?=\n|$)',
    r'부제:[ \t]*(.+?)(?=\n|$)',
    r'Sub:[ \t]*(.+?)(?=\n|$)',
)

# Body of an explicit "핵심 내용:" / "Key Points:" / "내용:" section.  The section
# ends at a speaker-notes / visual marker that starts a line (optionally preceded
# by markdown decoration), or at the end of the text.  Anchoring the terminator
# to a line start keeps words such as "visual" inside a bullet from cutting the
# section short.
_CONTENT_SECTION_PATTERN = (
    r'(?:핵심\s*내용|Key\s*Points|내용):\s*\n(.+?)'
    r'(?=\n[ \t*#>\-]*(?:발표자|Speaker|시각|Visual)|$)'
)

# Glyphs that mark a line as an already-formatted bullet.
_BULLET_MARKERS = ('•', '-', '*', '·', '▪', '▸')

# Leading list numbering such as "1. " or "2) ".
_NUMBERING_PATTERN = r'^\d+[.)]\s*'

# Substrings that disqualify a plain line inside the explicit content section.
_CONTENT_SKIP_TOKENS = (':', '제목', 'Title', '노트')

# Lower-cased substrings that mark metadata lines in the whole-section fallback.
_META_TOKENS = ('제목:', 'title:', '부제목:', 'subtitle:', '발표자', 'speaker', '시각')

_MIN_BULLETS_BEFORE_FALLBACK = 3
_MAX_BULLETS = 5


def _starts_with_emoji(text: str) -> bool:
    """Return True if the first character of *text* looks like an emoji/pictograph."""
    if not text:
        return False
    code_point = ord(text[0])
    return (
        code_point >= 0x1F000                # pictographs, emoticons, flags, ...
        or 0x2600 <= code_point <= 0x27BF    # Miscellaneous Symbols + Dingbats
        or 0x2B00 <= code_point <= 0x2BFF    # Miscellaneous Symbols and Arrows
    )


def parse_slide_section_improved(section: str, default_title: str, slide_num: int) -> Dict:
    """Parse the subtitle and bullet points out of one slide section.

    The subtitle is taken from the first "부제목:" / "Subtitle:" / "부제:" /
    "Sub:" line whose value is longer than two characters; otherwise
    ``default_title`` is used.

    Bullet points are collected in two passes:

    1. Lines of an explicit "핵심 내용:" / "Key Points:" / "내용:" section.
       Lines starting with an emoji or bullet glyph are kept verbatim; other
       lines longer than 10 characters that do not look like metadata are
       stripped of list numbering and prefixed with "• ".
    2. If fewer than three bullets were found, every line of the section is
       scanned with the same rules (plain lines must be longer than 15
       characters and contain no colon), skipping metadata lines.

    Duplicates are removed while preserving order and at most five bullets
    are returned.

    Args:
        section: Raw text of a single slide section.
        default_title: Subtitle to use when none can be extracted.
        slide_num: Slide number, used only in the error log message.

    Returns:
        A dict with keys ``"subtitle"`` (str) and ``"bullet_points"``
        (list of str), or ``None`` when no bullet point was found or an
        error occurred while parsing.
    """
    try:
        content = {
            "subtitle": default_title,
            "bullet_points": [],
        }
        bullets = content["bullet_points"]

        # --- Subtitle -------------------------------------------------------
        for pattern in _SUBTITLE_PATTERNS:
            match = re.search(pattern, section, re.MULTILINE)
            if match:
                subtitle = match.group(1).strip()
                if len(subtitle) > 2:
                    content["subtitle"] = subtitle
                    break

        # --- Pass 1: explicit content section -------------------------------
        content_match = re.search(
            _CONTENT_SECTION_PATTERN,
            section,
            re.DOTALL | re.IGNORECASE,
        )
        if content_match:
            for raw_line in content_match.group(1).strip().split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                if _starts_with_emoji(line) or line.startswith(_BULLET_MARKERS):
                    bullets.append(line)
                elif len(line) > 10 and not any(token in line for token in _CONTENT_SKIP_TOKENS):
                    clean_line = re.sub(_NUMBERING_PATTERN, '', line).strip()
                    if clean_line:
                        bullets.append(f"• {clean_line}")

        # --- Pass 2: fall back to scanning the whole section ----------------
        if len(bullets) < _MIN_BULLETS_BEFORE_FALLBACK:
            for raw_line in section.split('\n'):
                if len(bullets) >= _MAX_BULLETS:
                    break  # nothing more can be added, stop scanning

                line = raw_line.strip()
                if not line:
                    continue

                lowered = line.lower()
                if any(token in lowered for token in _META_TOKENS):
                    continue

                if _starts_with_emoji(line) or line.startswith(_BULLET_MARKERS):
                    candidate = line
                elif len(line) > 15 and ':' not in line:
                    # Strip numbering exactly like pass 1 so that a line already
                    # collected there is recognised as a duplicate here.
                    clean_line = re.sub(_NUMBERING_PATTERN, '', line).strip()
                    if not clean_line:
                        continue
                    candidate = f"• {clean_line}"
                else:
                    continue

                if candidate not in bullets:
                    bullets.append(candidate)

        # Order-preserving de-duplication, capped at the maximum bullet count.
        content["bullet_points"] = list(dict.fromkeys(bullets))[:_MAX_BULLETS]

        return content if content["bullet_points"] else None

    except Exception as e:
        # Best-effort parser: callers treat None as "could not parse this slide".
        logger.error(f"슬라이드 {slide_num} 파싱 중 오류: {str(e)}")
        return None
395
+
396
+
397
+
398
+
399
  def extract_speaker_notes_from_section(section: str) -> str:
400
  """์Šฌ๋ผ์ด๋“œ ์„น์…˜์—์„œ ๋ฐœํ‘œ์ž ๋…ธํŠธ ์ถ”์ถœ"""
401
  try: