Open-GAMMA

Running

App Files Community

openfree committed 1 day ago

Commit

706ad74

verified ·

1 Parent(s): 1bbaec5

Update content_utils.py

Browse files

Files changed (1) hide show

content_utils.py +88 -0

content_utils.py CHANGED Viewed

@@ -308,6 +308,94 @@ def parse_slide_section(section: str, default_title: str) -> Dict:
         logger.error(f"슬라이드 섹션 파싱 중 오류: {str(e)}")
         return None
 def extract_speaker_notes_from_section(section: str) -> str:
     """슬라이드 섹션에서 발표자 노트 추출"""
     try:

         logger.error(f"슬라이드 섹션 파싱 중 오류: {str(e)}")
         return None
# Characters that explicitly mark a line as a bullet item (shared by both passes).
_BULLET_MARKERS = ('•', '-', '*', '·', '▪', '▸')

# Leading list numbering such as "1. " or "2) ".
_NUMBERING_RE = re.compile(r'^\d+[.)]\s*')


def _is_marked_line(line: str) -> bool:
    """Return True if *line* starts with an emoji/pictograph or a bullet marker."""
    import unicodedata  # local import so the module's import block is untouched

    if not line:
        return False
    first = line[0]
    # Keep the original heuristic (code point >= U+1F300) and additionally
    # accept characters in Unicode category "So" (Symbol, other), which
    # covers BMP pictographs such as U+2705 or U+2B50 that sit below U+1F300.
    if ord(first) >= 0x1F300 or unicodedata.category(first) == 'So':
        return True
    return line.startswith(_BULLET_MARKERS)


def _as_plain_bullet(line: str) -> str:
    """Strip leading list numbering and prefix with "• "; return "" if nothing is left."""
    cleaned = _NUMBERING_RE.sub('', line).strip()
    return f"• {cleaned}" if cleaned else ""


def parse_slide_section_improved(section: str, default_title: str, slide_num: int) -> Dict:
    """Parse the subtitle and bullet points out of one slide section.

    The section is scanned in two passes:

    1. If an explicit "핵심 내용:" / "Key Points:" / "내용:" header is present,
       the lines following it (up to a speaker-notes or visual marker) are
       collected as bullet points.
    2. If fewer than 3 bullets were found, every line of the section is
       scanned as a fallback until 5 bullets have been collected.

    Both passes use the same rules for recognising marked lines and for
    stripping list numbering, so an item found in pass 1 is not added a
    second time in pass 2 under a slightly different spelling.

    Args:
        section: Raw text of a single slide section.
        default_title: Subtitle to use when none can be extracted.
        slide_num: Slide number, used only in the error log message.

    Returns:
        A dict with keys ``"subtitle"`` (str) and ``"bullet_points"``
        (list of at most 5 unique strings), or ``None`` when no bullet
        point was found or an error occurred while parsing.
    """
    try:
        content = {
            "subtitle": default_title,
            "bullet_points": [],
        }
        bullets = content["bullet_points"]

        # Subtitle extraction: first pattern yielding more than 2 characters wins.
        subtitle_patterns = [
            r'부제목:\s*(.+?)(?=\n|$)',
            r'Subtitle:\s*(.+?)(?=\n|$)',
            r'부제:\s*(.+?)(?=\n|$)',
            r'Sub:\s*(.+?)(?=\n|$)'
        ]
        for pattern in subtitle_patterns:
            match = re.search(pattern, section, re.MULTILINE)
            if match:
                subtitle = match.group(1).strip()
                if subtitle and len(subtitle) > 2:
                    content["subtitle"] = subtitle
                    break

        # Pass 1: explicit key-points block.
        content_section_match = re.search(
            r'(?:핵심\s*내용|Key\s*Points|내용):\s*\n(.+?)(?=발표자|Speaker|시각|Visual|$)',
            section,
            re.DOTALL | re.IGNORECASE
        )
        if content_section_match:
            for raw_line in content_section_match.group(1).strip().split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                if _is_marked_line(line):
                    # Emoji- or bullet-prefixed lines are kept verbatim.
                    bullets.append(line)
                elif len(line) > 10 and not any(
                    skip in line for skip in (':', '제목', 'Title', '노트')
                ):
                    # Plain but meaningful text: drop numbering, add a bullet.
                    bullet = _as_plain_bullet(line)
                    if bullet:
                        bullets.append(bullet)

        # Pass 2: fall back to the whole section when too few bullets were found.
        if len(bullets) < 3:
            meta_markers = ('제목:', 'title:', '부제목:', 'subtitle:', '발표자', 'speaker', '시각')
            for raw_line in section.split('\n'):
                if len(bullets) >= 5:
                    break  # cap reached; nothing further can be added
                line = raw_line.strip()
                if not line:
                    continue
                lowered = line.lower()
                # Skip metadata lines (title, subtitle, speaker notes, visuals).
                if any(marker in lowered for marker in meta_markers):
                    continue
                if _is_marked_line(line):
                    candidate = line
                elif len(line) > 15 and ':' not in line:
                    # Normalise exactly like pass 1 so duplicates are detected.
                    candidate = _as_plain_bullet(line)
                else:
                    continue
                if candidate and candidate not in bullets:
                    bullets.append(candidate)

        # De-duplicate while preserving order, then cap at 5 bullets.
        content["bullet_points"] = list(dict.fromkeys(bullets))[:5]
        return content if len(content["bullet_points"]) > 0 else None
    except Exception as e:
        # Best-effort parser: log and signal failure to the caller with None.
        logger.error(f"슬라이드 {slide_num} 파싱 중 오류: {str(e)}")
        return None
 def extract_speaker_notes_from_section(section: str) -> str:
     """슬라이드 섹션에서 발표자 노트 추출"""
     try: