Spaces:

aliceblue11
/

image_text

Sleeping

App Files Files Community

aliceblue11 committed 12 days ago

Commit

ae924be

verified ·

1 Parent(s): 49aa211

Create app.py

Browse files

Files changed (1) hide show

app.py +303 -0

app.py ADDED Viewed

	@@ -0,0 +1,303 @@

+import gradio as gr
+import base64
+import requests
+import json
+from PIL import Image
+import io
+import os
+from typing import Optional, Tuple
class KoreanOCRApp:
    """Extract Korean text from images with Gemini 2.5 Pro on Vertex AI.

    The instance stores an OAuth access token (called ``api_key`` throughout
    for backward compatibility), a Google Cloud project ID and the Vertex AI
    region. Every public method reports problems as a user-facing string
    prefixed with "❌" instead of raising, because the return values are
    shown directly in the Gradio UI.
    """

    MODEL_ID = "gemini-2.5-pro"
    REQUEST_TIMEOUT_SECONDS = 60
    # PNG payloads above this size trigger down-scaling before upload.
    MAX_IMAGE_BYTES = 7 * 1024 * 1024
    # First bounding box tried when down-scaling; halved on every retry.
    MAX_DIMENSION = 2048
    # Stop shrinking below this edge length and send the image as is.
    MIN_DIMENSION = 256

    def __init__(self):
        self.api_key = None
        self.project_id = None
        self.location = "us-central1"  # region where Gemini 2.5 Pro is served

    def set_credentials(self, api_key: str, project_id: str) -> str:
        """Validate and store the access token and project ID.

        Both values are stripped *before* the emptiness check, so
        whitespace-only (or ``None``) input is rejected and the previously
        stored credentials are left untouched.

        Returns:
            A "✅ ..." message on success, or a "❌ ..." message when either
            value is missing.
        """
        cleaned_key = (api_key or "").strip()
        cleaned_project = (project_id or "").strip()
        if not cleaned_key or not cleaned_project:
            return "❌ API 키와 프로젝트 ID를 모두 입력해주세요."
        self.api_key = cleaned_key
        self.project_id = cleaned_project
        return "✅ 인증 정보가 설정되었습니다."

    @staticmethod
    def _encode_png(image: "Image.Image") -> bytes:
        """Serialize *image* to PNG bytes (lossless, so text stays sharp)."""
        buffer = io.BytesIO()
        image.save(buffer, format='PNG')
        return buffer.getvalue()

    @staticmethod
    def _extract_text(result) -> str:
        """Pull the recognized text out of a decoded generateContent response.

        All text parts of the first candidate are concatenated. If the
        response has no candidates the original "cannot extract" message is
        returned; if the candidate carries no text (for example because
        generation stopped early) the candidate's ``finishReason`` is
        reported so the user can see why.
        """
        candidates = result.get("candidates") if isinstance(result, dict) else None
        if not candidates:
            return "❌ 텍스트를 추출할 수 없습니다. 이미지에 한국어 텍스트가 포함되어 있는지 확인해주세요."
        candidate = candidates[0] if isinstance(candidates[0], dict) else {}
        parts = (candidate.get("content") or {}).get("parts") or []
        text = "".join(
            part.get("text") or "" for part in parts if isinstance(part, dict)
        ).strip()
        if text:
            return text
        reason = candidate.get("finishReason", "UNKNOWN")
        return f"❌ 모델이 텍스트를 반환하지 않았습니다. (finishReason: {reason})"

    def encode_image_to_base64(self, image: "Image.Image") -> str:
        """Return *image* encoded as a base64 string of its PNG bytes."""
        return base64.b64encode(self._encode_png(image)).decode('utf-8')

    def call_gemini_api(
        self,
        image_base64: str,
        api_key: Optional[str] = None,
        project_id: Optional[str] = None,
    ) -> str:
        """Send a base64 PNG to Gemini 2.5 Pro and return the extracted text.

        Args:
            image_base64: Base64-encoded PNG image data.
            api_key: Optional access token for this call only. Falls back to
                the value stored by ``set_credentials``.
            project_id: Optional project ID for this call only. Falls back to
                the value stored by ``set_credentials``.

        Returns:
            The extracted text, or a "❌ ..." message describing the failure.
            This method does not raise.
        """
        # Per-call credentials take precedence so that a request never picks
        # up values another caller stored on a shared instance in the meantime.
        api_key = api_key or self.api_key
        project_id = project_id or self.project_id
        if not api_key or not project_id:
            return "❌ 먼저 API 키와 프로젝트 ID를 설정해주세요."
        url = (
            f"https://{self.location}-aiplatform.googleapis.com/v1/projects/{project_id}"
            f"/locations/{self.location}/publishers/google/models/{self.MODEL_ID}:generateContent"
        )
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "contents": [{
                "role": "user",
                "parts": [
                    {
                        "text": """이 이미지에 포함된 모든 한국어 텍스트를 정확하게 추출해주세요.
                        다음 규칙을 따라주세요:
                        1. 이미지에서 발견되는 모든 한국어 텍스트를 순서대로 추출
                        2. 텍스트의 위치나 레이아웃을 최대한 보존
                        3. 줄바꿈과 문단 구분을 명확히 표시
                        4. 특수문자, 숫자, 영어가 포함되어 있다면 그대로 유지
                        5. 읽기 어려운 부분이 있다면 [불분명] 표시
                        추출된 텍스트만 반환해주세요."""
                    },
                    {
                        "inline_data": {
                            "mime_type": "image/png",
                            "data": image_base64
                        }
                    }
                ]
            }],
            "generation_config": {
                "temperature": 0.1,
                "top_p": 0.8,
                "top_k": 40,
                "max_output_tokens": 8192
            },
            "safety_settings": [
                {
                    "category": "HARM_CATEGORY_HARASSMENT",
                    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
                },
                {
                    "category": "HARM_CATEGORY_HATE_SPEECH",
                    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
                },
                {
                    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
                },
                {
                    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
                }
            ]
        }
        try:
            response = requests.post(
                url,
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            try:
                result = response.json()
            except ValueError:
                # Handled here, before the RequestException clause below:
                # the decode error raised by response.json() in current
                # requests releases is also a RequestException and would
                # otherwise be reported as a transport failure.
                return "❌ API 응답 파싱 오류가 발생했습니다."
            return self._extract_text(result)
        except requests.exceptions.RequestException as e:
            return f"❌ API 호출 오류: {str(e)}"
        except Exception as e:
            # Last-resort guard: callers display the return value directly.
            return f"❌ 알 수 없는 오류: {str(e)}"

    def process_image(
        self,
        image: Optional["Image.Image"],
        api_key: str,
        project_id: str,
    ) -> Tuple[Optional["Image.Image"], str]:
        """Validate input, shrink oversized images and run OCR.

        Args:
            image: Uploaded PIL image, or ``None`` when nothing was uploaded.
            api_key: Access token entered by the user.
            project_id: Google Cloud project ID entered by the user.

        Returns:
            ``(image, text)`` where ``image`` is the picture that was actually
            sent (possibly down-scaled in place) and ``text`` is either the
            extracted text or a "❌ ..." error message.
        """
        if image is None:
            return None, "❌ 이미지를 업로드해주세요."
        # Store and validate the credentials
        auth_result = self.set_credentials(api_key, project_id)
        if "❌" in auth_result:
            return image, auth_result
        try:
            # Encode once and reuse the bytes for both the size check and the
            # upload; shrink step by step while the PNG is over the limit.
            png_bytes = self._encode_png(image)
            dimension = self.MAX_DIMENSION
            while len(png_bytes) > self.MAX_IMAGE_BYTES and dimension >= self.MIN_DIMENSION:
                image.thumbnail((dimension, dimension), Image.Resampling.LANCZOS)
                png_bytes = self._encode_png(image)
                dimension //= 2
            image_base64 = base64.b64encode(png_bytes).decode('utf-8')
            # Pass this call's credentials explicitly rather than re-reading
            # them from instance state that other sessions may overwrite.
            extracted_text = self.call_gemini_api(
                image_base64,
                api_key=api_key.strip(),
                project_id=project_id.strip(),
            )
            # Echo the image back so the user can verify what was processed
            return image, extracted_text
        except Exception as e:
            return image, f"❌ 이미지 처리 중 오류가 발생했습니다: {str(e)}"


# Module-level app instance used by the UI callbacks
ocr_app = KoreanOCRApp()
def create_interface():
    """Assemble the Gradio Blocks UI for the OCR app and return it.

    The layout, top to bottom: title header, usage instructions, credential
    inputs, image upload next to a read-only preview, the editable result
    text box, and a closing notes section. The extract button is wired to
    ``ocr_app.process_image``.
    """
    # Page-level CSS
    stylesheet = """
    .gradio-container {
        font-family: 'Noto Sans KR', sans-serif;
    }
    .main-header {
        text-align: center;
        color: #2c3e50;
        margin-bottom: 20px;
    }
    .info-box {
        background-color: #e8f4fd;
        border: 1px solid #bee5eb;
        border-radius: 8px;
        padding: 15px;
        margin: 10px 0;
    }
    .error-text {
        color: #dc3545;
        font-weight: bold;
    }
    .success-text {
        color: #28a745;
        font-weight: bold;
    }
    """

    # Static Markdown copy, kept verbatim and separate from the layout code
    header_markdown = """
        # 🔍 한국어 OCR 텍스트 추출기
        ### Google Gemini 2.5 Pro를 활용한 고정밀 한국어 문자 인식
        이미지에서 한국어 텍스트를 정확하게 추출합니다. 문서, 간판, 손글씨 등 다양한 형태의 한국어를 인식할 수 있습니다.
        """
    usage_markdown = """
                ### 📋 사용 방법
                1. **Google Cloud 인증 정보 입력**
                   - API 키 (Access Token)
                   - 프로젝트 ID
                2. **이미지 업로드**
                   - PNG, JPEG, WebP 지원
                   - 최대 7MB 크기
                3. **텍스트 추출 실행**
                """
    notes_markdown = """
        ### ℹ️ 추가 정보
        **지원하는 이미지 형식:** PNG, JPEG, WebP
        **최대 파일 크기:** 7MB
        **인식 가능한 텍스트:** 한국어, 영어, 숫자, 특수문자
        **💡 팁:**
        - 선명하고 해상도가 높은 이미지일수록 인식률이 향상됩니다
        - 텍스트가 기울어져 있거나 왜곡된 경우 인식률이 떨어질 수 있습니다
        - 추출된 텍스트는 편집이 가능하며 복사 버튼을 통해 클립보드에 복사할 수 있습니다
        **🔒 개인정보 보호:**
        - 업로드된 이미지는 서버에 저장되지 않습니다
        - API 키는 세션 동안만 메모리에 임시 저장됩니다
        """

    with gr.Blocks(title="한국어 OCR - Gemini 2.5 Pro", css=stylesheet) as demo:
        gr.Markdown(header_markdown, elem_classes="main-header")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(usage_markdown, elem_classes="info-box")

        # Credentials section
        gr.Markdown("## 🔐 Google Cloud 인증 설정")
        with gr.Row():
            with gr.Column(scale=2):
                token_box = gr.Textbox(
                    type="password",
                    lines=1,
                    label="Google Cloud Access Token",
                    placeholder="Google Cloud Console에서 발급받은 Access Token을 입력하세요",
                )
            with gr.Column(scale=1):
                project_box = gr.Textbox(
                    lines=1,
                    label="프로젝트 ID",
                    placeholder="Google Cloud 프로젝트 ID",
                )

        # Upload and processing section
        gr.Markdown("## 📤 이미지 업로드 및 텍스트 추출")
        with gr.Row():
            with gr.Column(scale=1):
                upload_view = gr.Image(
                    type="pil",
                    label="📁 이미지 업로드",
                    sources=["upload", "clipboard"],
                    interactive=True,
                )
                run_button = gr.Button(
                    "🔍 텍스트 추출 시작",
                    size="lg",
                    variant="primary",
                )
            with gr.Column(scale=1):
                preview_view = gr.Image(
                    type="pil",
                    label="📋 업로드된 이미지 확인",
                    interactive=False,
                )

        # Result section; the box stays editable so users can correct the text
        gr.Markdown("## 📝 추출된 텍스트")
        result_box = gr.Textbox(
            label="인식된 한국어 텍스트",
            placeholder="추출된 텍스트가 여기에 표시됩니다...",
            interactive=True,
            show_copy_button=True,
            lines=10,
            max_lines=20,
        )

        # Event wiring
        run_button.click(
            fn=ocr_app.process_image,
            inputs=[upload_view, token_box, project_box],
            outputs=[preview_view, result_box],
            show_progress=True,
        )

        # Closing notes
        gr.Markdown(notes_markdown)

    return demo
# Script entry point
if __name__ == "__main__":
    # Options handed to Gradio's launch()
    launch_options = {
        "server_name": "0.0.0.0",  # listen on every network interface
        "server_port": 7860,       # port number
        "share": True,             # create a public share link
        "debug": True,             # debug mode
        "show_error": True,        # show errors in the UI
        "inbrowser": True,         # open a browser tab automatically
    }
    # Build the interface, then start the server
    demo = create_interface()
    demo.launch(**launch_options)