riu-rd committed
Commit fc66c87 · verified · 1 Parent(s): 8c39717

Upload 16 files

.env.example ADDED
@@ -0,0 +1 @@
1
+ GEMINI_API_KEY= # YOUR API KEY HERE
.gitignore ADDED
@@ -0,0 +1,60 @@
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.development
5
+ .env.test
6
+ .env.production
7
+
8
+ # Python
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ *.so
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # Virtual environment
32
+ venv/
33
+ env/
34
+ ENV/
35
+ env.bak/
36
+ venv.bak/
37
+
38
+ # IDE
39
+ .vscode/
40
+ .idea/
41
+ *.swp
42
+ *.swo
43
+ *~
44
+
45
+ # OS
46
+ .DS_Store
47
+ .DS_Store?
48
+ ._*
49
+ .Spotlight-V100
50
+ .Trashes
51
+ ehthumbs.db
52
+ Thumbs.db
53
+
54
+ # Logs
55
+ *.log
56
+ logs/
57
+
58
+ # Temporary files
59
+ *.tmp
60
+ *.temp
Dockerfile ADDED
@@ -0,0 +1,29 @@
1
+ FROM python:3.11.8
2
+
3
+ WORKDIR /
4
+
5
+ # Copy requirements.txt to the container
6
+ COPY requirements.txt ./
7
+
8
+ # Install Python dependencies
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Add a non-root user to run the application
12
+ RUN useradd -m -u 1000 user
13
+
14
+ # Set the user and home directory environment variables
15
+ USER user
16
+ ENV HOME=/home/user \
17
+ PATH=/home/user/.local/bin:$PATH
18
+
19
+ # Create the application directory
20
+ WORKDIR $HOME/app
21
+
22
+ # Copy the application code and model files
23
+ COPY --chown=user . $HOME/app/
24
+
25
+ # Expose the port the FastAPI app runs on
26
+ EXPOSE 7860
27
+
28
+ # Command to run the FastAPI app
29
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,70 @@
1
- ---
2
- title: Contact Center Operations
3
- emoji:
4
- colorFrom: red
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: Contact Center Operation Insights
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Document Processor Backend
2
+
3
+ This is the FastAPI backend for the Contact Center Operation Insights ("Document Processor") application. It uses Google's Gemini models and OpenAI Whisper to transcribe, translate, and diarize call audio, and to extract structured information from scanned bank documents.
4
+
5
+ ## Setup Instructions
6
+
7
+ ### 1. Environment Variables
8
+
9
+ To use this application, you need to set up your Gemini API key:
10
+
11
+ 1. **Get your Gemini API key:**
12
+ - Go to [Google AI Studio](https://makersuite.google.com/app/apikey)
13
+ - Create a new API key
14
+ - Copy the API key
15
+
16
+ 2. **Create a `.env` file:**
17
+ ```bash
18
+ # In the project root, create a .env file from the provided example
19
+ cp .env.example .env
20
+ ```
21
+
22
+ 3. **Edit the `.env` file:**
23
+ ```bash
24
+ # Replace 'your_actual_api_key_here' with your real API key
25
+ GEMINI_API_KEY=your_actual_api_key_here
26
+ ```
27
+
28
+ ### 2. Install Dependencies
29
+
30
+ ```bash
31
+ # Create virtual environment
32
+ python3 -m venv venv
33
+
34
+ # Activate virtual environment
35
+ source venv/bin/activate # On Windows: venv\Scripts\activate
36
+
37
+ # Install dependencies
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### 3. Run the Server
42
+
43
+ ```bash
44
+ # Make sure virtual environment is activated
45
+ source venv/bin/activate
46
+
47
+ # Start the server
48
+ uvicorn api:app --reload --host 0.0.0.0 --port 8000
49
+ ```
50
+
51
+ The server will be available at `http://localhost:8000`
52
+
53
+ ## API Endpoints
54
+
55
+ - `GET /` - Redirects to the interactive API docs (`/docs`)
+ - `GET /health` - Health check
+ - `POST /audio/whisper` - Transcribe and translate audio with Whisper
+ - `POST /audio/gemini` - Transcribe and translate audio with Gemini 2.5 Pro
+ - `POST /audio/diarization` - Speaker diarization with Gemini 2.5 Pro
+ - `POST /image/process-document` - Extract structured data from a document image or PDF (Gemini 1.5 Flash)
+ - `POST /text` - Placeholder text-insights endpoint
58
+
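+ A minimal sketch of calling the API from Python with the `requests` library (already listed in `requirements.txt`). The file names `call.wav` and `deposit_slip.png` are placeholders for your own test files:
+
+ ```python
+ import requests
+
+ BASE_URL = "http://localhost:8000"
+
+ # Transcribe and translate an audio file with Whisper
+ with open("call.wav", "rb") as f:
+     r = requests.post(f"{BASE_URL}/audio/whisper",
+                       files={"audio": ("call.wav", f, "audio/wav")})
+ print(r.json())  # {"transcription": "...", "translation": "..."}
+
+ # Extract structured data from a scanned bank document (image or PDF)
+ with open("deposit_slip.png", "rb") as f:
+     r = requests.post(f"{BASE_URL}/image/process-document",
+                       files={"document": ("deposit_slip.png", f, "image/png")})
+ print(r.json()["document_type"])
+ ```
+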
59
+ ## Security Notes
60
+
61
+ - The `.env` file is automatically ignored by git to prevent accidentally committing your API key
62
+ - Never commit your actual API key to version control
63
+ - Keep your API key secure and don't share it publicly
64
+
65
+ ## Troubleshooting
66
+
67
+ If you get an error about `GEMINI_API_KEY not set in environment`, make sure:
68
+ 1. You've created the `.env` file in the project root
69
+ 2. You've added your actual API key to the file
70
+ 3. The virtual environment is activated when running the server (see the quick check below)
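+
+ As a quick check (a minimal sketch, run from the project root), you can verify that the key is picked up the same way the services load it:
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()  # reads .env from the current working directory
+ print("GEMINI_API_KEY set:", bool(os.getenv("GEMINI_API_KEY")))
+ ```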
api.py ADDED
@@ -0,0 +1,244 @@
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import RedirectResponse
4
+ import uvicorn
5
+ from typing import Any, Dict
6
+ from pydantic import BaseModel
7
+
8
+ from services.audio_whisper import process_audio_with_whisper
9
+ from services.audio_gemini import process_audio_with_gemini
10
+ from services.audio_diarization import process_audio_diarization, AudioDiarizationError
11
+ from services.image_ocr_processor import process_pdf_to_image, process_document_image
12
+
13
+ class TextRequest(BaseModel):
14
+ text: str
15
+
16
+
17
+ class HelloWorldResponse(BaseModel):
18
+ message: str
19
+ received_text: str
20
+ status: str
21
+
22
+ app = FastAPI(
23
+ title="Contact Center Operation Insights",
24
+ version="1.0.0"
25
+ )
26
+
27
+ # Configure CORS
28
+ app.add_middleware(
29
+ CORSMiddleware,
30
+ allow_origins=["*"],
31
+ allow_credentials=False,
32
+ allow_methods=["*"],
33
+ allow_headers=["*"],
34
+ )
35
+
36
+ @app.get("/")
37
+ async def docs():
38
+ return RedirectResponse(url="/docs")
39
+
40
+ @app.post("/audio/whisper", response_model=Dict[str, str])
41
+ async def audio_whisper(audio: UploadFile = File(...)):
42
+ """
43
+ Transcribes and translates an audio file using OpenAI's Whisper model.
44
+ """
45
+ # Basic validation for audio content types. Whisper is robust, but this
46
+ # prevents obviously incorrect file types from being processed.
47
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
48
+ raise HTTPException(
49
+ status_code=400,
50
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
51
+ )
52
+
53
+ try:
54
+ # Read the content of the uploaded audio file into memory
55
+ audio_bytes = await audio.read()
56
+
57
+ # Call the dedicated service to process the audio
58
+ result = process_audio_with_whisper(audio_bytes)
59
+
60
+ return result
61
+
62
+ except Exception as e:
63
+ # Catch exceptions from the audio processing service or file reading
64
+ raise HTTPException(status_code=500, detail=f"Audio processing failed: {str(e)}")
65
+
66
+ @app.post("/audio/gemini", response_model=Dict[str, str])
67
+ async def audio_gemini(audio: UploadFile = File(...)):
68
+ """
69
+ Receives an audio file, transcribes it, and translates the transcription
70
+ to English using the Google Gemini 2.5 Pro model.
71
+ """
72
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
73
+ raise HTTPException(
74
+ status_code=400,
75
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
76
+ )
77
+
78
+ try:
79
+ audio_bytes = await audio.read()
80
+
81
+ result = process_audio_with_gemini(audio_bytes=audio_bytes)
82
+
83
+ return result
84
+
85
+ except Exception as e:
86
+ # Catches exceptions from file reading or the Gemini service
87
+ raise HTTPException(status_code=500, detail=f"Audio processing with Gemini failed: {str(e)}")
88
+
89
+ @app.post("/audio/diarization")
90
+ async def audio_diarization(audio: UploadFile = File(...)) -> Dict[str, Any]:
91
+ """
92
+ Process audio file for speaker diarization using Google Gemini 2.5 Pro.
93
+
94
+ This endpoint accepts audio files and returns speaker diarization results,
95
+ identifying different speakers and their spoken text segments throughout
96
+ the conversation.
97
+ """
98
+ # Validate file type - accept common audio formats
99
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
100
+ raise HTTPException(
101
+ status_code=400,
102
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file (WAV, MP3, MP4, M4A)."
103
+ )
104
+
105
+ # Additional validation for specific audio formats that work well with diarization
106
+ supported_types = [
107
+ 'audio/wav', 'audio/wave', 'audio/x-wav',
108
+ 'audio/mpeg', 'audio/mp3',
109
+ 'audio/mp4', 'audio/m4a', 'audio/x-m4a'
110
+ ]
111
+
112
+ if audio.content_type not in supported_types:
113
+ # Still allow processing but warn about potential issues
114
+ pass # Gemini is quite robust with audio formats
115
+
116
+ try:
117
+ # Read the uploaded audio file content
118
+ audio_bytes = await audio.read()
119
+
120
+ # Validate file size (optional - adjust based on your needs)
121
+ max_size_mb = 100 # 100MB limit
122
+ if len(audio_bytes) > max_size_mb * 1024 * 1024:
123
+ raise HTTPException(
124
+ status_code=400,
125
+ detail=f"File too large. Maximum size allowed is {max_size_mb}MB."
126
+ )
127
+
128
+ # Validate minimum file size to ensure it's not empty
129
+ if len(audio_bytes) < 1000: # Less than 1KB
130
+ raise HTTPException(
131
+ status_code=400,
132
+ detail="File appears to be empty or too small to process."
133
+ )
134
+
135
+ # Process the audio file for speaker diarization
136
+ result = process_audio_diarization(
137
+ audio_bytes=audio_bytes,
138
+ filename=audio.filename # type: ignore
139
+ )
140
+
141
+ return result
142
+
143
+ except AudioDiarizationError as e:
144
+ # Handle specific diarization errors with appropriate HTTP status
145
+ if "API key" in str(e).lower():
146
+ raise HTTPException(
147
+ status_code=500,
148
+ detail="Audio diarization service configuration error. Please contact support."
149
+ )
150
+ elif "format" in str(e).lower():
151
+ raise HTTPException(
152
+ status_code=400,
153
+ detail=f"Audio format error: {str(e)}"
154
+ )
155
+ else:
156
+ raise HTTPException(
157
+ status_code=500,
158
+ detail=f"Audio diarization failed: {str(e)}"
159
+ )
160
+
161
+ except HTTPException:
162
+ # Re-raise HTTP exceptions as-is
163
+ raise
164
+
165
+ except Exception as e:
166
+ # Catch any unexpected errors
167
+ raise HTTPException(
168
+ status_code=500,
169
+ detail=f"Unexpected error during audio diarization: {str(e)}"
170
+ )
171
+
172
+ @app.post("/image/process-document")
173
+ async def process_document(document: UploadFile = File(...)):
174
+ """
175
+ Process uploaded document (image or PDF) and extract information [Model: Gemini 1.5 Flash]
176
+ """
177
+ try:
178
+ # Read file content
179
+ file_bytes = await document.read()
180
+
181
+ # Handle different file types
182
+ if document.content_type.startswith('image/'): # type: ignore
183
+ # Process image directly
184
+ image_bytes = file_bytes
185
+ elif document.content_type == 'application/pdf':
186
+ # Convert PDF to image first
187
+ image_bytes = process_pdf_to_image(file_bytes)
188
+ else:
189
+ raise HTTPException(
190
+ status_code=400,
191
+ detail="Unsupported file type. Please upload an image (JPG, PNG, etc.) or PDF file."
192
+ )
193
+
194
+ # Process the document
195
+ result = process_document_image(image_bytes, document.filename)
196
+
197
+ return result
198
+
199
+ except Exception as e:
200
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
201
+
202
+ @app.post("/text", response_model=HelloWorldResponse)
203
+ async def text_insights(request: TextRequest) -> HelloWorldResponse:
204
+ """
205
+ Simple text to insights endpoint
206
+ """
207
+ try:
208
+ # Basic validation
209
+ if not request.text.strip():
210
+ raise HTTPException(
211
+ status_code=400,
212
+ detail="Text cannot be empty or contain only whitespace."
213
+ )
214
+
215
+ response = HelloWorldResponse(
216
+ message="Hello World! Text processing completed successfully.",
217
+ received_text=request.text,
218
+ status="success"
219
+ )
220
+
221
+ return response
222
+
223
+ except HTTPException:
224
+ raise
225
+
226
+ except Exception as e:
227
+ raise HTTPException(
228
+ status_code=500,
229
+ detail=f"Unexpected error processing text: {str(e)}"
230
+ )
231
+
232
+ @app.get("/health")
233
+ async def health_check():
234
+ """Health check endpoint"""
235
+ return {"status": "healthy", "service": "document-processor"}
236
+
237
+ if __name__ == "__main__":
238
+ uvicorn.run(
239
+ "api:app",
240
+ host="0.0.0.0",
241
+ port=8000,
242
+ reload=True,
243
+ reload_dirs=["."]
244
+ )
requirements.txt ADDED
@@ -0,0 +1,26 @@
1
+ # Python version: 3.11
2
+ fastapi
3
+ uvicorn
4
+
5
+ python-multipart
6
+ Pillow
7
+ torch
8
+ transformers
9
+ nltk
10
+ python-dotenv
11
+ PyPDF2
12
+ pdf2image
13
+ openai
14
+
15
+ google
16
+ google-genai
17
+ google-api-core
18
+ pprintpp
19
+ pydub
20
+ ffmpeg-python
21
+ requests
22
+ google-cloud-aiplatform
23
+ librosa
24
+ soundfile
25
+ openai-whisper
26
+ pydantic
services/__pycache__/audio_diarization.cpython-311.pyc ADDED
Binary file (12.6 kB).
 
services/__pycache__/audio_gemini.cpython-311.pyc ADDED
Binary file (3.55 kB).
 
services/__pycache__/audio_whisper.cpython-311.pyc ADDED
Binary file (3.92 kB).
 
services/__pycache__/image_ocr_processor.cpython-311.pyc ADDED
Binary file (25.6 kB).
 
services/__pycache__/ocr_processor.cpython-311.pyc ADDED
Binary file (25.6 kB).
 
services/audio_diarization.py ADDED
@@ -0,0 +1,323 @@
1
+ import json
2
+ import base64
3
+ import os
4
+ from typing import Dict, List, Any, Tuple
5
+ from pydub import AudioSegment
6
+ import io
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+
12
+ class AudioDiarizationError(Exception):
13
+ """Custom exception for audio diarization errors"""
14
+ pass
15
+
16
+
17
+ def get_gemini_client() -> genai.Client:
18
+ """
19
+ Initialize and return a Google Gemini API client.
20
+
21
+ Returns:
22
+ genai.Client: Authenticated Gemini client
23
+
24
+ Raises:
25
+ AudioDiarizationError: If API key is not found or client initialization fails
26
+ """
27
+ api_key = os.getenv("GEMINI_API_KEY")
28
+ if not api_key:
29
+ raise AudioDiarizationError("GEMINI_API_KEY environment variable not found")
30
+
31
+ try:
32
+ client = genai.Client(api_key=api_key)
33
+ return client
34
+ except Exception as e:
35
+ raise AudioDiarizationError(f"Failed to initialize Gemini client: {str(e)}")
36
+
37
+
38
+ def get_audio_duration(audio_bytes: bytes) -> float:
39
+ """
40
+ Get the duration of audio in seconds.
41
+
42
+ Args:
43
+ audio_bytes: Raw audio file bytes
44
+
45
+ Returns:
46
+ float: Duration in seconds
47
+
48
+ Raises:
49
+ AudioDiarizationError: If audio processing fails
50
+ """
51
+ try:
52
+ # Create AudioSegment from bytes
53
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
54
+ duration_sec = len(audio) / 1000.0
55
+ return duration_sec
56
+ except Exception as e:
57
+ raise AudioDiarizationError(f"Failed to process audio duration: {str(e)}")
58
+
59
+
60
+ def detect_audio_format(audio_bytes: bytes) -> str:
61
+ """
62
+ Detect audio format from bytes.
63
+
64
+ Args:
65
+ audio_bytes: Raw audio file bytes
66
+
67
+ Returns:
68
+ str: Audio format (e.g., 'wav', 'mp3', 'mp4')
69
+
70
+ Raises:
71
+ AudioDiarizationError: If format detection fails
72
+ """
73
+ try:
74
+ # Try to create AudioSegment to detect format
75
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
76
+
77
+ # Check file signature/magic bytes for common formats
78
+ if audio_bytes.startswith(b'RIFF') and b'WAVE' in audio_bytes[:12]:
79
+ return 'wav'
80
+ elif audio_bytes.startswith(b'ID3') or audio_bytes.startswith(b'\xff\xfb'):
81
+ return 'mp3'
82
+ elif audio_bytes.startswith(b'\x00\x00\x00\x20ftypM4A'):
83
+ return 'm4a'
84
+ elif audio_bytes.startswith(b'\x00\x00\x00\x18ftyp') or audio_bytes.startswith(b'\x00\x00\x00\x20ftyp'):
85
+ return 'mp4'
86
+ else:
87
+ # Default to wav if we can't detect
88
+ return 'wav'
89
+ except Exception as e:
90
+ raise AudioDiarizationError(f"Failed to detect audio format: {str(e)}")
91
+
92
+
93
+ def create_diarization_request(audio_bytes: bytes, audio_format: str, model: str = "gemini-2.5-pro") -> Dict[str, Any]:
94
+ """
95
+ Create a diarization request for the Gemini API.
96
+
97
+ Args:
98
+ audio_bytes: Raw audio file bytes
99
+ audio_format: Audio file format (e.g., 'wav', 'mp3')
100
+ model: Gemini model to use
101
+
102
+ Returns:
103
+ Dict containing the API request configuration
104
+
105
+ Raises:
106
+ AudioDiarizationError: If request creation fails
107
+ """
108
+ try:
109
+ # Encode audio to base64
110
+ audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
111
+
112
+ # Prepare request parts
113
+ audio_part = {
114
+ "inlineData": {
115
+ "mimeType": f"audio/{audio_format}",
116
+ "data": audio_b64
117
+ }
118
+ }
119
+
120
+ text_part = {
121
+ "text": (
122
+ "You are a speaker-diarization engine. "
123
+ "For the audio input, return a JSON object with a top-level `segments` array. "
124
+ "Each segment must have: `speaker` (string) and `text` (transcript)."
125
+ )
126
+ }
127
+
128
+ # Define JSON schema for structured response
129
+ schema = {
130
+ "type": "object",
131
+ "properties": {
132
+ "segments": {
133
+ "type": "array",
134
+ "items": {
135
+ "type": "object",
136
+ "properties": {
137
+ "speaker": {"type": "string"},
138
+ "text": {"type": "string"}
139
+ },
140
+ "required": ["speaker", "text"]
141
+ }
142
+ }
143
+ },
144
+ "required": ["segments"]
145
+ }
146
+
147
+ # Build configuration for JSON mode
148
+ config = types.GenerateContentConfig(
149
+ response_mime_type="application/json",
150
+ response_schema=schema
151
+ )
152
+
153
+ # Build complete request
154
+ request_kwargs = {
155
+ "model": model,
156
+ "contents": [audio_part, text_part],
157
+ "config": config
158
+ }
159
+
160
+ return request_kwargs
161
+
162
+ except Exception as e:
163
+ raise AudioDiarizationError(f"Failed to create diarization request: {str(e)}")
164
+
165
+
166
+ def parse_diarization_response(response_text: str) -> Tuple[List[Dict[str, str]], Dict[str, Any]]:
167
+ """
168
+ Parse the Gemini API response for diarization results.
169
+
170
+ Args:
171
+ response_text: Raw JSON response text from Gemini
172
+
173
+ Returns:
174
+ Tuple of (segments_list, raw_json_dict)
175
+
176
+ Raises:
177
+ AudioDiarizationError: If JSON parsing fails
178
+ """
179
+ try:
180
+ raw_json = json.loads(response_text)
181
+ segments = raw_json.get("segments", [])
182
+
183
+ # Validate segments structure
184
+ if not isinstance(segments, list):
185
+ raise AudioDiarizationError("Response segments must be a list")
186
+
187
+ for i, segment in enumerate(segments):
188
+ if not isinstance(segment, dict):
189
+ raise AudioDiarizationError(f"Segment {i} must be a dictionary")
190
+ if "speaker" not in segment or "text" not in segment:
191
+ raise AudioDiarizationError(f"Segment {i} missing required fields 'speaker' or 'text'")
192
+
193
+ return segments, raw_json
194
+
195
+ except json.JSONDecodeError as e:
196
+ raise AudioDiarizationError(f"Failed to parse JSON from Gemini response: {str(e)}")
197
+ except Exception as e:
198
+ raise AudioDiarizationError(f"Failed to process diarization response: {str(e)}")
199
+
200
+
201
+ def calculate_diarization_stats(segments: List[Dict[str, str]], duration_sec: float) -> Dict[str, Any]:
202
+ """
203
+ Calculate statistics from diarization results.
204
+
205
+ Args:
206
+ segments: List of speaker segments
207
+ duration_sec: Audio duration in seconds
208
+
209
+ Returns:
210
+ Dict containing diarization statistics
211
+ """
212
+ total_turns = len(segments)
213
+ speakers = set(segment["speaker"] for segment in segments)
214
+ num_speakers = len(speakers)
215
+
216
+ # Format duration as MM:SS
217
+ duration_str = f"{int(duration_sec//60):02d}:{int(duration_sec%60):02d}"
218
+
219
+ return {
220
+ "total_turns": total_turns,
221
+ "num_speakers": num_speakers,
222
+ "duration_seconds": duration_sec,
223
+ "duration_formatted": duration_str,
224
+ "speakers": sorted(list(speakers))
225
+ }
226
+
227
+
228
+ def process_audio_diarization(audio_bytes: bytes, filename: str = None) -> Dict[str, Any]: # type: ignore
229
+ """
230
+ Process audio file for speaker diarization using Gemini 2.5 Pro.
231
+
232
+ This function takes raw audio bytes and returns a structured JSON response
233
+ containing speaker diarization results with segments, statistics, and metadata.
234
+
235
+ Args:
236
+ audio_bytes: Raw audio file bytes
237
+ filename: Optional filename for metadata
238
+
239
+ Returns:
240
+ Dict containing:
241
+ - segments: List of speaker segments with speaker and text
242
+ - statistics: Diarization statistics (speakers, turns, duration)
243
+ - metadata: Processing metadata
244
+ - raw_response: Original Gemini response
245
+
246
+ Raises:
247
+ AudioDiarizationError: If any step of the diarization process fails
248
+ """
249
+ try:
250
+ # Initialize Gemini client
251
+ client = get_gemini_client()
252
+
253
+ # Get audio duration and format
254
+ duration_sec = get_audio_duration(audio_bytes)
255
+ audio_format = detect_audio_format(audio_bytes)
256
+
257
+ # Create API request
258
+ request_kwargs = create_diarization_request(audio_bytes, audio_format)
259
+
260
+ # Make API call to Gemini
261
+ try:
262
+ response = client.models.generate_content(**request_kwargs)
263
+ response_text = response.text
264
+ except Exception as e:
265
+ raise AudioDiarizationError(f"Gemini API call failed: {str(e)}")
266
+
267
+ # Parse response
268
+ segments, raw_json = parse_diarization_response(response_text) # type: ignore
269
+
270
+ # Calculate statistics
271
+ stats = calculate_diarization_stats(segments, duration_sec)
272
+
273
+ # Build final response
274
+ result = {
275
+ "segments": segments,
276
+ "statistics": stats,
277
+ "metadata": {
278
+ "filename": filename,
279
+ "audio_format": audio_format,
280
+ "model_used": "gemini-2.5-pro",
281
+ "processing_status": "success"
282
+ },
283
+ "raw_response": raw_json
284
+ }
285
+
286
+ return result
287
+
288
+ except AudioDiarizationError:
289
+ # Re-raise our custom errors
290
+ raise
291
+ except Exception as e:
292
+ # Catch any unexpected errors
293
+ raise AudioDiarizationError(f"Unexpected error during audio diarization: {str(e)}")
294
+
295
+
296
+ # Example usage and testing function
297
+ def test_diarization_service():
298
+ """
299
+ Test function for the diarization service.
300
+ This is mainly for development and debugging purposes.
301
+ """
302
+ try:
303
+ # This would require an actual audio file to test
304
+ print("Audio diarization service loaded successfully")
305
+ print("Available functions:")
306
+ print("- process_audio_diarization(audio_bytes, filename)")
307
+ print("- get_gemini_client()")
308
+ print("- get_audio_duration(audio_bytes)")
309
+ print("- detect_audio_format(audio_bytes)")
310
+
311
+ # Check if API key is available
312
+ api_key = os.getenv("GEMINI_API_KEY")
313
+ if api_key:
314
+ print("✓ GEMINI_API_KEY found in environment")
315
+ else:
316
+ print("✗ GEMINI_API_KEY not found in environment")
317
+
318
+ except Exception as e:
319
+ print(f"Service test failed: {e}")
320
+
321
+
322
+ if __name__ == "__main__":
323
+ test_diarization_service()
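+
+
+ # Illustrative only: the dictionary returned by process_audio_diarization() has the
+ # following shape (the values below are made-up examples, not real output):
+ #
+ # {
+ #     "segments": [{"speaker": "Speaker 1", "text": "Hello, thank you for calling."}],
+ #     "statistics": {"total_turns": 1, "num_speakers": 1,
+ #                    "duration_seconds": 12.3, "duration_formatted": "00:12",
+ #                    "speakers": ["Speaker 1"]},
+ #     "metadata": {"filename": "call.wav", "audio_format": "wav",
+ #                  "model_used": "gemini-2.5-pro", "processing_status": "success"},
+ #     "raw_response": {"segments": [...]}
+ # }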
services/audio_gemini.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+ from typing import Dict
3
+
4
+ import google.genai as genai
5
+ from dotenv import load_dotenv
6
+ from google.genai.types import Part
7
+
8
+ # Load environment variables from a .env file in the root directory
9
+ load_dotenv()
10
+
11
+ # --- Configuration ---
12
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
13
+ MODEL_ID = "gemini-2.5-pro"
14
+
15
+ # --- Client Initialization ---
16
+ if not GEMINI_API_KEY:
17
+ raise ValueError(
18
+ "GEMINI_API_KEY not found in environment variables. "
19
+ "Please create a .env file in the project root and set the key."
20
+ )
21
+
22
+ # Configure the genai client with the API key
23
+ client = genai.Client(api_key=GEMINI_API_KEY)
24
+
25
+ def _transcribe_audio(audio_bytes: bytes) -> str:
26
+ """
27
+ Sends the raw audio bytes (assumed to be WAV) to the model and returns the transcription as plain text.
28
+ """
29
+ audio_part = Part.from_bytes(data=audio_bytes, mime_type="audio/wav")
30
+ text_part = (
31
+ "You are a world-class transcription engine. "
32
+ "Transcribe the following audio to plain text only, with no extra formatting:\n\n"
33
+ "(Begin audio input)"
34
+ )
35
+
36
+ resp = client.models.generate_content(
37
+ model=MODEL_ID,
38
+ contents=[audio_part,
39
+ text_part
40
+ ]
41
+ )
42
+ return resp.text.strip() # type: ignore
43
+
44
+
45
+ def _translate_to_english(text: str) -> str:
46
+ """
47
+ Detects the language of the input and translates it into English.
48
+ """
49
+ prompt = (
50
+ "You are a world-class translation engine. "
51
+ "Detect the language of the following text and translate it into English. "
52
+ "Return ONLY the translated English text with no extra commentary:\n\n"
53
+ f"{text}"
54
+ )
55
+ resp = client.models.generate_content(
56
+ model=MODEL_ID,
57
+ contents=prompt
58
+ )
59
+ return resp.text.strip() # type: ignore
60
+
61
+
62
+ def process_audio_with_gemini(audio_bytes: bytes) -> Dict[str, str]:
63
+ """
64
+ Processes an audio file by first transcribing it and then translating the
65
+ resulting text to English using the Gemini model.
66
+
67
+ This function orchestrates the transcription and translation calls.
68
+
69
+ Args:
70
+ audio_bytes: The byte content of the audio file.
71
+ mime_type: The MIME type of the audio file (e.g., 'audio/wav', 'audio/mp3').
72
+
73
+ Returns:
74
+ A dictionary containing the 'transcription' and 'translation'.
75
+
76
+ Raises:
77
+ Exception: If there is an error during the API calls to the Gemini model.
78
+ """
79
+ try:
80
+ # Step 1: Transcribe the audio using the internal helper function
81
+ transcription = _transcribe_audio(audio_bytes)
82
+
83
+ # Step 2: Translate the transcription to English if it's not empty
84
+ translation = ""
85
+ if transcription:
86
+ translation = _translate_to_english(transcription)
87
+
88
+ return {"transcription": transcription, "translation": translation}
89
+ except Exception as e:
90
+ # Re-raise the exception with more context to be caught by the API endpoint
91
+ raise Exception(f"Error processing audio with Gemini: {str(e)}")
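+
+
+ # Example usage (a minimal sketch; "sample_call.wav" is a placeholder path):
+ #
+ #     with open("sample_call.wav", "rb") as f:
+ #         result = process_audio_with_gemini(f.read())
+ #     print(result["transcription"])
+ #     print(result["translation"])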
services/audio_whisper.py ADDED
@@ -0,0 +1,81 @@
1
+ import whisper
2
+ import torch
3
+ import tempfile
4
+ import os
5
+ from typing import Dict
6
+
7
+ # Determine the most efficient device available (CUDA if possible, otherwise CPU)
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+ # Load the Whisper model once when the module is imported.
11
+ # This is a time and resource-intensive operation, so it should not be done on every API call.
12
+ try:
13
+ print(f"Loading Whisper model 'large' onto device '{DEVICE}'...")
14
+ model = whisper.load_model("large", device=DEVICE)
15
+ print("Whisper model loaded successfully.")
16
+ except Exception as e:
17
+ print(f"Fatal: Error loading Whisper model: {e}")
18
+ model = None
19
+
20
+ def process_audio_with_whisper(audio_bytes: bytes) -> Dict[str, str]:
21
+ """
22
+ Transcribes and translates a given audio file's bytes using the Whisper model.
23
+
24
+ This function saves the audio bytes to a temporary file and passes the file
25
+ path to Whisper for processing. This is a robust way to handle file access
26
+ and prevent permission errors with ffmpeg, especially on Windows.
27
+
28
+ Args:
29
+ audio_bytes: The raw bytes of the audio file (e.g., WAV, MP3).
30
+
31
+ Returns:
32
+ A dictionary containing the Tagalog transcription and English translation.
33
+ Example: {"transcription": "...", "translation": "..."}
34
+
35
+ Raises:
36
+ ValueError: If the Whisper model was not loaded successfully.
37
+ Exception: If audio processing or model inference fails.
38
+ """
39
+ if model is None:
40
+ raise ValueError("Whisper model is not available or failed to load.")
41
+
42
+ # Create a temporary file to store the audio.
43
+ # Using delete=False is crucial on Windows to allow other processes to open the file by its path.
44
+ # We will manually delete the file in the 'finally' block.
45
+ try:
46
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".tmp") as temp_audio_file:
47
+ temp_path = temp_audio_file.name
48
+ # Write the uploaded audio bytes to the temporary file
49
+ temp_audio_file.write(audio_bytes)
50
+ # The file is automatically closed when exiting the 'with' block
51
+ except Exception as e:
52
+ print(f"Error creating temporary file: {e}")
53
+ raise
54
+
55
+ try:
56
+ # Perform transcription using the file path
57
+ transcription_result = model.transcribe(
58
+ temp_path,
59
+ language="tl",
60
+ task="transcribe"
61
+ )
62
+
63
+ # Perform translation using the same file path
64
+ translation_result = model.transcribe(
65
+ temp_path,
66
+ language="tl",
67
+ task="translate"
68
+ )
69
+
70
+ return {
71
+ "transcription": transcription_result.get('text', '').strip(), # type: ignore
72
+ "translation": translation_result.get('text', '').strip() # type: ignore
73
+ }
74
+ except Exception as e:
75
+ # Log and re-raise any exceptions to be handled by the FastAPI endpoint
76
+ print(f"An error occurred during Whisper processing: {e}")
77
+ raise
78
+ finally:
79
+ # Ensure the temporary file is deleted after processing
80
+ if 'temp_path' in locals() and os.path.exists(temp_path):
81
+ os.remove(temp_path)
services/image_ocr_processor.py ADDED
@@ -0,0 +1,807 @@
1
+ import io
2
+ import re
3
+ import math
4
+ import json
5
+ import os
6
+ import torch
7
+ import nltk
8
+ from nltk.corpus import words as nltk_words
9
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
10
+ from PIL import Image
11
+ from google import genai
12
+ from dotenv import load_dotenv
13
+ from difflib import SequenceMatcher
14
+ from io import BytesIO
15
+ from pdf2image import convert_from_bytes
16
+
17
+ from google.genai.types import (
18
+ Part
19
+ )
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # Download NLTK data
25
+ try:
26
+ nltk.data.find('corpora/words')
27
+ except LookupError:
28
+ nltk.download("words", quiet=True)
29
+
30
+ # ─────────────────────────────────────────────────────────────
31
+ # 0) Process PDF to Image
32
+ # ─────────────────────────────────────────────────────────────
33
+ def process_pdf_to_image(pdf_bytes):
34
+ """
35
+ Convert PDF to image for processing
36
+ """
37
+ try:
38
+ # Convert PDF to images (first page only)
39
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
40
+ if not images:
41
+ raise Exception("Could not convert PDF to image")
42
+
43
+ # Convert PIL image to bytes
44
+ img_byte_arr = io.BytesIO()
45
+ images[0].save(img_byte_arr, format='PNG')
46
+ img_byte_arr = img_byte_arr.getvalue()
47
+
48
+ return img_byte_arr
49
+ except Exception as e:
50
+ raise Exception(f"PDF processing failed: {str(e)}")
51
+
52
+ # ─────────────────────────────────────────────────────────────
53
+ # 1) API key and document prompts
54
+ # ─────────────────────────────────────────────────────────────
55
+
56
+ # Load environment variables and configure Gemini API
57
+ API_KEY = os.getenv("GEMINI_API_KEY")
58
+ if not API_KEY:
59
+ raise RuntimeError("GEMINI_API_KEY not set in environment. Please create a .env file with your API key.")
60
+
61
+ # Configure the SDK
62
+ client = genai.Client(vertexai=False, api_key=API_KEY)
63
+
64
+ # Initialize your model
65
+ MODEL_ID = "gemini-1.5-flash"
66
+
67
+ # ─────────────────────────────────────────────────────────────
68
+ # 2) Ground Truth Data
69
+ # ─────────────────────────────────────────────────────────────
70
+
71
+ GROUND_TRUTHS = {
72
+ "CIF-Good.png": '''{
73
+ "document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
74
+ "bank_name": "BPI",
75
+ "personal_information": {
76
+ "rm_no": null,
77
+ "last_name": "Garnet",
78
+ "first_name": "Lawrence",
79
+ "middle_name": "Dela Cruz",
80
+ "suffix": "III",
81
+ "date_of_birth": "10/21/1962",
82
+ "place_of_birth": "Rizal, Philippines",
83
+ "citizenship": null,
84
+ "sex": "Male",
85
+ "marital_status": "Married",
86
+ "mother_s_full_maiden_name": "Rosa H. Dela Cruz",
87
+ "spouse_name": "Marion V. Garnet",
88
+ "tin_number": null,
89
+ "sss_number": null,
90
+ "spouse_birthdate": "8/10/1965",
91
+ "id_presented": {
92
+ "id_type": "Drivers",
93
+ "id_number": "2961781134"
94
+ },
95
+ "no_of_children": 2,
96
+ "highest_educational_attainment": "College Graduate"
97
+ },
98
+ "contact_information": {
99
+ "mobile_no": "+63 917 926 9175",
100
+ "landline_no": null,
101
+ "email_address": "[email protected]",
102
+ "home_address": "Amorsolo St. Brgy. Aguinaldo",
103
+ "country": "Philippines",
104
+ "zip_code": "1366",
105
+ "district_town": null,
106
+ "city_municipality_provice": "Rizal",
107
+ "residence_since_mm_dd_yyyy": null,
108
+ "home_ownership": "Owned"
109
+ },
110
+ "financial_information": {
111
+ "profession_business_name": "Name",
112
+ "date_hired": "01/10/2012",
113
+ "employer_business_address": "[email protected]",
114
+ "position_rank": "Assistant VP",
115
+ "nature_of_business_self_employment": "Sales",
116
+ "source_of_income_wealth": {
117
+ "monthly_income": 110000
118
+ }
119
+ },
120
+ "fatca_declaration": {
121
+ "i_am_not_a_us_person": true,
122
+ "i_am_a_us_person": false,
123
+ "us_person_details": {
124
+ "us_citizen": false,
125
+ "us_resident_green_card": false,
126
+ "us_tin": false,
127
+ "us_id": false,
128
+ "w9_submitted": false,
129
+ "us_place_of_birth_1": null,
130
+ "us_place_of_birth_2": null,
131
+ "required_documents_submitted": {
132
+ "w8_ben": null,
133
+ "certificate_of_loss_of_us_nationality": null,
134
+ "written_explanation_not_having_certificate_despite_renunciation": null,
135
+ "written_explanation_why_us_citizenship_not_obtained_at_birth": null
136
+ }
137
+ }
138
+ },
139
+ "certification_and_authorization": {
140
+ "customer_signature": null,
141
+ "date": "02/03/25"
142
+ },
143
+ "for_bank_use_only": {
144
+ "remarks": null,
145
+ "processed_and_signature_verified_by": "Simon Eulalia",
146
+ "approved_by": "Ray Hernandez"
147
+ },
148
+ "form_no": "BPI-CISS IND-02222022"
149
+ }''',
150
+
151
+ "CIF-bad.jpg": '''{
152
+ "document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
153
+ "bank_name": "BPI",
154
+ "personal_information": {
155
+ "rm_no": null,
156
+ "last_name": "Garnet",
157
+ "first_name": "Lawrence",
158
+ "middle_name": "Dela Cruz",
159
+ "suffix": "III",
160
+ "date_of_birth": "10/21/1962",
161
+ "place_of_birth": "Rizal, Philippines",
162
+ "citizenship": null,
163
+ "sex": "Male",
164
+ "marital_status": "Married",
165
+ "mother_s_full_maiden_name": "Rosa H. Dela Cruz",
166
+ "spouse_name": "Marion V. Garnet",
167
+ "tin_number": null,
168
+ "sss_number": null,
169
+ "spouse_birthdate": "8/10/1965",
170
+ "id_presented": {
171
+ "id_type": "Drivers",
172
+ "id_number": "2961781134"
173
+ },
174
+ "no_of_children": 2,
175
+ "highest_educational_attainment": "College Graduate"
176
+ },
177
+ "contact_information": {
178
+ "mobile_no": "+63 917 926 9175",
179
+ "landline_no": null,
180
+ "email_address": "[email protected]",
181
+ "home_address": "Amorsolo St. Brgy. Aguinaldo",
182
+ "country": "Philippines",
183
+ "zip_code": "1366",
184
+ "district_town": null,
185
+ "city_municipality_provice": "Rizal",
186
+ "residence_since_mm_dd_yyyy": null,
187
+ "home_ownership": "Owned"
188
+ },
189
+ "financial_information": {
190
+ "profession_business_name": "Name",
191
+ "date_hired": "01/10/2012",
192
+ "employer_business_address": "[email protected]",
193
+ "position_rank": "Assistant VP",
194
+ "nature_of_business_self_employment": "Sales",
195
+ "source_of_income_wealth": {
196
+ "monthly_income": 110000
197
+ }
198
+ },
199
+ "fatca_declaration": {
200
+ "i_am_not_a_us_person": true,
201
+ "i_am_a_us_person": false,
202
+ "us_person_details": {
203
+ "us_citizen": false,
204
+ "us_resident_green_card": false,
205
+ "us_tin": false,
206
+ "us_id": false,
207
+ "w9_submitted": false,
208
+ "us_place_of_birth_1": null,
209
+ "us_place_of_birth_2": null,
210
+ "required_documents_submitted": {
211
+ "w8_ben": null,
212
+ "certificate_of_loss_of_us_nationality": null,
213
+ "written_explanation_not_having_certificate_despite_renunciation": null,
214
+ "written_explanation_why_us_citizenship_not_obtained_at_birth": null
215
+ }
216
+ }
217
+ },
218
+ "certification_and_authorization": {
219
+ "customer_signature": null,
220
+ "date": "02/03/25"
221
+ },
222
+ "for_bank_use_only": {
223
+ "remarks": null,
224
+ "processed_and_signature_verified_by": "Simon Eulalia",
225
+ "approved_by": "Ray Hernandez"
226
+ },
227
+ "form_no": "BPI-CISS IND-02222022"
228
+ }''',
229
+
230
+ "DF-Good.jpg": '''{
231
+ "document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
232
+ "copy_type": "BANK'S_COPY",
233
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
234
+ "transaction_details": {
235
+ "date": "03/29/14",
236
+ "transaction_type": {
237
+ "deposit": true,
238
+ "payment": false,
239
+ "bills_purchase": false
240
+ },
241
+ "account_type": {
242
+ "savings": true,
243
+ "current": false
244
+ },
245
+ "currency": {
246
+ "peso": false,
247
+ "us_dollar": true,
248
+ "others": false
249
+ }
250
+ },
251
+ "account_details": {
252
+ "account_number": "05039947290",
253
+ "account_name_merchant_name": "Amaia Skies"
254
+ },
255
+ "deposit_payment_breakdown": {
256
+ "cash_amount": null,
257
+ "checks": [{
258
+ "amount": 1000000.0,
259
+ "bank": null,
260
+ "date": null,
261
+ "details": null
262
+ }],
263
+ "total_deposits_payment": null
264
+ },
265
+ "teller_validation_bank_copy": null,
266
+ "for_bills_purchase_accommodation": {
267
+ "representative_full_name": "Amie Skies",
268
+ "contact_number": "0917 872 0056",
269
+ "signature_over_printed_name": "present",
270
+ "form_no": "BPI-BPDEP MAN-01222020"
271
+ },
272
+ "client_s_copy_teller_validation": null
273
+ }''',
274
+
275
+ "DF-bad.jpeg": '''{
276
+ "document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
277
+ "copy_type": "BANK'S_COPY",
278
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
279
+ "transaction_details": {
280
+ "date": "03/29/14",
281
+ "transaction_type": {
282
+ "deposit": true,
283
+ "payment": false,
284
+ "bills_purchase": false
285
+ },
286
+ "account_type": {
287
+ "savings": true,
288
+ "current": false
289
+ },
290
+ "currency": {
291
+ "peso": false,
292
+ "us_dollar": true,
293
+ "others": false
294
+ }
295
+ },
296
+ "account_details": {
297
+ "account_number": "05039947290",
298
+ "account_name_merchant_name": "Amaia Skies"
299
+ },
300
+ "deposit_payment_breakdown": {
301
+ "cash_amount": null,
302
+ "checks": [{
303
+ "amount": 1000000.0,
304
+ "bank": null,
305
+ "date": null,
306
+ "details": null
307
+ }],
308
+ "total_deposits_payment": null
309
+ },
310
+ "teller_validation_bank_copy": null,
311
+ "for_bills_purchase_accommodation": {
312
+ "representative_full_name": "Amie Skies",
313
+ "contact_number": "0917 872 0056",
314
+ "signature_over_printed_name": "present",
315
+ "form_no": "BPI-BPDEP MAN-01222020"
316
+ },
317
+ "client_s_copy_teller_validation": null
318
+ }''',
319
+
320
+ "DB-Good.jpg": '''{
321
+ "document_type": "DEPOSIT / PAYMENT SLIP BACK",
322
+ "bank_name": "BANK OF THE PHILIPPINE ISLANDS",
323
+ "sections": {
324
+ "check_details_top": {
325
+ "checks": [{
326
+ "name_of_bank_branch": "Olanggapo",
327
+ "check_no": "0543729",
328
+ "amount": 100000.0
329
+ }],
330
+ "total_checks": null,
331
+ "total_cash": null,
332
+ "total_deposits_payment": null
333
+ },
334
+ "deposit_cash_breakdown": {
335
+ "items": [
336
+ {"no_of_pieces": 100, "denominations": 100, "amount": 1000},
337
+ {"no_of_pieces": 200, "denominations": 200, "amount": 200},
338
+ {"no_of_pieces": 300, "denominations": 300, "amount": 1500},
339
+ {"no_of_pieces": 500, "denominations": 400, "amount": 1250},
340
+ {"no_of_pieces": 600, "denominations": 600, "amount": 1750},
341
+ {"no_of_pieces": 700, "denominations": 700, "amount": 6350},
342
+ {"no_of_pieces": 800, "denominations": 800, "amount": 8750}
343
+ ],
344
+ "total": 10000750000
345
+ },
346
+ "representative_information": {
347
+ "full_name": "Anna Banana Cruz",
348
+ "contact_number": "09178123775",
349
+ "address": "11, Tower 2, City Residences, Manila",
350
+ "citizenship": "Japanese",
351
+ "date_of_birth": "03/31/2001",
352
+ "place_of_birth": "Bulacan",
353
+ "signature": null
354
+ },
355
+ "client_copy": {
356
+ "document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
357
+ "for_payments_only": {
358
+ "policy_plan_reference_no": null,
359
+ "policy_planholder_name": null,
360
+ "bp_customer_number": "03756245"
361
+ },
362
+ "check_details": {
363
+ "checks": [{
364
+ "bank_branch_name": "P. Tuazon",
365
+ "check_no": "0347345",
366
+ "amount": 100200200
367
+ }],
368
+ "total_checks": 800000,
369
+ "total_cash": 20000,
370
+ "total_deposits_payment": 820000
371
+ }
372
+ }
373
+ }
374
+ }''',
375
+
376
+ "DB-Bad.jpg": '''{
377
+ "document_type": "DEPOSIT / PAYMENT SLIP BACK",
378
+ "bank_name": "BANK OF THE PHILIPPINE ISLANDS",
379
+ "sections": {
380
+ "check_details_top": {
381
+ "checks": [{
382
+ "name_of_bank_branch": "Olanggapo",
383
+ "check_no": "0543729",
384
+ "amount": 100000.0
385
+ }],
386
+ "total_checks": null,
387
+ "total_cash": null,
388
+ "total_deposits_payment": null
389
+ },
390
+ "deposit_cash_breakdown": {
391
+ "items": [
392
+ {"no_of_pieces": 100, "denominations": 100, "amount": 1000},
393
+ {"no_of_pieces": 200, "denominations": 200, "amount": 200},
394
+ {"no_of_pieces": 300, "denominations": 300, "amount": 1500},
395
+ {"no_of_pieces": 500, "denominations": 400, "amount": 1250},
396
+ {"no_of_pieces": 600, "denominations": 600, "amount": 1750},
397
+ {"no_of_pieces": 700, "denominations": 700, "amount": 6350},
398
+ {"no_of_pieces": 800, "denominations": 800, "amount": 8750}
399
+ ],
400
+ "total": 10000750000
401
+ },
402
+ "representative_information": {
403
+ "full_name": "Anna Banana Cruz",
404
+ "contact_number": "09178123775",
405
+ "address": "11, Tower 2, City Residences, Manila",
406
+ "citizenship": "Japanese",
407
+ "date_of_birth": "03/31/2001",
408
+ "place_of_birth": "Bulacan",
409
+ "signature": null
410
+ },
411
+ "client_copy": {
412
+ "document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
413
+ "for_payments_only": {
414
+ "policy_plan_reference_no": null,
415
+ "policy_planholder_name": null,
416
+ "bp_customer_number": "03756245"
417
+ },
418
+ "check_details": {
419
+ "checks": [{
420
+ "bank_branch_name": "P. Tuazon",
421
+ "check_no": "0347345",
422
+ "amount": 100200200
423
+ }],
424
+ "total_checks": 800000,
425
+ "total_cash": 20000,
426
+ "total_deposits_payment": 820000
427
+ }
428
+ }
429
+ }
430
+ }''',
431
+
432
+ "WF-Good.jpg": '''{
433
+ "document_type": "WITHDRAWAL SLIP",
434
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
435
+ "withdrawal_slip_details": {
436
+ "currency_type": "US DOLLAR",
437
+ "account_type": "CURRENT",
438
+ "account_number": "3456777799",
439
+ "account_name": "Maxine Yu",
440
+ "teller_validation": null
441
+ },
442
+ "withdrawal_amount": {
443
+ "amount_in_numbers": "USD 50,000"
444
+ },
445
+ "depositor_information": {
446
+ "signature_of_depositor": "present",
447
+ "date": null
448
+ },
449
+ "withdrawal_through_representative": {
450
+ "name_in_print": "Mark Garcia",
451
+ "signature_of_representative": "present",
452
+ "contact_no": "0918 251 0226",
453
+ "depositor_authorization_signatures": [
454
+ {"signature": "present", "date": "05/19/25"},
455
+ {"signature": "present", "date": "05/19/25"}
456
+ ]
457
+ },
458
+ "payment_received_by": {
459
+ "signature": "present",
460
+ "name": "Marco Polo"
461
+ },
462
+ "bank_use_only": {
463
+ "remarks": null,
464
+ "verified_by": null,
465
+ "approved_by": null
466
+ },
467
+ "form_no": "BPI-WDL OTC-01222020"
468
+ }''',
469
+
470
+ "WF-Bad.jpg": '''{
471
+ "document_type": "WITHDRAWAL SLIP",
472
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
473
+ "withdrawal_slip_details": {
474
+ "currency_type": "US DOLLAR",
475
+ "account_type": "CURRENT",
476
+ "account_number": "3456777799",
477
+ "account_name": "Maxine Yu",
478
+ "teller_validation": null
479
+ },
480
+ "withdrawal_amount": {
481
+ "amount_in_numbers": "USD 50,000"
482
+ },
483
+ "depositor_information": {
484
+ "signature_of_depositor": "present",
485
+ "date": null
486
+ },
487
+ "withdrawal_through_representative": {
488
+ "name_in_print": "Mark Garcia",
489
+ "signature_of_representative": "present",
490
+ "contact_no": "0918 251 0226",
491
+ "depositor_authorization_signatures": [
492
+ {"signature": "present", "date": "05/19/25"},
493
+ {"signature": "present", "date": "05/19/25"}
494
+ ]
495
+ },
496
+ "payment_received_by": {
497
+ "signature": "present",
498
+ "name": "Marco Polo"
499
+ },
500
+ "bank_use_only": {
501
+ "remarks": null,
502
+ "verified_by": null,
503
+ "approved_by": null
504
+ },
505
+ "form_no": "BPI-WDL OTC-01222020"
506
+ }''',
507
+
508
+ "WB-Good.jpg": '''{
509
+ "document_type": "WITHDRAWAL SLIP BACK",
510
+ "denominations_breakdown": {
511
+ "items": [
512
+ {"no_of_pieces": 1, "denomination": 100, "amount": 100},
513
+ {"no_of_pieces": 2, "denomination": 500, "amount": 1000},
514
+ {"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
515
+ ],
516
+ "total": null
517
+ },
518
+ "representative_information": {
519
+ "full_name": "Mark Garcia",
520
+ "contact_number": "0918 251 3372",
521
+ "address": "1F Tower 1, SMDC, Camarines, Sur",
522
+ "citizenship": "American",
523
+ "date_of_birth": "12/15/2001",
524
+ "place_of_birth": "Bicol",
525
+ "signature": "present"
526
+ }
527
+ }''',
528
+
529
+ "WB-bad.jpeg": '''{
530
+ "document_type": "WITHDRAWAL SLIP BACK",
531
+ "denominations_breakdown": {
532
+ "items": [
533
+ {"no_of_pieces": 1, "denomination": 100, "amount": 100},
534
+ {"no_of_pieces": 2, "denomination": 500, "amount": 1000},
535
+ {"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
536
+ ],
537
+ "total": null
538
+ },
539
+ "representative_information": {
540
+ "full_name": "Mark Garcia",
541
+ "contact_number": "0918 251 3372",
542
+ "address": "1F Tower 1, SMDC, Camarines, Sur",
543
+ "citizenship": "American",
544
+ "date_of_birth": "12/15/2001",
545
+ "place_of_birth": "Bicol",
546
+ "signature": "present"
547
+ }
548
+ }'''
549
+ }
550
+
551
+ # ─────────────────────────────────────────────────────────────
552
+ # 3) Evaluation + helper functions
553
+ # ─────────────────────────────────────────────────────────────
554
+
555
+ def compute_cer(gt, pred):
556
+ """Compute Character Error Rate."""
557
+ m, n = len(gt), len(pred)
558
+ dp = [[0]*(n+1) for _ in range(m+1)]
559
+ for i in range(m+1):
560
+ dp[i][0] = i
561
+ for j in range(n+1):
562
+ dp[0][j] = j
563
+ for i in range(1,m+1):
564
+ for j in range(1,n+1):
565
+ cost = 0 if gt[i-1]==pred[j-1] else 1
566
+ dp[i][j] = min(dp[i-1][j]+1, dp[i][j-1]+1, dp[i-1][j-1]+cost)
567
+ return dp[m][n]/max(m,1)
568
+
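+ # Worked example (illustrative): compute_cer("abc", "abd") needs one substitution over
+ # three ground-truth characters, so it returns 1/3 ≈ 0.333; identical strings return 0.0.
+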
569
+ def extract_flat(o, parent=""):
570
+ """Extract flat key-value pairs from nested JSON."""
571
+ out = []
572
+ if isinstance(o, dict):
573
+ for k,v in o.items():
574
+ key = f"{parent}.{k}" if parent else k
575
+ out += extract_flat(v, key)
576
+ elif isinstance(o, list):
577
+ for i,v in enumerate(o):
578
+ out += extract_flat(v, f"{parent}[{i}]")
579
+ else:
580
+ out.append((parent, str(o)))
581
+ return out
582
+
583
+ def compute_field_accuracy(gt_json, pred_json):
584
+ """Compute strict field accuracy."""
585
+ try:
586
+ gt = dict(extract_flat(json.loads(gt_json)))
587
+ pr = dict(extract_flat(json.loads(pred_json)))
588
+ except:
589
+ return 0.0
590
+ total = len(gt)
591
+ correct = sum(1 for k,v in gt.items() if pr.get(k)==v)
592
+ return correct / total if total else 0.0
593
+
594
+ def field_matches(gt, pred, max_err_pct=0.1):
595
+ """Check if fields match with fuzzy matching."""
596
+ gt = re.sub(r'[^\w\s]', '', str(gt).lower().strip())
597
+ pred = re.sub(r'[^\w\s]', '', str(pred).lower().strip())
598
+ if not gt and not pred:
599
+ return True
600
+ return (1 - SequenceMatcher(None, gt, pred).ratio()) <= max_err_pct
601
+
602
+ def compute_fuzzy_field_accuracy(gt_json, pred_json):
603
+ """Compute fuzzy field accuracy."""
604
+ try:
605
+ gt = dict(extract_flat(json.loads(gt_json)))
606
+ pr = dict(extract_flat(json.loads(pred_json)))
607
+ except:
608
+ return 0.0
609
+ total = len(gt)
610
+ correct = sum(1 for k,v in gt.items() if field_matches(v, pr.get(k, "")))
611
+ return correct / total if total else 0.0
612
+
613
+ def canonicalize(js):
614
+ """Canonicalize JSON string."""
615
+ return json.dumps(json.loads(js), sort_keys=True, separators=(',', ':'))
616
+
617
+ def clean_json_string(js):
618
+ """Clean JSON string by removing markdown formatting."""
619
+ return re.sub(r'```(?:json)?\s*|\s*```', '', js.strip(), flags=re.DOTALL)
620
+
621
+ def extract_values_from_jsonlike(text):
622
+ """Extract all string values from JSON-like text."""
623
+ text = re.sub(r'[{}[\]",:]', ' ', text)
624
+ text = re.sub(r'\s+', ' ', text).strip()
625
+ return text
626
+
627
+ def compute_spelling_error_rate(text):
628
+ """Compute spelling error rate using NLTK words corpus."""
629
+ words = text.lower().split()
630
+ if not words:
631
+ return 0.0
632
+
633
+ english_words = set(nltk_words.words())
634
+ misspelled = sum(1 for word in words if word.isalpha() and word not in english_words)
635
+ return misspelled / len(words)
636
+
637
+ def compute_perplexity(text):
638
+ """Compute perplexity using GPT-2 model."""
639
+ try:
640
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
641
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
642
+
643
+ tokenizer.pad_token = tokenizer.eos_token
644
+ inputs = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
645
+
646
+ with torch.no_grad():
647
+ outputs = model(inputs, labels=inputs)
648
+ loss = outputs.loss
649
+
650
+ return math.exp(loss.item())
651
+ except Exception as e:
652
+ print(f"Error computing perplexity: {e}")
653
+ return float('inf')
654
+
655
+ def compute_refined_metrics(text):
656
+ """Compute refined spelling error rate with additional checks."""
657
+ words = text.lower().split()
658
+ if not words:
659
+ return 0.0
660
+
661
+ english_words = set(nltk_words.words())
662
+
663
+ errors = 0
664
+ for word in words:
665
+ if not word.isalpha():
666
+ continue
667
+
668
+ if word not in english_words:
669
+ corrected = word.replace('0', 'o').replace('1', 'l').replace('5', 's')
670
+ if corrected not in english_words:
671
+ errors += 1
672
+
673
+ return errors / len(words)
674
+
675
+ # ─────────────────────────────────────────────────────────────
676
+ # 4) Main processing function
677
+ # ─────────────────────────────────────────────────────────────
678
+
679
+ def process_document_image(image_bytes, filename=None):
680
+ """Process a document image and return extracted information and metrics."""
681
+ try:
682
+ # 1) load
683
+ image = Image.open(BytesIO(image_bytes))
684
+ img_format = image.format or "PNG"
685
+
686
+ # 2) compress & resize loop → ensure <4 MB
687
+ buf = BytesIO()
688
+ image.save(buf, format=img_format, optimize=True, quality=85)
689
+ while buf.getbuffer().nbytes > 4_000_000:
690
+ w, h = image.size
691
+ image = image.resize((int(w * 0.8), int(h * 0.8)), Image.LANCZOS) # type: ignore
692
+ buf = BytesIO()
693
+ image.save(buf, format=img_format, optimize=True, quality=85)
694
+
695
+ img_bytes = buf.getvalue() # this is your final image payload
696
+
697
+ ocr_prompt = "Extract all visible printed and handwritten text from this scanned bank document image."
698
+
699
+ image_part = {
700
+ "inlineData": {
701
+ "mimeType": "image/png",
702
+ "data": img_bytes
703
+ }
704
+ }
705
+
706
+ response = client.models.generate_content(
707
+ model=MODEL_ID,
708
+ contents=[
709
+ Part.from_bytes(data=img_bytes, mime_type="image/png"),
710
+ ocr_prompt
711
+ ])
712
+
713
+ raw_text = response.text.strip() # type: ignore
714
+ print("--- Raw OCR Text ---\n", raw_text[:1000], "\n")
715
+
716
+ # Extract JSON with Gemini from OCR
717
+ schema_prompt = (
718
+ "You are a JSON extractor for bank forms. Given the OCR text from a scanned image, "
719
+ "output ONLY valid JSON matching the correct schema, using null for blanks.\n\n"
720
+ "--- CIF Example:\n" + GROUND_TRUTHS["CIF-Good.png"] + "\n\n"
721
+ "--- DF Example:\n" + GROUND_TRUTHS["DF-Good.jpg"] + "\n\n"
722
+ "--- DB Example:\n" + GROUND_TRUTHS["DB-Good.jpg"] + "\n\n"
723
+ "--- WF Example:\n" + GROUND_TRUTHS["WF-Good.jpg"] + "\n\n"
724
+ "--- WB Example:\n" + GROUND_TRUTHS["WB-Good.jpg"] + "\n\n"
725
+ "Now extract JSON from this OCR text:\n" + raw_text
726
+ )
727
+
728
+ final = client.models.generate_content(
729
+ model=MODEL_ID,
730
+ contents=[schema_prompt]
731
+ )
732
+
733
+ pred_json = clean_json_string(final.text)
734
+
735
+ print("--- Extracted JSON ---\n", pred_json)
736
+
737
+ # Parse the extracted JSON
738
+ try:
739
+ extracted_data = json.loads(pred_json)
740
+ except json.JSONDecodeError:
741
+ extracted_data = {
742
+ "document_type": "unknown",
743
+ "raw_text": pred_json
744
+ }
745
+
746
+ # Compute basic metrics
747
+ clean_text = extract_values_from_jsonlike(pred_json)
748
+ ser = compute_spelling_error_rate(clean_text)
749
+
750
+ try:
751
+ ppl = compute_perplexity(clean_text)
752
+ except:
753
+ ppl = float("inf")
754
+
755
+ refined_ser = compute_refined_metrics(clean_text)
756
+
757
+ # Evaluate against ground truth if available
758
+ cer_score = 0.0
759
+ strict_accuracy = 0.0
760
+ fuzzy_accuracy = 0.0
761
+
762
+ if filename and filename in GROUND_TRUTHS:
763
+ gt_json = clean_json_string(GROUND_TRUTHS[filename])
764
+ try:
765
+ gt_can = canonicalize(gt_json)
766
+ pred_can = canonicalize(pred_json)
767
+ cer_score = compute_cer(gt_can, pred_can)
768
+ strict_accuracy = compute_field_accuracy(gt_json, pred_json)
769
+ fuzzy_accuracy = compute_fuzzy_field_accuracy(gt_json, pred_json)
770
+ except Exception as e:
771
+ print(f"Error in evaluation: {e}")
772
+ else:
773
+ print("⚠️ No ground truth available for this file.")
774
+
775
+ # Prepare metrics with proper handling of infinite values
776
+ metrics = {
777
+ "ser": ser,
778
+ "ppl": 999999.0 if ppl == float("inf") else ppl, # Replace inf with large finite value
779
+ "refined_ser": refined_ser,
780
+ "cer": cer_score,
781
+ "strict_field_accuracy": strict_accuracy,
782
+ "fuzzy_field_accuracy": fuzzy_accuracy
783
+ }
784
+
785
+ return {
786
+ "document_type": extracted_data.get("document_type", "unknown"),
787
+ "extracted": extracted_data,
788
+ "metrics": metrics,
789
+ "raw_text": raw_text,
790
+ "extracted_json": pred_json
791
+ }
792
+
793
+ except Exception as e:
794
+ print(f"Error processing document: {e}")
795
+ return {
796
+ "error": str(e),
797
+ "document_type": "unknown",
798
+ "extracted": {},
799
+ "metrics": {
800
+ "ser": 0.0,
801
+ "ppl": 999999.0, # Replace inf with large finite value
802
+ "refined_ser": 0.0,
803
+ "cer": 0.0,
804
+ "strict_field_accuracy": 0.0,
805
+ "fuzzy_field_accuracy": 0.0
806
+ }
807
+ }
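+
+
+ # Example usage (a minimal sketch; "CIF-Good.png" matches a GROUND_TRUTHS key, so the
+ # ground-truth metrics are only meaningful for those sample files):
+ #
+ #     with open("CIF-Good.png", "rb") as f:
+ #         result = process_document_image(f.read(), filename="CIF-Good.png")
+ #     print(result["document_type"], result["metrics"]["fuzzy_field_accuracy"])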
services/text_processor.py ADDED
@@ -0,0 +1 @@
1
+ # PLACEHOLDER