riu-rd committed
Commit fc66c87 · verified · 1 Parent(s): 8c39717

Upload 16 files

.env.example ADDED
@@ -0,0 +1 @@
1
+ GEMINI_API_KEY= # YOUR API KEY HERE
.gitignore ADDED
@@ -0,0 +1,60 @@
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.development
5
+ .env.test
6
+ .env.production
7
+
8
+ # Python
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ *.so
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # Virtual environment
32
+ venv/
33
+ env/
34
+ ENV/
35
+ env.bak/
36
+ venv.bak/
37
+
38
+ # IDE
39
+ .vscode/
40
+ .idea/
41
+ *.swp
42
+ *.swo
43
+ *~
44
+
45
+ # OS
46
+ .DS_Store
47
+ .DS_Store?
48
+ ._*
49
+ .Spotlight-V100
50
+ .Trashes
51
+ ehthumbs.db
52
+ Thumbs.db
53
+
54
+ # Logs
55
+ *.log
56
+ logs/
57
+
58
+ # Temporary files
59
+ *.tmp
60
+ *.temp
Dockerfile ADDED
@@ -0,0 +1,29 @@
1
+ FROM python:3.11.8
2
+
3
+ WORKDIR /
4
+
5
+ # Copy requirements.txt to the container
6
+ COPY requirements.txt ./
7
+
8
+ # Install Python dependencies
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Add a non-root user to run the application
12
+ RUN useradd -m -u 1000 user
13
+
14
+ # Set the user and home directory environment variables
15
+ USER user
16
+ ENV HOME=/home/user \
17
+ PATH=/home/user/.local/bin:$PATH
18
+
19
+ # Create the application directory
20
+ WORKDIR $HOME/app
21
+
22
+ # Copy the application code and model files
23
+ COPY --chown=user . $HOME/app/
24
+
25
+ # Expose the port the FastAPI app runs on
26
+ EXPOSE 7860
27
+
28
+ # Command to run the FastAPI app
29
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,70 @@
1
- ---
2
- title: Contact Center Operations
3
- emoji:
4
- colorFrom: red
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: Contact Center Operation Insights
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Document Processor Backend
2
+
3
+ This is the FastAPI backend for the Contact Center Operation Insights ("Document Processor") application. It uses Google's Gemini models and OpenAI Whisper to transcribe, translate, and diarize call audio, and to extract structured information from scanned bank documents.
4
+
5
+ ## Setup Instructions
6
+
7
+ ### 1. Environment Variables
8
+
9
+ To use this application, you need to set up your Gemini API key:
10
+
11
+ 1. **Get your Gemini API key:**
12
+ - Go to [Google AI Studio](https://makersuite.google.com/app/apikey)
13
+ - Create a new API key
14
+ - Copy the API key
15
+
16
+ 2. **Create a `.env` file:**
17
+ ```bash
18
+ # In the project root, create a .env file from the provided example
19
+ cp .env.example .env
20
+ ```
21
+
22
+ 3. **Edit the `.env` file:**
23
+ ```bash
24
+ # Replace 'your_actual_api_key_here' with your real API key
25
+ GEMINI_API_KEY=your_actual_api_key_here
26
+ ```
27
+
28
+ ### 2. Install Dependencies
29
+
30
+ ```bash
31
+ # Create virtual environment
32
+ python3 -m venv venv
33
+
34
+ # Activate virtual environment
35
+ source venv/bin/activate # On Windows: venv\Scripts\activate
36
+
37
+ # Install dependencies
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### 3. Run the Server
42
+
43
+ ```bash
44
+ # Make sure virtual environment is activated
45
+ source venv/bin/activate
46
+
47
+ # Start the server
48
+ uvicorn api:app --reload --host 0.0.0.0 --port 8000
49
+ ```
50
+
51
+ The server will be available at `http://localhost:8000`
52
+
53
+ ## API Endpoints
54
+
55
+ - `GET /` - Redirects to the interactive API docs (`/docs`)
+ - `GET /health` - Health check
+ - `POST /audio/whisper` - Transcribe and translate audio with Whisper
+ - `POST /audio/gemini` - Transcribe and translate audio with Gemini 2.5 Pro
+ - `POST /audio/diarization` - Speaker diarization with Gemini 2.5 Pro
+ - `POST /image/process-document` - Extract structured data from a document image or PDF (Gemini 1.5 Flash)
+ - `POST /text` - Placeholder text-insights endpoint
58
+
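+ A minimal sketch of calling the API from Python with the `requests` library (already listed in `requirements.txt`). The file names `call.wav` and `deposit_slip.png` are placeholders for your own test files:
+
+ ```python
+ import requests
+
+ BASE_URL = "http://localhost:8000"
+
+ # Transcribe and translate an audio file with Whisper
+ with open("call.wav", "rb") as f:
+     r = requests.post(f"{BASE_URL}/audio/whisper",
+                       files={"audio": ("call.wav", f, "audio/wav")})
+ print(r.json())  # {"transcription": "...", "translation": "..."}
+
+ # Extract structured data from a scanned bank document (image or PDF)
+ with open("deposit_slip.png", "rb") as f:
+     r = requests.post(f"{BASE_URL}/image/process-document",
+                       files={"document": ("deposit_slip.png", f, "image/png")})
+ print(r.json()["document_type"])
+ ```
+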
59
+ ## Security Notes
60
+
61
+ - The `.env` file is automatically ignored by git to prevent accidentally committing your API key
62
+ - Never commit your actual API key to version control
63
+ - Keep your API key secure and don't share it publicly
64
+
65
+ ## Troubleshooting
66
+
67
+ If you get an error about `GEMINI_API_KEY not set in environment`, make sure:
68
+ 1. You've created the `.env` file in the project root
69
+ 2. You've added your actual API key to the file
70
+ 3. The virtual environment is activated when running the server (see the quick check below)
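+
+ As a quick check (a minimal sketch, run from the project root), you can verify that the key is picked up the same way the services load it:
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()  # reads .env from the current working directory
+ print("GEMINI_API_KEY set:", bool(os.getenv("GEMINI_API_KEY")))
+ ```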
api.py ADDED
@@ -0,0 +1,244 @@
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import RedirectResponse
4
+ import uvicorn
5
+ from typing import Any, Dict
6
+ from pydantic import BaseModel
7
+
8
+ from services.audio_whisper import process_audio_with_whisper
9
+ from services.audio_gemini import process_audio_with_gemini
10
+ from services.audio_diarization import process_audio_diarization, AudioDiarizationError
11
+ from services.image_ocr_processor import process_pdf_to_image, process_document_image
12
+
13
+ class TextRequest(BaseModel):
14
+ text: str
15
+
16
+
17
+ class HelloWorldResponse(BaseModel):
18
+ message: str
19
+ received_text: str
20
+ status: str
21
+
22
+ app = FastAPI(
23
+ title="Contact Center Operation Insights",
24
+ version="1.0.0"
25
+ )
26
+
27
+ # Configure CORS
28
+ app.add_middleware(
29
+ CORSMiddleware,
30
+ allow_origins=["*"],
31
+ allow_credentials=False,
32
+ allow_methods=["*"],
33
+ allow_headers=["*"],
34
+ )
35
+
36
+ @app.get("/")
37
+ async def docs():
38
+ return RedirectResponse(url="/docs")
39
+
40
+ @app.post("/audio/whisper", response_model=Dict[str, str])
41
+ async def audio_whisper(audio: UploadFile = File(...)):
42
+ """
43
+ Transcribes and translates an audio file using OpenAI's Whisper model.
44
+ """
45
+ # Basic validation for audio content types. Whisper is robust, but this
46
+ # prevents obviously incorrect file types from being processed.
47
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
48
+ raise HTTPException(
49
+ status_code=400,
50
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
51
+ )
52
+
53
+ try:
54
+ # Read the content of the uploaded audio file into memory
55
+ audio_bytes = await audio.read()
56
+
57
+ # Call the dedicated service to process the audio
58
+ result = process_audio_with_whisper(audio_bytes)
59
+
60
+ return result
61
+
62
+ except Exception as e:
63
+ # Catch exceptions from the audio processing service or file reading
64
+ raise HTTPException(status_code=500, detail=f"Audio processing failed: {str(e)}")
65
+
66
+ @app.post("/audio/gemini", response_model=Dict[str, str])
67
+ async def audio_gemini(audio: UploadFile = File(...)):
68
+ """
69
+ Receives an audio file, transcribes it, and translates the transcription
70
+ to English using the Google Gemini 2.5 Pro model.
71
+ """
72
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
73
+ raise HTTPException(
74
+ status_code=400,
75
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
76
+ )
77
+
78
+ try:
79
+ audio_bytes = await audio.read()
80
+
81
+ result = process_audio_with_gemini(audio_bytes=audio_bytes)
82
+
83
+ return result
84
+
85
+ except Exception as e:
86
+ # Catches exceptions from file reading or the Gemini service
87
+ raise HTTPException(status_code=500, detail=f"Audio processing with Gemini failed: {str(e)}")
88
+
89
+ @app.post("/audio/diarization")
90
+ async def audio_diarization(audio: UploadFile = File(...)) -> Dict[str, Any]:
91
+ """
92
+ Process audio file for speaker diarization using Google Gemini 2.5 Pro.
93
+
94
+ This endpoint accepts audio files and returns speaker diarization results,
95
+ identifying different speakers and their spoken text segments throughout
96
+ the conversation.
97
+ """
98
+ # Validate file type - accept common audio formats
99
+ if not audio.content_type or not audio.content_type.startswith('audio/'):
100
+ raise HTTPException(
101
+ status_code=400,
102
+ detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file (WAV, MP3, MP4, M4A)."
103
+ )
104
+
105
+ # Additional validation for specific audio formats that work well with diarization
106
+ supported_types = [
107
+ 'audio/wav', 'audio/wave', 'audio/x-wav',
108
+ 'audio/mpeg', 'audio/mp3',
109
+ 'audio/mp4', 'audio/m4a', 'audio/x-m4a'
110
+ ]
111
+
112
+ if audio.content_type not in supported_types:
113
+ # Still allow processing but warn about potential issues
114
+ pass # Gemini is quite robust with audio formats
115
+
116
+ try:
117
+ # Read the uploaded audio file content
118
+ audio_bytes = await audio.read()
119
+
120
+ # Validate file size (optional - adjust based on your needs)
121
+ max_size_mb = 100 # 100MB limit
122
+ if len(audio_bytes) > max_size_mb * 1024 * 1024:
123
+ raise HTTPException(
124
+ status_code=400,
125
+ detail=f"File too large. Maximum size allowed is {max_size_mb}MB."
126
+ )
127
+
128
+ # Validate minimum file size to ensure it's not empty
129
+ if len(audio_bytes) < 1000: # Less than 1KB
130
+ raise HTTPException(
131
+ status_code=400,
132
+ detail="File appears to be empty or too small to process."
133
+ )
134
+
135
+ # Process the audio file for speaker diarization
136
+ result = process_audio_diarization(
137
+ audio_bytes=audio_bytes,
138
+ filename=audio.filename # type: ignore
139
+ )
140
+
141
+ return result
142
+
143
+ except AudioDiarizationError as e:
144
+ # Handle specific diarization errors with appropriate HTTP status
145
+ if "API key" in str(e).lower():
146
+ raise HTTPException(
147
+ status_code=500,
148
+ detail="Audio diarization service configuration error. Please contact support."
149
+ )
150
+ elif "format" in str(e).lower():
151
+ raise HTTPException(
152
+ status_code=400,
153
+ detail=f"Audio format error: {str(e)}"
154
+ )
155
+ else:
156
+ raise HTTPException(
157
+ status_code=500,
158
+ detail=f"Audio diarization failed: {str(e)}"
159
+ )
160
+
161
+ except HTTPException:
162
+ # Re-raise HTTP exceptions as-is
163
+ raise
164
+
165
+ except Exception as e:
166
+ # Catch any unexpected errors
167
+ raise HTTPException(
168
+ status_code=500,
169
+ detail=f"Unexpected error during audio diarization: {str(e)}"
170
+ )
171
+
172
+ @app.post("/image/process-document")
173
+ async def process_document(document: UploadFile = File(...)):
174
+ """
175
+ Process uploaded document (image or PDF) and extract information [Model: Gemini 1.5 Flash]
176
+ """
177
+ try:
178
+ # Read file content
179
+ file_bytes = await document.read()
180
+
181
+ # Handle different file types
182
+ if document.content_type.startswith('image/'): # type: ignore
183
+ # Process image directly
184
+ image_bytes = file_bytes
185
+ elif document.content_type == 'application/pdf':
186
+ # Convert PDF to image first
187
+ image_bytes = process_pdf_to_image(file_bytes)
188
+ else:
189
+ raise HTTPException(
190
+ status_code=400,
191
+ detail="Unsupported file type. Please upload an image (JPG, PNG, etc.) or PDF file."
192
+ )
193
+
194
+ # Process the document
195
+ result = process_document_image(image_bytes, document.filename)
196
+
197
+ return result
198
+
199
+ except Exception as e:
200
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
201
+
202
+ @app.post("/text", response_model=HelloWorldResponse)
203
+ async def text_insights(request: TextRequest) -> HelloWorldResponse:
204
+ """
205
+ Simple text to insights endpoint
206
+ """
207
+ try:
208
+ # Basic validation
209
+ if not request.text.strip():
210
+ raise HTTPException(
211
+ status_code=400,
212
+ detail="Text cannot be empty or contain only whitespace."
213
+ )
214
+
215
+ response = HelloWorldResponse(
216
+ message="Hello World! Text processing completed successfully.",
217
+ received_text=request.text,
218
+ status="success"
219
+ )
220
+
221
+ return response
222
+
223
+ except HTTPException:
224
+ raise
225
+
226
+ except Exception as e:
227
+ raise HTTPException(
228
+ status_code=500,
229
+ detail=f"Unexpected error processing text: {str(e)}"
230
+ )
231
+
232
+ @app.get("/health")
233
+ async def health_check():
234
+ """Health check endpoint"""
235
+ return {"status": "healthy", "service": "document-processor"}
236
+
237
+ if __name__ == "__main__":
238
+ uvicorn.run(
239
+ "api:app",
240
+ host="0.0.0.0",
241
+ port=8000,
242
+ reload=True,
243
+ reload_dirs=["."]
244
+ )
requirements.txt ADDED
@@ -0,0 +1,26 @@
1
+ # Python version: 3.11
2
+ fastapi
3
+ uvicorn
4
+
5
+ python-multipart
6
+ Pillow
7
+ torch
8
+ transformers
9
+ nltk
10
+ python-dotenv
11
+ PyPDF2
12
+ pdf2image
13
+ openai
14
+
15
+ google
16
+ google-genai
17
+ google-api-core
18
+ pprintpp
19
+ pydub
20
+ ffmpeg-python
21
+ requests
22
+ google-cloud-aiplatform
23
+ librosa
24
+ soundfile
25
+ openai-whisper
26
+ pydantic
services/__pycache__/audio_diarization.cpython-311.pyc ADDED
Binary file (12.6 kB).
 
services/__pycache__/audio_gemini.cpython-311.pyc ADDED
Binary file (3.55 kB).
 
services/__pycache__/audio_whisper.cpython-311.pyc ADDED
Binary file (3.92 kB).
 
services/__pycache__/image_ocr_processor.cpython-311.pyc ADDED
Binary file (25.6 kB).
 
services/__pycache__/ocr_processor.cpython-311.pyc ADDED
Binary file (25.6 kB).
 
services/audio_diarization.py ADDED
@@ -0,0 +1,323 @@
1
+ import json
2
+ import base64
3
+ import os
4
+ from typing import Dict, List, Any, Tuple
5
+ from pydub import AudioSegment
6
+ import io
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+
12
+ class AudioDiarizationError(Exception):
13
+ """Custom exception for audio diarization errors"""
14
+ pass
15
+
16
+
17
+ def get_gemini_client() -> genai.Client:
18
+ """
19
+ Initialize and return a Google Gemini API client.
20
+
21
+ Returns:
22
+ genai.Client: Authenticated Gemini client
23
+
24
+ Raises:
25
+ AudioDiarizationError: If API key is not found or client initialization fails
26
+ """
27
+ api_key = os.getenv("GEMINI_API_KEY")
28
+ if not api_key:
29
+ raise AudioDiarizationError("GEMINI_API_KEY environment variable not found")
30
+
31
+ try:
32
+ client = genai.Client(api_key=api_key)
33
+ return client
34
+ except Exception as e:
35
+ raise AudioDiarizationError(f"Failed to initialize Gemini client: {str(e)}")
36
+
37
+
38
+ def get_audio_duration(audio_bytes: bytes) -> float:
39
+ """
40
+ Get the duration of audio in seconds.
41
+
42
+ Args:
43
+ audio_bytes: Raw audio file bytes
44
+
45
+ Returns:
46
+ float: Duration in seconds
47
+
48
+ Raises:
49
+ AudioDiarizationError: If audio processing fails
50
+ """
51
+ try:
52
+ # Create AudioSegment from bytes
53
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
54
+ duration_sec = len(audio) / 1000.0
55
+ return duration_sec
56
+ except Exception as e:
57
+ raise AudioDiarizationError(f"Failed to process audio duration: {str(e)}")
58
+
59
+
60
+ def detect_audio_format(audio_bytes: bytes) -> str:
61
+ """
62
+ Detect audio format from bytes.
63
+
64
+ Args:
65
+ audio_bytes: Raw audio file bytes
66
+
67
+ Returns:
68
+ str: Audio format (e.g., 'wav', 'mp3', 'mp4')
69
+
70
+ Raises:
71
+ AudioDiarizationError: If format detection fails
72
+ """
73
+ try:
74
+ # Try to create AudioSegment to detect format
75
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
76
+
77
+ # Check file signature/magic bytes for common formats
78
+ if audio_bytes.startswith(b'RIFF') and b'WAVE' in audio_bytes[:12]:
79
+ return 'wav'
80
+ elif audio_bytes.startswith(b'ID3') or audio_bytes.startswith(b'\xff\xfb'):
81
+ return 'mp3'
82
+ elif audio_bytes.startswith(b'\x00\x00\x00\x20ftypM4A'):
83
+ return 'm4a'
84
+ elif audio_bytes.startswith(b'\x00\x00\x00\x18ftyp') or audio_bytes.startswith(b'\x00\x00\x00\x20ftyp'):
85
+ return 'mp4'
86
+ else:
87
+ # Default to wav if we can't detect
88
+ return 'wav'
89
+ except Exception as e:
90
+ raise AudioDiarizationError(f"Failed to detect audio format: {str(e)}")
91
+
92
+
93
+ def create_diarization_request(audio_bytes: bytes, audio_format: str, model: str = "gemini-2.5-pro") -> Dict[str, Any]:
94
+ """
95
+ Create a diarization request for the Gemini API.
96
+
97
+ Args:
98
+ audio_bytes: Raw audio file bytes
99
+ audio_format: Audio file format (e.g., 'wav', 'mp3')
100
+ model: Gemini model to use
101
+
102
+ Returns:
103
+ Dict containing the API request configuration
104
+
105
+ Raises:
106
+ AudioDiarizationError: If request creation fails
107
+ """
108
+ try:
109
+ # Encode audio to base64
110
+ audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
111
+
112
+ # Prepare request parts
113
+ audio_part = {
114
+ "inlineData": {
115
+ "mimeType": f"audio/{audio_format}",
116
+ "data": audio_b64
117
+ }
118
+ }
119
+
120
+ text_part = {
121
+ "text": (
122
+ "You are a speaker-diarization engine. "
123
+ "For the audio input, return a JSON object with a top-level `segments` array. "
124
+ "Each segment must have: `speaker` (string) and `text` (transcript)."
125
+ )
126
+ }
127
+
128
+ # Define JSON schema for structured response
129
+ schema = {
130
+ "type": "object",
131
+ "properties": {
132
+ "segments": {
133
+ "type": "array",
134
+ "items": {
135
+ "type": "object",
136
+ "properties": {
137
+ "speaker": {"type": "string"},
138
+ "text": {"type": "string"}
139
+ },
140
+ "required": ["speaker", "text"]
141
+ }
142
+ }
143
+ },
144
+ "required": ["segments"]
145
+ }
146
+
147
+ # Build configuration for JSON mode
148
+ config = types.GenerateContentConfig(
149
+ response_mime_type="application/json",
150
+ response_schema=schema
151
+ )
152
+
153
+ # Build complete request
154
+ request_kwargs = {
155
+ "model": model,
156
+ "contents": [audio_part, text_part],
157
+ "config": config
158
+ }
159
+
160
+ return request_kwargs
161
+
162
+ except Exception as e:
163
+ raise AudioDiarizationError(f"Failed to create diarization request: {str(e)}")
164
+
165
+
166
+ def parse_diarization_response(response_text: str) -> Tuple[List[Dict[str, str]], Dict[str, Any]]:
167
+ """
168
+ Parse the Gemini API response for diarization results.
169
+
170
+ Args:
171
+ response_text: Raw JSON response text from Gemini
172
+
173
+ Returns:
174
+ Tuple of (segments_list, raw_json_dict)
175
+
176
+ Raises:
177
+ AudioDiarizationError: If JSON parsing fails
178
+ """
179
+ try:
180
+ raw_json = json.loads(response_text)
181
+ segments = raw_json.get("segments", [])
182
+
183
+ # Validate segments structure
184
+ if not isinstance(segments, list):
185
+ raise AudioDiarizationError("Response segments must be a list")
186
+
187
+ for i, segment in enumerate(segments):
188
+ if not isinstance(segment, dict):
189
+ raise AudioDiarizationError(f"Segment {i} must be a dictionary")
190
+ if "speaker" not in segment or "text" not in segment:
191
+ raise AudioDiarizationError(f"Segment {i} missing required fields 'speaker' or 'text'")
192
+
193
+ return segments, raw_json
194
+
195
+ except json.JSONDecodeError as e:
196
+ raise AudioDiarizationError(f"Failed to parse JSON from Gemini response: {str(e)}")
197
+ except Exception as e:
198
+ raise AudioDiarizationError(f"Failed to process diarization response: {str(e)}")
199
+
200
+
201
+ def calculate_diarization_stats(segments: List[Dict[str, str]], duration_sec: float) -> Dict[str, Any]:
202
+ """
203
+ Calculate statistics from diarization results.
204
+
205
+ Args:
206
+ segments: List of speaker segments
207
+ duration_sec: Audio duration in seconds
208
+
209
+ Returns:
210
+ Dict containing diarization statistics
211
+ """
212
+ total_turns = len(segments)
213
+ speakers = set(segment["speaker"] for segment in segments)
214
+ num_speakers = len(speakers)
215
+
216
+ # Format duration as MM:SS
217
+ duration_str = f"{int(duration_sec//60):02d}:{int(duration_sec%60):02d}"
218
+
219
+ return {
220
+ "total_turns": total_turns,
221
+ "num_speakers": num_speakers,
222
+ "duration_seconds": duration_sec,
223
+ "duration_formatted": duration_str,
224
+ "speakers": sorted(list(speakers))
225
+ }
226
+
227
+
228
+ def process_audio_diarization(audio_bytes: bytes, filename: str = None) -> Dict[str, Any]: # type: ignore
229
+ """
230
+ Process audio file for speaker diarization using Gemini 2.5 Pro.
231
+
232
+ This function takes raw audio bytes and returns a structured JSON response
233
+ containing speaker diarization results with segments, statistics, and metadata.
234
+
235
+ Args:
236
+ audio_bytes: Raw audio file bytes
237
+ filename: Optional filename for metadata
238
+
239
+ Returns:
240
+ Dict containing:
241
+ - segments: List of speaker segments with speaker and text
242
+ - statistics: Diarization statistics (speakers, turns, duration)
243
+ - metadata: Processing metadata
244
+ - raw_response: Original Gemini response
245
+
246
+ Raises:
247
+ AudioDiarizationError: If any step of the diarization process fails
248
+ """
249
+ try:
250
+ # Initialize Gemini client
251
+ client = get_gemini_client()
252
+
253
+ # Get audio duration and format
254
+ duration_sec = get_audio_duration(audio_bytes)
255
+ audio_format = detect_audio_format(audio_bytes)
256
+
257
+ # Create API request
258
+ request_kwargs = create_diarization_request(audio_bytes, audio_format)
259
+
260
+ # Make API call to Gemini
261
+ try:
262
+ response = client.models.generate_content(**request_kwargs)
263
+ response_text = response.text
264
+ except Exception as e:
265
+ raise AudioDiarizationError(f"Gemini API call failed: {str(e)}")
266
+
267
+ # Parse response
268
+ segments, raw_json = parse_diarization_response(response_text) # type: ignore
269
+
270
+ # Calculate statistics
271
+ stats = calculate_diarization_stats(segments, duration_sec)
272
+
273
+ # Build final response
274
+ result = {
275
+ "segments": segments,
276
+ "statistics": stats,
277
+ "metadata": {
278
+ "filename": filename,
279
+ "audio_format": audio_format,
280
+ "model_used": "gemini-2.5-pro",
281
+ "processing_status": "success"
282
+ },
283
+ "raw_response": raw_json
284
+ }
285
+
286
+ return result
287
+
288
+ except AudioDiarizationError:
289
+ # Re-raise our custom errors
290
+ raise
291
+ except Exception as e:
292
+ # Catch any unexpected errors
293
+ raise AudioDiarizationError(f"Unexpected error during audio diarization: {str(e)}")
294
+
295
+
296
+ # Example usage and testing function
297
+ def test_diarization_service():
298
+ """
299
+ Test function for the diarization service.
300
+ This is mainly for development and debugging purposes.
301
+ """
302
+ try:
303
+ # This would require an actual audio file to test
304
+ print("Audio diarization service loaded successfully")
305
+ print("Available functions:")
306
+ print("- process_audio_diarization(audio_bytes, filename)")
307
+ print("- get_gemini_client()")
308
+ print("- get_audio_duration(audio_bytes)")
309
+ print("- detect_audio_format(audio_bytes)")
310
+
311
+ # Check if API key is available
312
+ api_key = os.getenv("GEMINI_API_KEY")
313
+ if api_key:
314
+ print("✓ GEMINI_API_KEY found in environment")
315
+ else:
316
+ print("✗ GEMINI_API_KEY not found in environment")
317
+
318
+ except Exception as e:
319
+ print(f"Service test failed: {e}")
320
+
321
+
322
+ if __name__ == "__main__":
323
+ test_diarization_service()
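+
+
+ # Illustrative only: the dictionary returned by process_audio_diarization() has the
+ # following shape (the values below are made-up examples, not real output):
+ #
+ # {
+ #     "segments": [{"speaker": "Speaker 1", "text": "Hello, thank you for calling."}],
+ #     "statistics": {"total_turns": 1, "num_speakers": 1,
+ #                    "duration_seconds": 12.3, "duration_formatted": "00:12",
+ #                    "speakers": ["Speaker 1"]},
+ #     "metadata": {"filename": "call.wav", "audio_format": "wav",
+ #                  "model_used": "gemini-2.5-pro", "processing_status": "success"},
+ #     "raw_response": {"segments": [...]}
+ # }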
services/audio_gemini.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+ from typing import Dict
3
+
4
+ import google.genai as genai
5
+ from dotenv import load_dotenv
6
+ from google.genai.types import Part
7
+
8
+ # Load environment variables from a .env file in the root directory
9
+ load_dotenv()
10
+
11
+ # --- Configuration ---
12
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
13
+ MODEL_ID = "gemini-2.5-pro"
14
+
15
+ # --- Client Initialization ---
16
+ if not GEMINI_API_KEY:
17
+ raise ValueError(
18
+ "GEMINI_API_KEY not found in environment variables. "
19
+ "Please create a .env file in the project root and set the key."
20
+ )
21
+
22
+ # Configure the genai client with the API key
23
+ client = genai.Client(api_key=GEMINI_API_KEY)
24
+
25
+ def _transcribe_audio(audio_bytes: bytes) -> str:
26
+ """
27
+ Sends the raw audio bytes (assumed to be WAV) to the model and returns the transcription as plain text.
28
+ """
29
+ audio_part = Part.from_bytes(data=audio_bytes, mime_type="audio/wav")
30
+ text_part = (
31
+ "You are a world-class transcription engine. "
32
+ "Transcribe the following audio to plain text only, with no extra formatting:\n\n"
33
+ "(Begin audio input)"
34
+ )
35
+
36
+ resp = client.models.generate_content(
37
+ model=MODEL_ID,
38
+ contents=[audio_part,
39
+ text_part
40
+ ]
41
+ )
42
+ return resp.text.strip() # type: ignore
43
+
44
+
45
+ def _translate_to_english(text: str) -> str:
46
+ """
47
+ Detects the language of the input and translates it into English.
48
+ """
49
+ prompt = (
50
+ "You are a world-class translation engine. "
51
+ "Detect the language of the following text and translate it into English. "
52
+ "Return ONLY the translated English text with no extra commentary:\n\n"
53
+ f"{text}"
54
+ )
55
+ resp = client.models.generate_content(
56
+ model=MODEL_ID,
57
+ contents=prompt
58
+ )
59
+ return resp.text.strip() # type: ignore
60
+
61
+
62
+ def process_audio_with_gemini(audio_bytes: bytes) -> Dict[str, str]:
63
+ """
64
+ Processes an audio file by first transcribing it and then translating the
65
+ resulting text to English using the Gemini model.
66
+
67
+ This function orchestrates the transcription and translation calls.
68
+
69
+ Args:
70
+ audio_bytes: The byte content of the audio file.
71
+ mime_type: The MIME type of the audio file (e.g., 'audio/wav', 'audio/mp3').
72
+
73
+ Returns:
74
+ A dictionary containing the 'transcription' and 'translation'.
75
+
76
+ Raises:
77
+ Exception: If there is an error during the API calls to the Gemini model.
78
+ """
79
+ try:
80
+ # Step 1: Transcribe the audio using the internal helper function
81
+ transcription = _transcribe_audio(audio_bytes)
82
+
83
+ # Step 2: Translate the transcription to English if it's not empty
84
+ translation = ""
85
+ if transcription:
86
+ translation = _translate_to_english(transcription)
87
+
88
+ return {"transcription": transcription, "translation": translation}
89
+ except Exception as e:
90
+ # Re-raise the exception with more context to be caught by the API endpoint
91
+ raise Exception(f"Error processing audio with Gemini: {str(e)}")
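+
+
+ # Example usage (a minimal sketch; "sample_call.wav" is a placeholder path):
+ #
+ #     with open("sample_call.wav", "rb") as f:
+ #         result = process_audio_with_gemini(f.read())
+ #     print(result["transcription"])
+ #     print(result["translation"])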
services/audio_whisper.py ADDED
@@ -0,0 +1,81 @@
1
+ import whisper
2
+ import torch
3
+ import tempfile
4
+ import os
5
+ from typing import Dict
6
+
7
+ # Determine the most efficient device available (CUDA if possible, otherwise CPU)
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+ # Load the Whisper model once when the module is imported.
11
+ # This is a time and resource-intensive operation, so it should not be done on every API call.
12
+ try:
13
+ print(f"Loading Whisper model 'large' onto device '{DEVICE}'...")
14
+ model = whisper.load_model("large", device=DEVICE)
15
+ print("Whisper model loaded successfully.")
16
+ except Exception as e:
17
+ print(f"Fatal: Error loading Whisper model: {e}")
18
+ model = None
19
+
20
+ def process_audio_with_whisper(audio_bytes: bytes) -> Dict[str, str]:
21
+ """
22
+ Transcribes and translates a given audio file's bytes using the Whisper model.
23
+
24
+ This function saves the audio bytes to a temporary file and passes the file
25
+ path to Whisper for processing. This is a robust way to handle file access
26
+ and prevent permission errors with ffmpeg, especially on Windows.
27
+
28
+ Args:
29
+ audio_bytes: The raw bytes of the audio file (e.g., WAV, MP3).
30
+
31
+ Returns:
32
+ A dictionary containing the Tagalog transcription and English translation.
33
+ Example: {"transcription": "...", "translation": "..."}
34
+
35
+ Raises:
36
+ ValueError: If the Whisper model was not loaded successfully.
37
+ Exception: If audio processing or model inference fails.
38
+ """
39
+ if model is None:
40
+ raise ValueError("Whisper model is not available or failed to load.")
41
+
42
+ # Create a temporary file to store the audio.
43
+ # Using delete=False is crucial on Windows to allow other processes to open the file by its path.
44
+ # We will manually delete the file in the 'finally' block.
45
+ try:
46
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".tmp") as temp_audio_file:
47
+ temp_path = temp_audio_file.name
48
+ # Write the uploaded audio bytes to the temporary file
49
+ temp_audio_file.write(audio_bytes)
50
+ # The file is automatically closed when exiting the 'with' block
51
+ except Exception as e:
52
+ print(f"Error creating temporary file: {e}")
53
+ raise
54
+
55
+ try:
56
+ # Perform transcription using the file path
57
+ transcription_result = model.transcribe(
58
+ temp_path,
59
+ language="tl",
60
+ task="transcribe"
61
+ )
62
+
63
+ # Perform translation using the same file path
64
+ translation_result = model.transcribe(
65
+ temp_path,
66
+ language="tl",
67
+ task="translate"
68
+ )
69
+
70
+ return {
71
+ "transcription": transcription_result.get('text', '').strip(), # type: ignore
72
+ "translation": translation_result.get('text', '').strip() # type: ignore
73
+ }
74
+ except Exception as e:
75
+ # Log and re-raise any exceptions to be handled by the FastAPI endpoint
76
+ print(f"An error occurred during Whisper processing: {e}")
77
+ raise
78
+ finally:
79
+ # Ensure the temporary file is deleted after processing
80
+ if 'temp_path' in locals() and os.path.exists(temp_path):
81
+ os.remove(temp_path)
services/image_ocr_processor.py ADDED
@@ -0,0 +1,807 @@
1
+ import io
2
+ import re
3
+ import math
4
+ import json
5
+ import os
6
+ import torch
7
+ import nltk
8
+ from nltk.corpus import words as nltk_words
9
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
10
+ from PIL import Image
11
+ from google import genai
12
+ from dotenv import load_dotenv
13
+ from difflib import SequenceMatcher
14
+ from io import BytesIO
15
+ from pdf2image import convert_from_bytes
16
+
17
+ from google.genai.types import (
18
+ Part
19
+ )
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # Download NLTK data
25
+ try:
26
+ nltk.data.find('corpora/words')
27
+ except LookupError:
28
+ nltk.download("words", quiet=True)
29
+
30
+ # ─────────────────────────────────────────────────────────────
31
+ # 0) Process PDF to Image
32
+ # ─────────────────────────────────────────────────────────────
33
+ def process_pdf_to_image(pdf_bytes):
34
+ """
35
+ Convert PDF to image for processing
36
+ """
37
+ try:
38
+ # Convert PDF to images (first page only)
39
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
40
+ if not images:
41
+ raise Exception("Could not convert PDF to image")
42
+
43
+ # Convert PIL image to bytes
44
+ img_byte_arr = io.BytesIO()
45
+ images[0].save(img_byte_arr, format='PNG')
46
+ img_byte_arr = img_byte_arr.getvalue()
47
+
48
+ return img_byte_arr
49
+ except Exception as e:
50
+ raise Exception(f"PDF processing failed: {str(e)}")
51
+
52
+ # ─────────────────────────────────────────────────────────────
53
+ # 1) API key and document prompts
54
+ # ─────────────────────────────────────────────────────────────
55
+
56
+ # Load environment variables and configure Gemini API
57
+ API_KEY = os.getenv("GEMINI_API_KEY")
58
+ if not API_KEY:
59
+ raise RuntimeError("GEMINI_API_KEY not set in environment. Please create a .env file with your API key.")
60
+
61
+ # Configure the SDK
62
+ client = genai.Client(vertexai=False, api_key=API_KEY)
63
+
64
+ # Initialize your model
65
+ MODEL_ID = "gemini-1.5-flash"
66
+
67
+ # ─────────────────────────────────────────────────────────────
68
+ # 2) Ground Truth Data
69
+ # ─────────────────────────────────────────────────────────────
70
+
71
+ GROUND_TRUTHS = {
72
+ "CIF-Good.png": '''{
73
+ "document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
74
+ "bank_name": "BPI",
75
+ "personal_information": {
76
+ "rm_no": null,
77
+ "last_name": "Garnet",
78
+ "first_name": "Lawrence",
79
+ "middle_name": "Dela Cruz",
80
+ "suffix": "III",
81
+ "date_of_birth": "10/21/1962",
82
+ "place_of_birth": "Rizal, Philippines",
83
+ "citizenship": null,
84
+ "sex": "Male",
85
+ "marital_status": "Married",
86
+ "mother_s_full_maiden_name": "Rosa H. Dela Cruz",
87
+ "spouse_name": "Marion V. Garnet",
88
+ "tin_number": null,
89
+ "sss_number": null,
90
+ "spouse_birthdate": "8/10/1965",
91
+ "id_presented": {
92
+ "id_type": "Drivers",
93
+ "id_number": "2961781134"
94
+ },
95
+ "no_of_children": 2,
96
+ "highest_educational_attainment": "College Graduate"
97
+ },
98
+ "contact_information": {
99
+ "mobile_no": "+63 917 926 9175",
100
+ "landline_no": null,
101
+ "email_address": "[email protected]",
102
+ "home_address": "Amorsolo St. Brgy. Aguinaldo",
103
+ "country": "Philippines",
104
+ "zip_code": "1366",
105
+ "district_town": null,
106
+ "city_municipality_provice": "Rizal",
107
+ "residence_since_mm_dd_yyyy": null,
108
+ "home_ownership": "Owned"
109
+ },
110
+ "financial_information": {
111
+ "profession_business_name": "Name",
112
+ "date_hired": "01/10/2012",
113
+ "employer_business_address": "[email protected]",
114
+ "position_rank": "Assistant VP",
115
+ "nature_of_business_self_employment": "Sales",
116
+ "source_of_income_wealth": {
117
+ "monthly_income": 110000
118
+ }
119
+ },
120
+ "fatca_declaration": {
121
+ "i_am_not_a_us_person": true,
122
+ "i_am_a_us_person": false,
123
+ "us_person_details": {
124
+ "us_citizen": false,
125
+ "us_resident_green_card": false,
126
+ "us_tin": false,
127
+ "us_id": false,
128
+ "w9_submitted": false,
129
+ "us_place_of_birth_1": null,
130
+ "us_place_of_birth_2": null,
131
+ "required_documents_submitted": {
132
+ "w8_ben": null,
133
+ "certificate_of_loss_of_us_nationality": null,
134
+ "written_explanation_not_having_certificate_despite_renunciation": null,
135
+ "written_explanation_why_us_citizenship_not_obtained_at_birth": null
136
+ }
137
+ }
138
+ },
139
+ "certification_and_authorization": {
140
+ "customer_signature": null,
141
+ "date": "02/03/25"
142
+ },
143
+ "for_bank_use_only": {
144
+ "remarks": null,
145
+ "processed_and_signature_verified_by": "Simon Eulalia",
146
+ "approved_by": "Ray Hernandez"
147
+ },
148
+ "form_no": "BPI-CISS IND-02222022"
149
+ }''',
150
+
151
+ "CIF-bad.jpg": '''{
152
+ "document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
153
+ "bank_name": "BPI",
154
+ "personal_information": {
155
+ "rm_no": null,
156
+ "last_name": "Garnet",
157
+ "first_name": "Lawrence",
158
+ "middle_name": "Dela Cruz",
159
+ "suffix": "III",
160
+ "date_of_birth": "10/21/1962",
161
+ "place_of_birth": "Rizal, Philippines",
162
+ "citizenship": null,
163
+ "sex": "Male",
164
+ "marital_status": "Married",
165
+ "mother_s_full_maiden_name": "Rosa H. Dela Cruz",
166
+ "spouse_name": "Marion V. Garnet",
167
+ "tin_number": null,
168
+ "sss_number": null,
169
+ "spouse_birthdate": "8/10/1965",
170
+ "id_presented": {
171
+ "id_type": "Drivers",
172
+ "id_number": "2961781134"
173
+ },
174
+ "no_of_children": 2,
175
+ "highest_educational_attainment": "College Graduate"
176
+ },
177
+ "contact_information": {
178
+ "mobile_no": "+63 917 926 9175",
179
+ "landline_no": null,
180
+ "email_address": "[email protected]",
181
+ "home_address": "Amorsolo St. Brgy. Aguinaldo",
182
+ "country": "Philippines",
183
+ "zip_code": "1366",
184
+ "district_town": null,
185
+ "city_municipality_provice": "Rizal",
186
+ "residence_since_mm_dd_yyyy": null,
187
+ "home_ownership": "Owned"
188
+ },
189
+ "financial_information": {
190
+ "profession_business_name": "Name",
191
+ "date_hired": "01/10/2012",
192
+ "employer_business_address": "[email protected]",
193
+ "position_rank": "Assistant VP",
194
+ "nature_of_business_self_employment": "Sales",
195
+ "source_of_income_wealth": {
196
+ "monthly_income": 110000
197
+ }
198
+ },
199
+ "fatca_declaration": {
200
+ "i_am_not_a_us_person": true,
201
+ "i_am_a_us_person": false,
202
+ "us_person_details": {
203
+ "us_citizen": false,
204
+ "us_resident_green_card": false,
205
+ "us_tin": false,
206
+ "us_id": false,
207
+ "w9_submitted": false,
208
+ "us_place_of_birth_1": null,
209
+ "us_place_of_birth_2": null,
210
+ "required_documents_submitted": {
211
+ "w8_ben": null,
212
+ "certificate_of_loss_of_us_nationality": null,
213
+ "written_explanation_not_having_certificate_despite_renunciation": null,
214
+ "written_explanation_why_us_citizenship_not_obtained_at_birth": null
215
+ }
216
+ }
217
+ },
218
+ "certification_and_authorization": {
219
+ "customer_signature": null,
220
+ "date": "02/03/25"
221
+ },
222
+ "for_bank_use_only": {
223
+ "remarks": null,
224
+ "processed_and_signature_verified_by": "Simon Eulalia",
225
+ "approved_by": "Ray Hernandez"
226
+ },
227
+ "form_no": "BPI-CISS IND-02222022"
228
+ }''',
229
+
230
+ "DF-Good.jpg": '''{
231
+ "document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
232
+ "copy_type": "BANK'S_COPY",
233
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
234
+ "transaction_details": {
235
+ "date": "03/29/14",
236
+ "transaction_type": {
237
+ "deposit": true,
238
+ "payment": false,
239
+ "bills_purchase": false
240
+ },
241
+ "account_type": {
242
+ "savings": true,
243
+ "current": false
244
+ },
245
+ "currency": {
246
+ "peso": false,
247
+ "us_dollar": true,
248
+ "others": false
249
+ }
250
+ },
251
+ "account_details": {
252
+ "account_number": "05039947290",
253
+ "account_name_merchant_name": "Amaia Skies"
254
+ },
255
+ "deposit_payment_breakdown": {
256
+ "cash_amount": null,
257
+ "checks": [{
258
+ "amount": 1000000.0,
259
+ "bank": null,
260
+ "date": null,
261
+ "details": null
262
+ }],
263
+ "total_deposits_payment": null
264
+ },
265
+ "teller_validation_bank_copy": null,
266
+ "for_bills_purchase_accommodation": {
267
+ "representative_full_name": "Amie Skies",
268
+ "contact_number": "0917 872 0056",
269
+ "signature_over_printed_name": "present",
270
+ "form_no": "BPI-BPDEP MAN-01222020"
271
+ },
272
+ "client_s_copy_teller_validation": null
273
+ }''',
274
+
275
+ "DF-bad.jpeg": '''{
276
+ "document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
277
+ "copy_type": "BANK'S_COPY",
278
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
279
+ "transaction_details": {
280
+ "date": "03/29/14",
281
+ "transaction_type": {
282
+ "deposit": true,
283
+ "payment": false,
284
+ "bills_purchase": false
285
+ },
286
+ "account_type": {
287
+ "savings": true,
288
+ "current": false
289
+ },
290
+ "currency": {
291
+ "peso": false,
292
+ "us_dollar": true,
293
+ "others": false
294
+ }
295
+ },
296
+ "account_details": {
297
+ "account_number": "05039947290",
298
+ "account_name_merchant_name": "Amaia Skies"
299
+ },
300
+ "deposit_payment_breakdown": {
301
+ "cash_amount": null,
302
+ "checks": [{
303
+ "amount": 1000000.0,
304
+ "bank": null,
305
+ "date": null,
306
+ "details": null
307
+ }],
308
+ "total_deposits_payment": null
309
+ },
310
+ "teller_validation_bank_copy": null,
311
+ "for_bills_purchase_accommodation": {
312
+ "representative_full_name": "Amie Skies",
313
+ "contact_number": "0917 872 0056",
314
+ "signature_over_printed_name": "present",
315
+ "form_no": "BPI-BPDEP MAN-01222020"
316
+ },
317
+ "client_s_copy_teller_validation": null
318
+ }''',
319
+
320
+ "DB-Good.jpg": '''{
321
+ "document_type": "DEPOSIT / PAYMENT SLIP BACK",
322
+ "bank_name": "BANK OF THE PHILIPPINE ISLANDS",
323
+ "sections": {
324
+ "check_details_top": {
325
+ "checks": [{
326
+ "name_of_bank_branch": "Olanggapo",
327
+ "check_no": "0543729",
328
+ "amount": 100000.0
329
+ }],
330
+ "total_checks": null,
331
+ "total_cash": null,
332
+ "total_deposits_payment": null
333
+ },
334
+ "deposit_cash_breakdown": {
335
+ "items": [
336
+ {"no_of_pieces": 100, "denominations": 100, "amount": 1000},
337
+ {"no_of_pieces": 200, "denominations": 200, "amount": 200},
338
+ {"no_of_pieces": 300, "denominations": 300, "amount": 1500},
339
+ {"no_of_pieces": 500, "denominations": 400, "amount": 1250},
340
+ {"no_of_pieces": 600, "denominations": 600, "amount": 1750},
341
+ {"no_of_pieces": 700, "denominations": 700, "amount": 6350},
342
+ {"no_of_pieces": 800, "denominations": 800, "amount": 8750}
343
+ ],
344
+ "total": 10000750000
345
+ },
346
+ "representative_information": {
347
+ "full_name": "Anna Banana Cruz",
348
+ "contact_number": "09178123775",
349
+ "address": "11, Tower 2, City Residences, Manila",
350
+ "citizenship": "Japanese",
351
+ "date_of_birth": "03/31/2001",
352
+ "place_of_birth": "Bulacan",
353
+ "signature": null
354
+ },
355
+ "client_copy": {
356
+ "document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
357
+ "for_payments_only": {
358
+ "policy_plan_reference_no": null,
359
+ "policy_planholder_name": null,
360
+ "bp_customer_number": "03756245"
361
+ },
362
+ "check_details": {
363
+ "checks": [{
364
+ "bank_branch_name": "P. Tuazon",
365
+ "check_no": "0347345",
366
+ "amount": 100200200
367
+ }],
368
+ "total_checks": 800000,
369
+ "total_cash": 20000,
370
+ "total_deposits_payment": 820000
371
+ }
372
+ }
373
+ }
374
+ }''',
375
+
376
+ "DB-Bad.jpg": '''{
377
+ "document_type": "DEPOSIT / PAYMENT SLIP BACK",
378
+ "bank_name": "BANK OF THE PHILIPPINE ISLANDS",
379
+ "sections": {
380
+ "check_details_top": {
381
+ "checks": [{
382
+ "name_of_bank_branch": "Olanggapo",
383
+ "check_no": "0543729",
384
+ "amount": 100000.0
385
+ }],
386
+ "total_checks": null,
387
+ "total_cash": null,
388
+ "total_deposits_payment": null
389
+ },
390
+ "deposit_cash_breakdown": {
391
+ "items": [
392
+ {"no_of_pieces": 100, "denominations": 100, "amount": 1000},
393
+ {"no_of_pieces": 200, "denominations": 200, "amount": 200},
394
+ {"no_of_pieces": 300, "denominations": 300, "amount": 1500},
395
+ {"no_of_pieces": 500, "denominations": 400, "amount": 1250},
396
+ {"no_of_pieces": 600, "denominations": 600, "amount": 1750},
397
+ {"no_of_pieces": 700, "denominations": 700, "amount": 6350},
398
+ {"no_of_pieces": 800, "denominations": 800, "amount": 8750}
399
+ ],
400
+ "total": 10000750000
401
+ },
402
+ "representative_information": {
403
+ "full_name": "Anna Banana Cruz",
404
+ "contact_number": "09178123775",
405
+ "address": "11, Tower 2, City Residences, Manila",
406
+ "citizenship": "Japanese",
407
+ "date_of_birth": "03/31/2001",
408
+ "place_of_birth": "Bulacan",
409
+ "signature": null
410
+ },
411
+ "client_copy": {
412
+ "document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
413
+ "for_payments_only": {
414
+ "policy_plan_reference_no": null,
415
+ "policy_planholder_name": null,
416
+ "bp_customer_number": "03756245"
417
+ },
418
+ "check_details": {
419
+ "checks": [{
420
+ "bank_branch_name": "P. Tuazon",
421
+ "check_no": "0347345",
422
+ "amount": 100200200
423
+ }],
424
+ "total_checks": 800000,
425
+ "total_cash": 20000,
426
+ "total_deposits_payment": 820000
427
+ }
428
+ }
429
+ }
430
+ }''',
431
+
432
+ "WF-Good.jpg": '''{
433
+ "document_type": "WITHDRAWAL SLIP",
434
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
435
+ "withdrawal_slip_details": {
436
+ "currency_type": "US DOLLAR",
437
+ "account_type": "CURRENT",
438
+ "account_number": "3456777799",
439
+ "account_name": "Maxine Yu",
440
+ "teller_validation": null
441
+ },
442
+ "withdrawal_amount": {
443
+ "amount_in_numbers": "USD 50,000"
444
+ },
445
+ "depositor_information": {
446
+ "signature_of_depositor": "present",
447
+ "date": null
448
+ },
449
+ "withdrawal_through_representative": {
450
+ "name_in_print": "Mark Garcia",
451
+ "signature_of_representative": "present",
452
+ "contact_no": "0918 251 0226",
453
+ "depositor_authorization_signatures": [
454
+ {"signature": "present", "date": "05/19/25"},
455
+ {"signature": "present", "date": "05/19/25"}
456
+ ]
457
+ },
458
+ "payment_received_by": {
459
+ "signature": "present",
460
+ "name": "Marco Polo"
461
+ },
462
+ "bank_use_only": {
463
+ "remarks": null,
464
+ "verified_by": null,
465
+ "approved_by": null
466
+ },
467
+ "form_no": "BPI-WDL OTC-01222020"
468
+ }''',
469
+
470
+ "WF-Bad.jpg": '''{
471
+ "document_type": "WITHDRAWAL SLIP",
472
+ "bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
473
+ "withdrawal_slip_details": {
474
+ "currency_type": "US DOLLAR",
475
+ "account_type": "CURRENT",
476
+ "account_number": "3456777799",
477
+ "account_name": "Maxine Yu",
478
+ "teller_validation": null
479
+ },
480
+ "withdrawal_amount": {
481
+ "amount_in_numbers": "USD 50,000"
482
+ },
483
+ "depositor_information": {
484
+ "signature_of_depositor": "present",
485
+ "date": null
486
+ },
487
+ "withdrawal_through_representative": {
488
+ "name_in_print": "Mark Garcia",
489
+ "signature_of_representative": "present",
490
+ "contact_no": "0918 251 0226",
491
+ "depositor_authorization_signatures": [
492
+ {"signature": "present", "date": "05/19/25"},
493
+ {"signature": "present", "date": "05/19/25"}
494
+ ]
495
+ },
496
+ "payment_received_by": {
497
+ "signature": "present",
498
+ "name": "Marco Polo"
499
+ },
500
+ "bank_use_only": {
501
+ "remarks": null,
502
+ "verified_by": null,
503
+ "approved_by": null
504
+ },
505
+ "form_no": "BPI-WDL OTC-01222020"
506
+ }''',
507
+
508
+ "WB-Good.jpg": '''{
509
+ "document_type": "WITHDRAWAL SLIP BACK",
510
+ "denominations_breakdown": {
511
+ "items": [
512
+ {"no_of_pieces": 1, "denomination": 100, "amount": 100},
513
+ {"no_of_pieces": 2, "denomination": 500, "amount": 1000},
514
+ {"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
515
+ ],
516
+ "total": null
517
+ },
518
+ "representative_information": {
519
+ "full_name": "Mark Garcia",
520
+ "contact_number": "0918 251 3372",
521
+ "address": "1F Tower 1, SMDC, Camarines, Sur",
522
+ "citizenship": "American",
523
+ "date_of_birth": "12/15/2001",
524
+ "place_of_birth": "Bicol",
525
+ "signature": "present"
526
+ }
527
+ }''',
528
+
529
+ "WB-bad.jpeg": '''{
530
+ "document_type": "WITHDRAWAL SLIP BACK",
531
+ "denominations_breakdown": {
532
+ "items": [
533
+ {"no_of_pieces": 1, "denomination": 100, "amount": 100},
534
+ {"no_of_pieces": 2, "denomination": 500, "amount": 1000},
535
+ {"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
536
+ ],
537
+ "total": null
538
+ },
539
+ "representative_information": {
540
+ "full_name": "Mark Garcia",
541
+ "contact_number": "0918 251 3372",
542
+ "address": "1F Tower 1, SMDC, Camarines, Sur",
543
+ "citizenship": "American",
544
+ "date_of_birth": "12/15/2001",
545
+ "place_of_birth": "Bicol",
546
+ "signature": "present"
547
+ }
548
+ }'''
549
+ }
550
+
551
+ # ─────────────────────────────────────────────────────────────
552
+ # 3) Evaluation + helper functions
553
+ # ─────────────────────────────────────────────────────────────
554
+
555
+ def compute_cer(gt, pred):
556
+ """Compute Character Error Rate."""
557
+ m, n = len(gt), len(pred)
558
+ dp = [[0]*(n+1) for _ in range(m+1)]
559
+ for i in range(m+1):
560
+ dp[i][0] = i
561
+ for j in range(n+1):
562
+ dp[0][j] = j
563
+ for i in range(1,m+1):
564
+ for j in range(1,n+1):
565
+ cost = 0 if gt[i-1]==pred[j-1] else 1
566
+ dp[i][j] = min(dp[i-1][j]+1, dp[i][j-1]+1, dp[i-1][j-1]+cost)
567
+ return dp[m][n]/max(m,1)
568
+
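+ # Worked example (illustrative): compute_cer("abc", "abd") needs one substitution over
+ # three ground-truth characters, so it returns 1/3 ≈ 0.333; identical strings return 0.0.
+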
569
+ def extract_flat(o, parent=""):
570
+ """Extract flat key-value pairs from nested JSON."""
571
+ out = []
572
+ if isinstance(o, dict):
573
+ for k,v in o.items():
574
+ key = f"{parent}.{k}" if parent else k
575
+ out += extract_flat(v, key)
576
+ elif isinstance(o, list):
577
+ for i,v in enumerate(o):
578
+ out += extract_flat(v, f"{parent}[{i}]")
579
+ else:
580
+ out.append((parent, str(o)))
581
+ return out
582
+
583
+ def compute_field_accuracy(gt_json, pred_json):
584
+ """Compute strict field accuracy."""
585
+ try:
586
+ gt = dict(extract_flat(json.loads(gt_json)))
587
+ pr = dict(extract_flat(json.loads(pred_json)))
588
+ except:
589
+ return 0.0
590
+ total = len(gt)
591
+ correct = sum(1 for k,v in gt.items() if pr.get(k)==v)
592
+ return correct / total if total else 0.0
593
+
594
+ def field_matches(gt, pred, max_err_pct=0.1):
595
+ """Check if fields match with fuzzy matching."""
596
+ gt = re.sub(r'[^\w\s]', '', str(gt).lower().strip())
597
+ pred = re.sub(r'[^\w\s]', '', str(pred).lower().strip())
598
+ if not gt and not pred:
599
+ return True
600
+ return (1 - SequenceMatcher(None, gt, pred).ratio()) <= max_err_pct
601
+
602
+ def compute_fuzzy_field_accuracy(gt_json, pred_json):
603
+ """Compute fuzzy field accuracy."""
604
+ try:
605
+ gt = dict(extract_flat(json.loads(gt_json)))
606
+ pr = dict(extract_flat(json.loads(pred_json)))
607
+ except:
608
+ return 0.0
609
+ total = len(gt)
610
+ correct = sum(1 for k,v in gt.items() if field_matches(v, pr.get(k, "")))
611
+ return correct / total if total else 0.0
612
+
613
+ def canonicalize(js):
614
+ """Canonicalize JSON string."""
615
+ return json.dumps(json.loads(js), sort_keys=True, separators=(',', ':'))
616
+
617
+ def clean_json_string(js):
618
+ """Clean JSON string by removing markdown formatting."""
619
+ return re.sub(r'```(?:json)?\s*|\s*```', '', js.strip(), flags=re.DOTALL)
620
+
621
+ def extract_values_from_jsonlike(text):
622
+ """Extract all string values from JSON-like text."""
623
+ text = re.sub(r'[{}[\]",:]', ' ', text)
624
+ text = re.sub(r'\s+', ' ', text).strip()
625
+ return text
626
+
627
+ def compute_spelling_error_rate(text):
628
+ """Compute spelling error rate using NLTK words corpus."""
629
+ words = text.lower().split()
630
+ if not words:
631
+ return 0.0
632
+
633
+ english_words = set(nltk_words.words())
634
+ misspelled = sum(1 for word in words if word.isalpha() and word not in english_words)
635
+ return misspelled / len(words)
636
+
637
+ def compute_perplexity(text):
638
+ """Compute perplexity using GPT-2 model."""
639
+ try:
640
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
641
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
642
+
643
+ tokenizer.pad_token = tokenizer.eos_token
644
+ inputs = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
645
+
646
+ with torch.no_grad():
647
+ outputs = model(inputs, labels=inputs)
648
+ loss = outputs.loss
649
+
650
+ return math.exp(loss.item())
651
+ except Exception as e:
652
+ print(f"Error computing perplexity: {e}")
653
+ return float('inf')
654
+
655
+ def compute_refined_metrics(text):
656
+ """Compute refined spelling error rate with additional checks."""
657
+ words = text.lower().split()
658
+ if not words:
659
+ return 0.0
660
+
661
+ english_words = set(nltk_words.words())
662
+
663
+ errors = 0
664
+ for word in words:
665
+ if not word.isalpha():
666
+ continue
667
+
668
+ if word not in english_words:
669
+ corrected = word.replace('0', 'o').replace('1', 'l').replace('5', 's')
670
+ if corrected not in english_words:
671
+ errors += 1
672
+
673
+ return errors / len(words)
674
+
675
+ # ─────────────────────────────────────────────────────────────
676
+ # 4) Main processing function
677
+ # ─────────────────────────────────────────────────────────────
678
+
679
+ def process_document_image(image_bytes, filename=None):
680
+ """Process a document image and return extracted information and metrics."""
681
+ try:
682
+ # 1) load
683
+ image = Image.open(BytesIO(image_bytes))
684
+ img_format = image.format or "PNG"
685
+
686
+ # 2) compress & resize loop → ensure <4 MB
687
+ buf = BytesIO()
688
+ image.save(buf, format=img_format, optimize=True, quality=85)
689
+ while buf.getbuffer().nbytes > 4_000_000:
690
+ w, h = image.size
691
+ image = image.resize((int(w * 0.8), int(h * 0.8)), Image.LANCZOS) # type: ignore
692
+ buf = BytesIO()
693
+ image.save(buf, format=img_format, optimize=True, quality=85)
694
+
695
+ img_bytes = buf.getvalue() # this is your final image payload
696
+
697
+ ocr_prompt = "Extract all visible printed and handwritten text from this scanned bank document image."
698
+
699
+ image_part = {
700
+ "inlineData": {
701
+ "mimeType": "image/png",
702
+ "data": img_bytes
703
+ }
704
+ }
705
+
706
+ response = client.models.generate_content(
707
+ model=MODEL_ID,
708
+ contents=[
709
+ Part.from_bytes(data=img_bytes, mime_type="image/png"),
710
+ ocr_prompt
711
+ ])
712
+
713
+ raw_text = response.text.strip() # type: ignore
714
+ print("--- Raw OCR Text ---\n", raw_text[:1000], "\n")
715
+
716
+ # Extract JSON with Gemini from OCR
717
+ schema_prompt = (
718
+ "You are a JSON extractor for bank forms. Given the OCR text from a scanned image, "
719
+ "output ONLY valid JSON matching the correct schema, using null for blanks.\n\n"
720
+ "--- CIF Example:\n" + GROUND_TRUTHS["CIF-Good.png"] + "\n\n"
721
+ "--- DF Example:\n" + GROUND_TRUTHS["DF-Good.jpg"] + "\n\n"
722
+ "--- DB Example:\n" + GROUND_TRUTHS["DB-Good.jpg"] + "\n\n"
723
+ "--- WF Example:\n" + GROUND_TRUTHS["WF-Good.jpg"] + "\n\n"
724
+ "--- WB Example:\n" + GROUND_TRUTHS["WB-Good.jpg"] + "\n\n"
725
+ "Now extract JSON from this OCR text:\n" + raw_text
726
+ )
727
+
728
+ final = client.models.generate_content(
729
+ model=MODEL_ID,
730
+ contents=[schema_prompt]
731
+ )
732
+
733
+ pred_json = clean_json_string(final.text)
734
+
735
+ print("--- Extracted JSON ---\n", pred_json)
736
+
737
+ # Parse the extracted JSON
738
+ try:
739
+ extracted_data = json.loads(pred_json)
740
+ except json.JSONDecodeError:
741
+ extracted_data = {
742
+ "document_type": "unknown",
743
+ "raw_text": pred_json
744
+ }
745
+
746
+ # Compute basic metrics
747
+ clean_text = extract_values_from_jsonlike(pred_json)
748
+ ser = compute_spelling_error_rate(clean_text)
749
+
750
+ try:
751
+ ppl = compute_perplexity(clean_text)
752
+ except:
753
+ ppl = float("inf")
754
+
755
+ refined_ser = compute_refined_metrics(clean_text)
756
+
757
+ # Evaluate against ground truth if available
758
+ cer_score = 0.0
759
+ strict_accuracy = 0.0
760
+ fuzzy_accuracy = 0.0
761
+
762
+ if filename and filename in GROUND_TRUTHS:
763
+ gt_json = clean_json_string(GROUND_TRUTHS[filename])
764
+ try:
765
+ gt_can = canonicalize(gt_json)
766
+ pred_can = canonicalize(pred_json)
767
+ cer_score = compute_cer(gt_can, pred_can)
768
+ strict_accuracy = compute_field_accuracy(gt_json, pred_json)
769
+ fuzzy_accuracy = compute_fuzzy_field_accuracy(gt_json, pred_json)
770
+ except Exception as e:
771
+ print(f"Error in evaluation: {e}")
772
+ else:
773
+ print("⚠️ No ground truth available for this file.")
774
+
775
+ # Prepare metrics with proper handling of infinite values
776
+ metrics = {
777
+ "ser": ser,
778
+ "ppl": 999999.0 if ppl == float("inf") else ppl, # Replace inf with large finite value
779
+ "refined_ser": refined_ser,
780
+ "cer": cer_score,
781
+ "strict_field_accuracy": strict_accuracy,
782
+ "fuzzy_field_accuracy": fuzzy_accuracy
783
+ }
784
+
785
+ return {
786
+ "document_type": extracted_data.get("document_type", "unknown"),
787
+ "extracted": extracted_data,
788
+ "metrics": metrics,
789
+ "raw_text": raw_text,
790
+ "extracted_json": pred_json
791
+ }
792
+
793
+ except Exception as e:
794
+ print(f"Error processing document: {e}")
795
+ return {
796
+ "error": str(e),
797
+ "document_type": "unknown",
798
+ "extracted": {},
799
+ "metrics": {
800
+ "ser": 0.0,
801
+ "ppl": 999999.0, # Replace inf with large finite value
802
+ "refined_ser": 0.0,
803
+ "cer": 0.0,
804
+ "strict_field_accuracy": 0.0,
805
+ "fuzzy_field_accuracy": 0.0
806
+ }
807
+ }
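+
+
+ # Example usage (a minimal sketch; "CIF-Good.png" matches a GROUND_TRUTHS key, so the
+ # ground-truth metrics are only meaningful for those sample files):
+ #
+ #     with open("CIF-Good.png", "rb") as f:
+ #         result = process_document_image(f.read(), filename="CIF-Good.png")
+ #     print(result["document_type"], result["metrics"]["fuzzy_field_accuracy"])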
services/text_processor.py ADDED
@@ -0,0 +1 @@
1
+ # PLACEHOLDER