Upload 16 files
- .env.example +1 -0
- .gitignore +60 -0
- Dockerfile +29 -0
- README.md +70 -12
- api.py +244 -0
- requirements.txt +26 -0
- services/__pycache__/audio_diarization.cpython-311.pyc +0 -0
- services/__pycache__/audio_gemini.cpython-311.pyc +0 -0
- services/__pycache__/audio_whisper.cpython-311.pyc +0 -0
- services/__pycache__/image_ocr_processor.cpython-311.pyc +0 -0
- services/__pycache__/ocr_processor.cpython-311.pyc +0 -0
- services/audio_diarization.py +323 -0
- services/audio_gemini.py +91 -0
- services/audio_whisper.py +81 -0
- services/image_ocr_processor.py +807 -0
- services/text_processor.py +1 -0
.env.example
ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY= # YOUR API KEY HERE

.gitignore
ADDED
@@ -0,0 +1,60 @@
# Environment variables
.env
.env.local
.env.development
.env.test
.env.production

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environment
venv/
env/
ENV/
env.bak/
venv.bak/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Logs
*.log
logs/

# Temporary files
*.tmp
*.temp

Dockerfile
ADDED
@@ -0,0 +1,29 @@
FROM python:3.11.8

WORKDIR /

# Copy requirements.txt to the container
COPY requirements.txt ./

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Add a non-root user to run the application
RUN useradd -m -u 1000 user

# Set the user and home directory environment variables
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Create the application directory
WORKDIR $HOME/app

# Copy the application code and model files
COPY --chown=user . $HOME/app/

# Expose the port the FastAPI app runs on
EXPOSE 7860

# Command to run the FastAPI app
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]

README.md
CHANGED
@@ -1,12 +1,70 @@
# Document Processor Backend

This is the FastAPI backend for the Document Processor application that uses Google's Gemini AI to extract structured information from bank documents.

## Setup Instructions

### 1. Environment Variables

To use this application, you need to set up your Gemini API key:

1. **Get your Gemini API key:**
   - Go to [Google AI Studio](https://makersuite.google.com/app/apikey)
   - Create a new API key
   - Copy the API key

2. **Create a `.env` file:**
   ```bash
   # In the backend directory, create a .env file
   cp .env.example .env
   ```

3. **Edit the `.env` file:**
   ```bash
   # Replace 'your_actual_api_key_here' with your real API key
   GEMINI_API_KEY=your_actual_api_key_here
   ```

### 2. Install Dependencies

```bash
# Create virtual environment
python3 -m venv venv

# Activate virtual environment
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt
```

### 3. Run the Server

```bash
# Make sure virtual environment is activated
source venv/bin/activate

# Start the server
uvicorn api:app --reload --host 0.0.0.0 --port 8000
```

The server will be available at `http://localhost:8000`.

## API Endpoints

- `GET /` - Redirects to the interactive API docs
- `GET /health` - Health check
- `POST /audio/whisper` - Transcribe and translate audio with Whisper
- `POST /audio/gemini` - Transcribe and translate audio with Gemini
- `POST /audio/diarization` - Speaker diarization with Gemini
- `POST /image/process-document` - Extract information from an uploaded image or PDF
- `POST /text` - Simple text-to-insights endpoint

## Security Notes

- The `.env` file is automatically ignored by git to prevent accidentally committing your API key
- Never commit your actual API key to version control
- Keep your API key secure and don't share it publicly

## Troubleshooting

If you get an error about `GEMINI_API_KEY not set in environment`, make sure:
1. You've created the `.env` file in the backend directory
2. You've added your actual API key to the file
3. The virtual environment is activated when running the server

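Once the server is running, a minimal Python smoke test could look like the sketch below (assumptions: the server is on `http://localhost:8000` as configured above, the `requests` package from `requirements.txt` is installed, and the sample text is arbitrary):

```python
# Minimal smoke test against a locally running server (assumed at localhost:8000).
import requests

BASE_URL = "http://localhost:8000"

# Health check endpoint defined in api.py
print(requests.get(f"{BASE_URL}/health").json())

# /text expects a JSON body matching the TextRequest model: {"text": "..."}
resp = requests.post(
    f"{BASE_URL}/text",
    json={"text": "Customer asked about a delayed card delivery."},
)
print(resp.json())
```
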
api.py
ADDED
@@ -0,0 +1,244 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse
import uvicorn
from typing import Any, Dict
from pydantic import BaseModel

from services.audio_whisper import process_audio_with_whisper
from services.audio_gemini import process_audio_with_gemini
from services.audio_diarization import process_audio_diarization, AudioDiarizationError
from services.image_ocr_processor import process_pdf_to_image, process_document_image

class TextRequest(BaseModel):
    text: str


class HelloWorldResponse(BaseModel):
    message: str
    received_text: str
    status: str

app = FastAPI(
    title="Contact Center Operation Insights",
    version="1.0.0"
)

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def docs():
    return RedirectResponse(url="/docs")

@app.post("/audio/whisper", response_model=Dict[str, str])
async def audio_whisper(audio: UploadFile = File(...)):
    """
    Transcribes and translates an audio file using OpenAI's Whisper model.
    """
    # Basic validation for audio content types. Whisper is robust, but this
    # prevents obviously incorrect file types from being processed.
    if not audio.content_type or not audio.content_type.startswith('audio/'):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
        )

    try:
        # Read the content of the uploaded audio file into memory
        audio_bytes = await audio.read()

        # Call the dedicated service to process the audio
        result = process_audio_with_whisper(audio_bytes)

        return result

    except Exception as e:
        # Catch exceptions from the audio processing service or file reading
        raise HTTPException(status_code=500, detail=f"Audio processing failed: {str(e)}")

@app.post("/audio/gemini", response_model=Dict[str, str])
async def audio_gemini(audio: UploadFile = File(...)):
    """
    Receives an audio file, transcribes it, and translates the transcription
    to English using the Google Gemini 2.5 Pro model.
    """
    if not audio.content_type or not audio.content_type.startswith('audio/'):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file."
        )

    try:
        audio_bytes = await audio.read()

        result = process_audio_with_gemini(audio_bytes=audio_bytes)

        return result

    except Exception as e:
        # Catches exceptions from file reading or the Gemini service
        raise HTTPException(status_code=500, detail=f"Audio processing with Gemini failed: {str(e)}")

@app.post("/audio/diarization")
async def audio_diarization(audio: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process audio file for speaker diarization using Google Gemini 2.5 Pro.

    This endpoint accepts audio files and returns speaker diarization results,
    identifying different speakers and their spoken text segments throughout
    the conversation.
    """
    # Validate file type - accept common audio formats
    if not audio.content_type or not audio.content_type.startswith('audio/'):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: '{audio.content_type}'. Please upload a valid audio file (WAV, MP3, MP4, M4A)."
        )

    # Additional validation for specific audio formats that work well with diarization
    supported_types = [
        'audio/wav', 'audio/wave', 'audio/x-wav',
        'audio/mpeg', 'audio/mp3',
        'audio/mp4', 'audio/m4a', 'audio/x-m4a'
    ]

    if audio.content_type not in supported_types:
        # Still allow processing but warn about potential issues
        pass  # Gemini is quite robust with audio formats

    try:
        # Read the uploaded audio file content
        audio_bytes = await audio.read()

        # Validate file size (optional - adjust based on your needs)
        max_size_mb = 100  # 100MB limit
        if len(audio_bytes) > max_size_mb * 1024 * 1024:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size allowed is {max_size_mb}MB."
            )

        # Validate minimum file size to ensure it's not empty
        if len(audio_bytes) < 1000:  # Less than 1KB
            raise HTTPException(
                status_code=400,
                detail="File appears to be empty or too small to process."
            )

        # Process the audio file for speaker diarization
        result = process_audio_diarization(
            audio_bytes=audio_bytes,
            filename=audio.filename  # type: ignore
        )

        return result

    except AudioDiarizationError as e:
        # Handle specific diarization errors with appropriate HTTP status
        if "API key" in str(e).lower():
            raise HTTPException(
                status_code=500,
                detail="Audio diarization service configuration error. Please contact support."
            )
        elif "format" in str(e).lower():
            raise HTTPException(
                status_code=400,
                detail=f"Audio format error: {str(e)}"
            )
        else:
            raise HTTPException(
                status_code=500,
                detail=f"Audio diarization failed: {str(e)}"
            )

    except HTTPException:
        # Re-raise HTTP exceptions as-is
        raise

    except Exception as e:
        # Catch any unexpected errors
        raise HTTPException(
            status_code=500,
            detail=f"Unexpected error during audio diarization: {str(e)}"
        )

@app.post("/image/process-document")
async def process_document(document: UploadFile = File(...)):
    """
    Process uploaded document (image or PDF) and extract information [Model: Gemini 1.5 Flash]
    """
    try:
        # Read file content
        file_bytes = await document.read()

        # Handle different file types
        if document.content_type.startswith('image/'):  # type: ignore
            # Process image directly
            image_bytes = file_bytes
        elif document.content_type == 'application/pdf':
            # Convert PDF to image first
            image_bytes = process_pdf_to_image(file_bytes)
        else:
            raise HTTPException(
                status_code=400,
                detail="Unsupported file type. Please upload an image (JPG, PNG, etc.) or PDF file."
            )

        # Process the document
        result = process_document_image(image_bytes, document.filename)

        return result

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

@app.post("/text", response_model=HelloWorldResponse)
async def text_insights(request: TextRequest) -> HelloWorldResponse:
    """
    Simple text to insights endpoint
    """
    try:
        # Basic validation
        if not request.text.strip():
            raise HTTPException(
                status_code=400,
                detail="Text cannot be empty or contain only whitespace."
            )

        response = HelloWorldResponse(
            message="Hello World! Text processing completed successfully.",
            received_text=request.text,
            status="success"
        )

        return response

    except HTTPException:
        raise

    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Unexpected error processing text: {str(e)}"
        )

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "service": "document-processor"}

if __name__ == "__main__":
    uvicorn.run(
        "api:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        reload_dirs=["."]
    )

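For the upload endpoints above, the multipart field names have to match the parameter names in the route signatures (`audio` for the audio routes, `document` for the document route). A client sketch with `requests`, using placeholder file paths `call.wav` and `form.pdf` and assuming a local server on port 8000:

```python
# Sketch of multipart uploads against the routes defined in api.py.
# "call.wav" and "form.pdf" are placeholder paths; the server is assumed to run locally.
import requests

BASE_URL = "http://localhost:8000"

# Speaker diarization: field name "audio" matches the UploadFile parameter
with open("call.wav", "rb") as f:
    diarization = requests.post(
        f"{BASE_URL}/audio/diarization",
        files={"audio": ("call.wav", f, "audio/wav")},
    ).json()
print(diarization.get("statistics"))

# Document extraction: field name "document" matches the UploadFile parameter
with open("form.pdf", "rb") as f:
    extraction = requests.post(
        f"{BASE_URL}/image/process-document",
        files={"document": ("form.pdf", f, "application/pdf")},
    ).json()
print(extraction)
```
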
requirements.txt
ADDED
@@ -0,0 +1,26 @@
# Python version: 3.11
fastapi
uvicorn

python-multipart
Pillow
torch
transformers
nltk
python-dotenv
PyPDF2
pdf2image
openai

google
google-genai
google-api-core
pprintpp
pydub
ffmpeg-python
requests
google-cloud-aiplatform
librosa
soundfile
openai-whisper
pydantic

services/__pycache__/audio_diarization.cpython-311.pyc
ADDED
Binary file (12.6 kB)

services/__pycache__/audio_gemini.cpython-311.pyc
ADDED
Binary file (3.55 kB)

services/__pycache__/audio_whisper.cpython-311.pyc
ADDED
Binary file (3.92 kB)

services/__pycache__/image_ocr_processor.cpython-311.pyc
ADDED
Binary file (25.6 kB)

services/__pycache__/ocr_processor.cpython-311.pyc
ADDED
Binary file (25.6 kB)

services/audio_diarization.py
ADDED
@@ -0,0 +1,323 @@
import json
import base64
import os
from typing import Dict, List, Any, Tuple
from pydub import AudioSegment
import io

from google import genai
from google.genai import types


class AudioDiarizationError(Exception):
    """Custom exception for audio diarization errors"""
    pass


def get_gemini_client() -> genai.Client:
    """
    Initialize and return a Google Gemini API client.

    Returns:
        genai.Client: Authenticated Gemini client

    Raises:
        AudioDiarizationError: If API key is not found or client initialization fails
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise AudioDiarizationError("GEMINI_API_KEY environment variable not found")

    try:
        client = genai.Client(api_key=api_key)
        return client
    except Exception as e:
        raise AudioDiarizationError(f"Failed to initialize Gemini client: {str(e)}")


def get_audio_duration(audio_bytes: bytes) -> float:
    """
    Get the duration of audio in seconds.

    Args:
        audio_bytes: Raw audio file bytes

    Returns:
        float: Duration in seconds

    Raises:
        AudioDiarizationError: If audio processing fails
    """
    try:
        # Create AudioSegment from bytes
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
        duration_sec = len(audio) / 1000.0
        return duration_sec
    except Exception as e:
        raise AudioDiarizationError(f"Failed to process audio duration: {str(e)}")


def detect_audio_format(audio_bytes: bytes) -> str:
    """
    Detect audio format from bytes.

    Args:
        audio_bytes: Raw audio file bytes

    Returns:
        str: Audio format (e.g., 'wav', 'mp3', 'mp4')

    Raises:
        AudioDiarizationError: If format detection fails
    """
    try:
        # Try to create AudioSegment to detect format
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))

        # Check file signature/magic bytes for common formats
        if audio_bytes.startswith(b'RIFF') and b'WAVE' in audio_bytes[:12]:
            return 'wav'
        elif audio_bytes.startswith(b'ID3') or audio_bytes.startswith(b'\xff\xfb'):
            return 'mp3'
        elif audio_bytes.startswith(b'\x00\x00\x00\x20ftypM4A'):
            return 'm4a'
        elif audio_bytes.startswith(b'\x00\x00\x00\x18ftyp') or audio_bytes.startswith(b'\x00\x00\x00\x20ftyp'):
            return 'mp4'
        else:
            # Default to wav if we can't detect
            return 'wav'
    except Exception as e:
        raise AudioDiarizationError(f"Failed to detect audio format: {str(e)}")


def create_diarization_request(audio_bytes: bytes, audio_format: str, model: str = "gemini-2.5-pro") -> Dict[str, Any]:
    """
    Create a diarization request for the Gemini API.

    Args:
        audio_bytes: Raw audio file bytes
        audio_format: Audio file format (e.g., 'wav', 'mp3')
        model: Gemini model to use

    Returns:
        Dict containing the API request configuration

    Raises:
        AudioDiarizationError: If request creation fails
    """
    try:
        # Encode audio to base64
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

        # Prepare request parts
        audio_part = {
            "inlineData": {
                "mimeType": f"audio/{audio_format}",
                "data": audio_b64
            }
        }

        text_part = {
            "text": (
                "You are a speaker-diarization engine. "
                "For the audio input, return a JSON object with a top-level `segments` array. "
                "Each segment must have: `speaker` (string) and `text` (transcript)."
            )
        }

        # Define JSON schema for structured response
        schema = {
            "type": "object",
            "properties": {
                "segments": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "speaker": {"type": "string"},
                            "text": {"type": "string"}
                        },
                        "required": ["speaker", "text"]
                    }
                }
            },
            "required": ["segments"]
        }

        # Build configuration for JSON mode
        config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=schema
        )

        # Build complete request
        request_kwargs = {
            "model": model,
            "contents": [audio_part, text_part],
            "config": config
        }

        return request_kwargs

    except Exception as e:
        raise AudioDiarizationError(f"Failed to create diarization request: {str(e)}")


def parse_diarization_response(response_text: str) -> Tuple[List[Dict[str, str]], Dict[str, Any]]:
    """
    Parse the Gemini API response for diarization results.

    Args:
        response_text: Raw JSON response text from Gemini

    Returns:
        Tuple of (segments_list, raw_json_dict)

    Raises:
        AudioDiarizationError: If JSON parsing fails
    """
    try:
        raw_json = json.loads(response_text)
        segments = raw_json.get("segments", [])

        # Validate segments structure
        if not isinstance(segments, list):
            raise AudioDiarizationError("Response segments must be a list")

        for i, segment in enumerate(segments):
            if not isinstance(segment, dict):
                raise AudioDiarizationError(f"Segment {i} must be a dictionary")
            if "speaker" not in segment or "text" not in segment:
                raise AudioDiarizationError(f"Segment {i} missing required fields 'speaker' or 'text'")

        return segments, raw_json

    except json.JSONDecodeError as e:
        raise AudioDiarizationError(f"Failed to parse JSON from Gemini response: {str(e)}")
    except Exception as e:
        raise AudioDiarizationError(f"Failed to process diarization response: {str(e)}")


def calculate_diarization_stats(segments: List[Dict[str, str]], duration_sec: float) -> Dict[str, Any]:
    """
    Calculate statistics from diarization results.

    Args:
        segments: List of speaker segments
        duration_sec: Audio duration in seconds

    Returns:
        Dict containing diarization statistics
    """
    total_turns = len(segments)
    speakers = set(segment["speaker"] for segment in segments)
    num_speakers = len(speakers)

    # Format duration as MM:SS
    duration_str = f"{int(duration_sec//60):02d}:{int(duration_sec%60):02d}"

    return {
        "total_turns": total_turns,
        "num_speakers": num_speakers,
        "duration_seconds": duration_sec,
        "duration_formatted": duration_str,
        "speakers": sorted(list(speakers))
    }


def process_audio_diarization(audio_bytes: bytes, filename: str = None) -> Dict[str, Any]:  # type: ignore
    """
    Process audio file for speaker diarization using Gemini 2.5 Pro.

    This function takes raw audio bytes and returns a structured JSON response
    containing speaker diarization results with segments, statistics, and metadata.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Optional filename for metadata

    Returns:
        Dict containing:
            - segments: List of speaker segments with speaker and text
            - statistics: Diarization statistics (speakers, turns, duration)
            - metadata: Processing metadata
            - raw_response: Original Gemini response

    Raises:
        AudioDiarizationError: If any step of the diarization process fails
    """
    try:
        # Initialize Gemini client
        client = get_gemini_client()

        # Get audio duration and format
        duration_sec = get_audio_duration(audio_bytes)
        audio_format = detect_audio_format(audio_bytes)

        # Create API request
        request_kwargs = create_diarization_request(audio_bytes, audio_format)

        # Make API call to Gemini
        try:
            response = client.models.generate_content(**request_kwargs)
            response_text = response.text
        except Exception as e:
            raise AudioDiarizationError(f"Gemini API call failed: {str(e)}")

        # Parse response
        segments, raw_json = parse_diarization_response(response_text)  # type: ignore

        # Calculate statistics
        stats = calculate_diarization_stats(segments, duration_sec)

        # Build final response
        result = {
            "segments": segments,
            "statistics": stats,
            "metadata": {
                "filename": filename,
                "audio_format": audio_format,
                "model_used": "gemini-2.5-pro",
                "processing_status": "success"
            },
            "raw_response": raw_json
        }

        return result

    except AudioDiarizationError:
        # Re-raise our custom errors
        raise
    except Exception as e:
        # Catch any unexpected errors
        raise AudioDiarizationError(f"Unexpected error during audio diarization: {str(e)}")


# Example usage and testing function
def test_diarization_service():
    """
    Test function for the diarization service.
    This is mainly for development and debugging purposes.
    """
    try:
        # This would require an actual audio file to test
        print("Audio diarization service loaded successfully")
        print("Available functions:")
        print("- process_audio_diarization(audio_bytes, filename)")
        print("- get_gemini_client()")
        print("- get_audio_duration(audio_bytes)")
        print("- detect_audio_format(audio_bytes)")

        # Check if API key is available
        api_key = os.getenv("GEMINI_API_KEY")
        if api_key:
            print("✓ GEMINI_API_KEY found in environment")
        else:
            print("✗ GEMINI_API_KEY not found in environment")

    except Exception as e:
        print(f"Service test failed: {e}")


if __name__ == "__main__":
    test_diarization_service()

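The service can also be driven directly from a script, outside FastAPI. A minimal sketch, assuming `GEMINI_API_KEY` is set in the environment and `meeting.wav` is a placeholder path to a real audio file:

```python
# Direct-use sketch for the diarization service.
# Requires GEMINI_API_KEY in the environment; "meeting.wav" is a placeholder path.
from services.audio_diarization import AudioDiarizationError, process_audio_diarization

with open("meeting.wav", "rb") as f:
    audio_bytes = f.read()

try:
    result = process_audio_diarization(audio_bytes, filename="meeting.wav")
except AudioDiarizationError as e:
    print(f"Diarization failed: {e}")
else:
    # Keys mirror the dict built at the end of process_audio_diarization
    print(result["statistics"])
    for segment in result["segments"]:
        print(f'{segment["speaker"]}: {segment["text"]}')
```
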
services/audio_gemini.py
ADDED
@@ -0,0 +1,91 @@
import os
from typing import Dict

import google.genai as genai
from dotenv import load_dotenv
from google.genai.types import Part

# Load environment variables from a .env file in the root directory
load_dotenv()

# --- Configuration ---
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_ID = "gemini-2.5-pro"

# --- Client Initialization ---
if not GEMINI_API_KEY:
    raise ValueError(
        "GEMINI_API_KEY not found in environment variables. "
        "Please create a .env file in the project root and set the key."
    )

# Configure the genai client with the API key
client = genai.Client(api_key=GEMINI_API_KEY)

def _transcribe_audio(audio_bytes: bytes) -> str:
    """
    Sends base64-encoded WAV audio to the model and returns the transcription as plain text.
    """
    audio_part = Part.from_bytes(data=audio_bytes, mime_type="audio/wav")
    text_part = (
        "You are a world-class transcription engine. "
        "Transcribe the following audio to plain text only, with no extra formatting:\n\n"
        "(Begin audio input)"
    )

    resp = client.models.generate_content(
        model=MODEL_ID,
        contents=[audio_part, text_part]
    )
    return resp.text.strip()  # type: ignore


def _translate_to_english(text: str) -> str:
    """
    Detects the language of the input and translates it into English.
    """
    prompt = (
        "You are a world-class translation engine. "
        "Detect the language of the following text and translate it into English. "
        "Return ONLY the translated English text with no extra commentary:\n\n"
        f"{text}"
    )
    resp = client.models.generate_content(
        model=MODEL_ID,
        contents=prompt
    )
    return resp.text.strip()  # type: ignore


def process_audio_with_gemini(audio_bytes: bytes) -> Dict[str, str]:
    """
    Processes an audio file by first transcribing it and then translating the
    resulting text to English using the Gemini model.

    This function orchestrates the transcription and translation calls.

    Args:
        audio_bytes: The byte content of the audio file.

    Returns:
        A dictionary containing the 'transcription' and 'translation'.

    Raises:
        Exception: If there is an error during the API calls to the Gemini model.
    """
    try:
        # Step 1: Transcribe the audio using the internal helper function
        transcription = _transcribe_audio(audio_bytes)

        # Step 2: Translate the transcription to English if it's not empty
        translation = ""
        if transcription:
            translation = _translate_to_english(transcription)

        return {"transcription": transcription, "translation": translation}
    except Exception as e:
        # Re-raise the exception with more context to be caught by the API endpoint
        raise Exception(f"Error processing audio with Gemini: {str(e)}")

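A short usage sketch for the two-step pipeline (transcription, then translation); it assumes a valid `GEMINI_API_KEY` is available when the module is imported and uses a placeholder path `recording.wav`:

```python
# Usage sketch; importing the module raises ValueError if GEMINI_API_KEY is missing.
# "recording.wav" is a placeholder path.
from services.audio_gemini import process_audio_with_gemini

with open("recording.wav", "rb") as f:
    result = process_audio_with_gemini(audio_bytes=f.read())

print("Transcription:", result["transcription"])
print("Translation:", result["translation"])
```
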
services/audio_whisper.py
ADDED
@@ -0,0 +1,81 @@
import whisper
import torch
import tempfile
import os
from typing import Dict

# Determine the most efficient device available (CUDA if possible, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper model once when the module is imported.
# This is a time and resource-intensive operation, so it should not be done on every API call.
try:
    print(f"Loading Whisper model 'large' onto device '{DEVICE}'...")
    model = whisper.load_model("large", device=DEVICE)
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Fatal: Error loading Whisper model: {e}")
    model = None

def process_audio_with_whisper(audio_bytes: bytes) -> Dict[str, str]:
    """
    Transcribes and translates a given audio file's bytes using the Whisper model.

    This function saves the audio bytes to a temporary file and passes the file
    path to Whisper for processing. This is a robust way to handle file access
    and prevent permission errors with ffmpeg, especially on Windows.

    Args:
        audio_bytes: The raw bytes of the audio file (e.g., WAV, MP3).

    Returns:
        A dictionary containing the Tagalog transcription and English translation.
        Example: {"transcription": "...", "translation": "..."}

    Raises:
        ValueError: If the Whisper model was not loaded successfully.
        Exception: If audio processing or model inference fails.
    """
    if model is None:
        raise ValueError("Whisper model is not available or failed to load.")

    # Create a temporary file to store the audio.
    # Using delete=False is crucial on Windows to allow other processes to open the file by its path.
    # We will manually delete the file in the 'finally' block.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".tmp") as temp_audio_file:
            temp_path = temp_audio_file.name
            # Write the uploaded audio bytes to the temporary file
            temp_audio_file.write(audio_bytes)
            # The file is automatically closed when exiting the 'with' block
    except Exception as e:
        print(f"Error creating temporary file: {e}")
        raise

    try:
        # Perform transcription using the file path
        transcription_result = model.transcribe(
            temp_path,
            language="tl",
            task="transcribe"
        )

        # Perform translation using the same file path
        translation_result = model.transcribe(
            temp_path,
            language="tl",
            task="translate"
        )

        return {
            "transcription": transcription_result.get('text', '').strip(),  # type: ignore
            "translation": translation_result.get('text', '').strip()  # type: ignore
        }
    except Exception as e:
        # Log and re-raise any exceptions to be handled by the FastAPI endpoint
        print(f"An error occurred during Whisper processing: {e}")
        raise
    finally:
        # Ensure the temporary file is deleted after processing
        if 'temp_path' in locals() and os.path.exists(temp_path):
            os.remove(temp_path)

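Note that the 'large' Whisper model is loaded once at import time, so the first import is slow and memory-heavy. A usage sketch with a placeholder path `interview.mp3`:

```python
# Usage sketch; importing services.audio_whisper triggers the one-time Whisper model load.
# "interview.mp3" is a placeholder path.
from services.audio_whisper import process_audio_with_whisper

with open("interview.mp3", "rb") as f:
    result = process_audio_with_whisper(f.read())

print("Tagalog transcription:", result["transcription"])
print("English translation:", result["translation"])
```
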
services/image_ocr_processor.py
ADDED
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import re
|
3 |
+
import math
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import torch
|
7 |
+
import nltk
|
8 |
+
from nltk.corpus import words as nltk_words
|
9 |
+
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
10 |
+
from PIL import Image
|
11 |
+
from google import genai
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
from difflib import SequenceMatcher
|
14 |
+
from io import BytesIO
|
15 |
+
from pdf2image import convert_from_bytes
|
16 |
+
|
17 |
+
from google.genai.types import (
|
18 |
+
Part
|
19 |
+
)
|
20 |
+
|
21 |
+
# Load environment variables
|
22 |
+
load_dotenv()
|
23 |
+
|
24 |
+
# Download NLTK data
|
25 |
+
try:
|
26 |
+
nltk.data.find('corpora/words')
|
27 |
+
except LookupError:
|
28 |
+
nltk.download("words", quiet=True)
|
29 |
+
|
30 |
+
# ─────────────────────────────────────────────────────────────
|
31 |
+
# 0) Process PDF to Image
|
32 |
+
# ─────────────────────────────────────────────────────────────
|
33 |
+
def process_pdf_to_image(pdf_bytes):
|
34 |
+
"""
|
35 |
+
Convert PDF to image for processing
|
36 |
+
"""
|
37 |
+
try:
|
38 |
+
# Convert PDF to images (first page only)
|
39 |
+
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
|
40 |
+
if not images:
|
41 |
+
raise Exception("Could not convert PDF to image")
|
42 |
+
|
43 |
+
# Convert PIL image to bytes
|
44 |
+
img_byte_arr = io.BytesIO()
|
45 |
+
images[0].save(img_byte_arr, format='PNG')
|
46 |
+
img_byte_arr = img_byte_arr.getvalue()
|
47 |
+
|
48 |
+
return img_byte_arr
|
49 |
+
except Exception as e:
|
50 |
+
raise Exception(f"PDF processing failed: {str(e)}")
|
51 |
+
|
52 |
+
# ─────────────────────────────────────────────────────────────
|
53 |
+
# 1) API key and document prompts
|
54 |
+
# ─────────────────────────────────────────────────────────────
|
55 |
+
|
56 |
+
# Load environment variables and configure Gemini API
|
57 |
+
API_KEY = os.getenv("GEMINI_API_KEY")
|
58 |
+
if not API_KEY:
|
59 |
+
raise RuntimeError("GEMINI_API_KEY not set in environment. Please create a .env file with your API key.")
|
60 |
+
|
61 |
+
# Configure the SDK
|
62 |
+
client = genai.Client(vertexai=False, api_key=API_KEY)
|
63 |
+
|
64 |
+
# Initialize your model
|
65 |
+
MODEL_ID = "gemini-1.5-flash"
|
66 |
+
|
67 |
+
# ─────────────────────────────────────────────────────────────
|
68 |
+
# 2) Ground Truth Data
|
69 |
+
# ─────────────────────────────────────────────────────────────
|
70 |
+
|
71 |
+
GROUND_TRUTHS = {
|
72 |
+
"CIF-Good.png": '''{
|
73 |
+
"document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
|
74 |
+
"bank_name": "BPI",
|
75 |
+
"personal_information": {
|
76 |
+
"rm_no": null,
|
77 |
+
"last_name": "Garnet",
|
78 |
+
"first_name": "Lawrence",
|
79 |
+
"middle_name": "Dela Cruz",
|
80 |
+
"suffix": "III",
|
81 |
+
"date_of_birth": "10/21/1962",
|
82 |
+
"place_of_birth": "Rizal, Philippines",
|
83 |
+
"citizenship": null,
|
84 |
+
"sex": "Male",
|
85 |
+
"marital_status": "Married",
|
86 |
+
"mother_s_full_maiden_name": "Rosa H. Dela Cruz",
|
87 |
+
"spouse_name": "Marion V. Garnet",
|
88 |
+
"tin_number": null,
|
89 |
+
"sss_number": null,
|
90 |
+
"spouse_birthdate": "8/10/1965",
|
91 |
+
"id_presented": {
|
92 |
+
"id_type": "Drivers",
|
93 |
+
"id_number": "2961781134"
|
94 |
+
},
|
95 |
+
"no_of_children": 2,
|
96 |
+
"highest_educational_attainment": "College Graduate"
|
97 |
+
},
|
98 |
+
"contact_information": {
|
99 |
+
"mobile_no": "+63 917 926 9175",
|
100 |
+
"landline_no": null,
|
101 |
+
"email_address": "[email protected]",
|
102 |
+
"home_address": "Amorsolo St. Brgy. Aguinaldo",
|
103 |
+
"country": "Philippines",
|
104 |
+
"zip_code": "1366",
|
105 |
+
"district_town": null,
|
106 |
+
"city_municipality_provice": "Rizal",
|
107 |
+
"residence_since_mm_dd_yyyy": null,
|
108 |
+
"home_ownership": "Owned"
|
109 |
+
},
|
110 |
+
"financial_information": {
|
111 |
+
"profession_business_name": "Name",
|
112 |
+
"date_hired": "01/10/2012",
|
113 |
+
"employer_business_address": "[email protected]",
|
114 |
+
"position_rank": "Assistant VP",
|
115 |
+
"nature_of_business_self_employment": "Sales",
|
116 |
+
"source_of_income_wealth": {
|
117 |
+
"monthly_income": 110000
|
118 |
+
}
|
119 |
+
},
|
120 |
+
"fatca_declaration": {
|
121 |
+
"i_am_not_a_us_person": true,
|
122 |
+
"i_am_a_us_person": false,
|
123 |
+
"us_person_details": {
|
124 |
+
"us_citizen": false,
|
125 |
+
"us_resident_green_card": false,
|
126 |
+
"us_tin": false,
|
127 |
+
"us_id": false,
|
128 |
+
"w9_submitted": false,
|
129 |
+
"us_place_of_birth_1": null,
|
130 |
+
"us_place_of_birth_2": null,
|
131 |
+
"required_documents_submitted": {
|
132 |
+
"w8_ben": null,
|
133 |
+
"certificate_of_loss_of_us_nationality": null,
|
134 |
+
"written_explanation_not_having_certificate_despite_renunciation": null,
|
135 |
+
"written_explanation_why_us_citizenship_not_obtained_at_birth": null
|
136 |
+
}
|
137 |
+
}
|
138 |
+
},
|
139 |
+
"certification_and_authorization": {
|
140 |
+
"customer_signature": null,
|
141 |
+
"date": "02/03/25"
|
142 |
+
},
|
143 |
+
"for_bank_use_only": {
|
144 |
+
"remarks": null,
|
145 |
+
"processed_and_signature_verified_by": "Simon Eulalia",
|
146 |
+
"approved_by": "Ray Hernandez"
|
147 |
+
},
|
148 |
+
"form_no": "BPI-CISS IND-02222022"
|
149 |
+
}''',
|
150 |
+
|
151 |
+
"CIF-bad.jpg": '''{
|
152 |
+
"document_type": "CUSTOMER INFORMATION SHEET (INDIVIDUAL)",
|
153 |
+
"bank_name": "BPI",
|
154 |
+
"personal_information": {
|
155 |
+
"rm_no": null,
|
156 |
+
"last_name": "Garnet",
|
157 |
+
"first_name": "Lawrence",
|
158 |
+
"middle_name": "Dela Cruz",
|
159 |
+
"suffix": "III",
|
160 |
+
"date_of_birth": "10/21/1962",
|
161 |
+
"place_of_birth": "Rizal, Philippines",
|
162 |
+
"citizenship": null,
|
163 |
+
"sex": "Male",
|
164 |
+
"marital_status": "Married",
|
165 |
+
"mother_s_full_maiden_name": "Rosa H. Dela Cruz",
|
166 |
+
"spouse_name": "Marion V. Garnet",
|
167 |
+
"tin_number": null,
|
168 |
+
"sss_number": null,
|
169 |
+
"spouse_birthdate": "8/10/1965",
|
170 |
+
"id_presented": {
|
171 |
+
"id_type": "Drivers",
|
172 |
+
"id_number": "2961781134"
|
173 |
+
},
|
174 |
+
"no_of_children": 2,
|
175 |
+
"highest_educational_attainment": "College Graduate"
|
176 |
+
},
|
177 |
+
"contact_information": {
|
178 |
+
"mobile_no": "+63 917 926 9175",
|
179 |
+
"landline_no": null,
|
180 |
+
"email_address": "[email protected]",
|
181 |
+
"home_address": "Amorsolo St. Brgy. Aguinaldo",
|
182 |
+
"country": "Philippines",
|
183 |
+
"zip_code": "1366",
|
184 |
+
"district_town": null,
|
185 |
+
"city_municipality_provice": "Rizal",
|
186 |
+
"residence_since_mm_dd_yyyy": null,
|
187 |
+
"home_ownership": "Owned"
|
188 |
+
},
|
189 |
+
"financial_information": {
|
190 |
+
"profession_business_name": "Name",
|
191 |
+
"date_hired": "01/10/2012",
|
192 |
+
"employer_business_address": "[email protected]",
|
193 |
+
"position_rank": "Assistant VP",
|
194 |
+
"nature_of_business_self_employment": "Sales",
|
195 |
+
"source_of_income_wealth": {
|
196 |
+
"monthly_income": 110000
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"fatca_declaration": {
|
200 |
+
"i_am_not_a_us_person": true,
|
201 |
+
"i_am_a_us_person": false,
|
202 |
+
"us_person_details": {
|
203 |
+
"us_citizen": false,
|
204 |
+
"us_resident_green_card": false,
|
205 |
+
"us_tin": false,
|
206 |
+
"us_id": false,
|
207 |
+
"w9_submitted": false,
|
208 |
+
"us_place_of_birth_1": null,
|
209 |
+
"us_place_of_birth_2": null,
|
210 |
+
"required_documents_submitted": {
|
211 |
+
"w8_ben": null,
|
212 |
+
"certificate_of_loss_of_us_nationality": null,
|
213 |
+
"written_explanation_not_having_certificate_despite_renunciation": null,
|
214 |
+
"written_explanation_why_us_citizenship_not_obtained_at_birth": null
|
215 |
+
}
|
216 |
+
}
|
217 |
+
},
|
218 |
+
"certification_and_authorization": {
|
219 |
+
"customer_signature": null,
|
220 |
+
"date": "02/03/25"
|
221 |
+
},
|
222 |
+
"for_bank_use_only": {
|
223 |
+
"remarks": null,
|
224 |
+
"processed_and_signature_verified_by": "Simon Eulalia",
|
225 |
+
"approved_by": "Ray Hernandez"
|
226 |
+
},
|
227 |
+
"form_no": "BPI-CISS IND-02222022"
|
228 |
+
}''',
|
229 |
+
|
230 |
+
"DF-Good.jpg": '''{
|
231 |
+
"document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
|
232 |
+
"copy_type": "BANK'S_COPY",
|
233 |
+
"bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
|
234 |
+
"transaction_details": {
|
235 |
+
"date": "03/29/14",
|
236 |
+
"transaction_type": {
|
237 |
+
"deposit": true,
|
238 |
+
"payment": false,
|
239 |
+
"bills_purchase": false
|
240 |
+
},
|
241 |
+
"account_type": {
|
242 |
+
"savings": true,
|
243 |
+
"current": false
|
244 |
+
},
|
245 |
+
"currency": {
|
246 |
+
"peso": false,
|
247 |
+
"us_dollar": true,
|
248 |
+
"others": false
|
249 |
+
}
|
250 |
+
},
|
251 |
+
"account_details": {
|
252 |
+
"account_number": "05039947290",
|
253 |
+
"account_name_merchant_name": "Amaia Skies"
|
254 |
+
},
|
255 |
+
"deposit_payment_breakdown": {
|
256 |
+
"cash_amount": null,
|
257 |
+
"checks": [{
|
258 |
+
"amount": 1000000.0,
|
259 |
+
"bank": null,
|
260 |
+
"date": null,
|
261 |
+
"details": null
|
262 |
+
}],
|
263 |
+
"total_deposits_payment": null
|
264 |
+
},
|
265 |
+
"teller_validation_bank_copy": null,
|
266 |
+
"for_bills_purchase_accommodation": {
|
267 |
+
"representative_full_name": "Amie Skies",
|
268 |
+
"contact_number": "0917 872 0056",
|
269 |
+
"signature_over_printed_name": "present",
|
270 |
+
"form_no": "BPI-BPDEP MAN-01222020"
|
271 |
+
},
|
272 |
+
"client_s_copy_teller_validation": null
|
273 |
+
}''',
|
274 |
+
|
275 |
+
"DF-bad.jpeg": '''{
|
276 |
+
"document_type": "DEPOSIT / PAYMENT / BILLS PURCHASE FORM FRONT",
|
277 |
+
"copy_type": "BANK'S_COPY",
|
278 |
+
"bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
|
279 |
+
"transaction_details": {
|
280 |
+
"date": "03/29/14",
|
281 |
+
"transaction_type": {
|
282 |
+
"deposit": true,
|
283 |
+
"payment": false,
|
284 |
+
"bills_purchase": false
|
285 |
+
},
|
286 |
+
"account_type": {
|
287 |
+
"savings": true,
|
288 |
+
"current": false
|
289 |
+
},
|
290 |
+
"currency": {
|
291 |
+
"peso": false,
|
292 |
+
"us_dollar": true,
|
293 |
+
"others": false
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"account_details": {
|
297 |
+
"account_number": "05039947290",
|
298 |
+
"account_name_merchant_name": "Amaia Skies"
|
299 |
+
},
|
300 |
+
"deposit_payment_breakdown": {
|
301 |
+
"cash_amount": null,
|
302 |
+
"checks": [{
|
303 |
+
"amount": 1000000.0,
|
304 |
+
"bank": null,
|
305 |
+
"date": null,
|
306 |
+
"details": null
|
307 |
+
}],
|
308 |
+
"total_deposits_payment": null
|
309 |
+
},
|
310 |
+
"teller_validation_bank_copy": null,
|
311 |
+
"for_bills_purchase_accommodation": {
|
312 |
+
"representative_full_name": "Amie Skies",
|
313 |
+
"contact_number": "0917 872 0056",
|
314 |
+
"signature_over_printed_name": "present",
|
315 |
+
"form_no": "BPI-BPDEP MAN-01222020"
|
316 |
+
},
|
317 |
+
"client_s_copy_teller_validation": null
|
318 |
+
}''',
|
319 |
+
|
320 |
+
"DB-Good.jpg": '''{
|
321 |
+
"document_type": "DEPOSIT / PAYMENT SLIP BACK",
|
322 |
+
"bank_name": "BANK OF THE PHILIPPINE ISLANDS",
|
323 |
+
"sections": {
|
324 |
+
"check_details_top": {
|
325 |
+
"checks": [{
|
326 |
+
"name_of_bank_branch": "Olanggapo",
|
327 |
+
"check_no": "0543729",
|
328 |
+
"amount": 100000.0
|
329 |
+
}],
|
330 |
+
"total_checks": null,
|
331 |
+
"total_cash": null,
|
332 |
+
"total_deposits_payment": null
|
333 |
+
},
|
334 |
+
"deposit_cash_breakdown": {
|
335 |
+
"items": [
|
336 |
+
{"no_of_pieces": 100, "denominations": 100, "amount": 1000},
|
337 |
+
{"no_of_pieces": 200, "denominations": 200, "amount": 200},
|
338 |
+
{"no_of_pieces": 300, "denominations": 300, "amount": 1500},
|
339 |
+
{"no_of_pieces": 500, "denominations": 400, "amount": 1250},
|
340 |
+
{"no_of_pieces": 600, "denominations": 600, "amount": 1750},
|
341 |
+
{"no_of_pieces": 700, "denominations": 700, "amount": 6350},
|
342 |
+
{"no_of_pieces": 800, "denominations": 800, "amount": 8750}
|
343 |
+
],
|
344 |
+
"total": 10000750000
|
345 |
+
},
|
346 |
+
"representative_information": {
|
347 |
+
"full_name": "Anna Banana Cruz",
|
348 |
+
"contact_number": "09178123775",
|
349 |
+
"address": "11, Tower 2, City Residences, Manila",
|
350 |
+
"citizenship": "Japanese",
|
351 |
+
"date_of_birth": "03/31/2001",
|
352 |
+
"place_of_birth": "Bulacan",
|
353 |
+
"signature": null
|
354 |
+
},
|
355 |
+
"client_copy": {
|
356 |
+
"document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
|
357 |
+
"for_payments_only": {
|
358 |
+
"policy_plan_reference_no": null,
|
359 |
+
"policy_planholder_name": null,
|
360 |
+
"bp_customer_number": "03756245"
|
361 |
+
},
|
362 |
+
"check_details": {
|
363 |
+
"checks": [{
|
364 |
+
"bank_branch_name": "P. Tuazon",
|
365 |
+
"check_no": "0347345",
|
366 |
+
"amount": 100200200
|
367 |
+
}],
|
368 |
+
"total_checks": 800000,
|
369 |
+
"total_cash": 20000,
|
370 |
+
"total_deposits_payment": 820000
|
371 |
+
}
|
372 |
+
}
|
373 |
+
}
|
374 |
+
}''',
|
375 |
+
|
376 |
+
"DB-Bad.jpg": '''{
|
377 |
+
"document_type": "DEPOSIT / PAYMENT SLIP BACK",
|
378 |
+
"bank_name": "BANK OF THE PHILIPPINE ISLANDS",
|
379 |
+
"sections": {
|
380 |
+
"check_details_top": {
|
381 |
+
"checks": [{
|
382 |
+
"name_of_bank_branch": "Olanggapo",
|
383 |
+
"check_no": "0543729",
|
384 |
+
"amount": 100000.0
|
385 |
+
}],
|
386 |
+
"total_checks": null,
|
387 |
+
"total_cash": null,
|
388 |
+
"total_deposits_payment": null
|
389 |
+
},
|
390 |
+
"deposit_cash_breakdown": {
|
391 |
+
"items": [
|
392 |
+
{"no_of_pieces": 100, "denominations": 100, "amount": 1000},
|
393 |
+
{"no_of_pieces": 200, "denominations": 200, "amount": 200},
|
394 |
+
{"no_of_pieces": 300, "denominations": 300, "amount": 1500},
|
395 |
+
{"no_of_pieces": 500, "denominations": 400, "amount": 1250},
|
396 |
+
{"no_of_pieces": 600, "denominations": 600, "amount": 1750},
|
397 |
+
{"no_of_pieces": 700, "denominations": 700, "amount": 6350},
|
398 |
+
{"no_of_pieces": 800, "denominations": 800, "amount": 8750}
|
399 |
+
],
|
400 |
+
"total": 10000750000
|
401 |
+
},
|
402 |
+
"representative_information": {
|
403 |
+
"full_name": "Anna Banana Cruz",
|
404 |
+
"contact_number": "09178123775",
|
405 |
+
"address": "11, Tower 2, City Residences, Manila",
|
406 |
+
"citizenship": "Japanese",
|
407 |
+
"date_of_birth": "03/31/2001",
|
408 |
+
"place_of_birth": "Bulacan",
|
409 |
+
"signature": null
|
410 |
+
},
|
411 |
+
"client_copy": {
|
412 |
+
"document_type": "DEPOSIT / PAYMENT SLIP (CLIENT'S COPY)",
|
413 |
+
"for_payments_only": {
|
414 |
+
"policy_plan_reference_no": null,
|
415 |
+
"policy_planholder_name": null,
|
416 |
+
"bp_customer_number": "03756245"
|
417 |
+
},
|
418 |
+
"check_details": {
|
419 |
+
"checks": [{
|
420 |
+
"bank_branch_name": "P. Tuazon",
|
421 |
+
"check_no": "0347345",
|
422 |
+
"amount": 100200200
|
423 |
+
}],
|
424 |
+
"total_checks": 800000,
|
425 |
+
"total_cash": 20000,
|
426 |
+
"total_deposits_payment": 820000
|
427 |
+
}
|
428 |
+
}
|
429 |
+
}
|
430 |
+
}''',
|
431 |
+
|
432 |
+
"WF-Good.jpg": '''{
|
433 |
+
"document_type": "WITHDRAWAL SLIP",
|
434 |
+
"bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
|
435 |
+
"withdrawal_slip_details": {
|
436 |
+
"currency_type": "US DOLLAR",
|
437 |
+
"account_type": "CURRENT",
|
438 |
+
"account_number": "3456777799",
|
439 |
+
"account_name": "Maxine Yu",
|
440 |
+
"teller_validation": null
|
441 |
+
},
|
442 |
+
"withdrawal_amount": {
|
443 |
+
"amount_in_numbers": "USD 50,000"
|
444 |
+
},
|
445 |
+
"depositor_information": {
|
446 |
+
"signature_of_depositor": "present",
|
447 |
+
"date": null
|
448 |
+
},
|
449 |
+
"withdrawal_through_representative": {
|
450 |
+
"name_in_print": "Mark Garcia",
|
451 |
+
"signature_of_representative": "present",
|
452 |
+
"contact_no": "0918 251 0226",
|
453 |
+
"depositor_authorization_signatures": [
|
454 |
+
{"signature": "present", "date": "05/19/25"},
|
455 |
+
{"signature": "present", "date": "05/19/25"}
|
456 |
+
]
|
457 |
+
},
|
458 |
+
"payment_received_by": {
|
459 |
+
"signature": "present",
|
460 |
+
"name": "Marco Polo"
|
461 |
+
},
|
462 |
+
"bank_use_only": {
|
463 |
+
"remarks": null,
|
464 |
+
"verified_by": null,
|
465 |
+
"approved_by": null
|
466 |
+
},
|
467 |
+
"form_no": "BPI-WDL OTC-01222020"
|
468 |
+
}''',
|
469 |
+
|
470 |
+
"WF-Bad.jpg": '''{
|
471 |
+
"document_type": "WITHDRAWAL SLIP",
|
472 |
+
"bank_name": "BANK_OF_THE_PHILIPPINE_ISLANDS",
|
473 |
+
"withdrawal_slip_details": {
|
474 |
+
"currency_type": "US DOLLAR",
|
475 |
+
"account_type": "CURRENT",
|
476 |
+
"account_number": "3456777799",
|
477 |
+
"account_name": "Maxine Yu",
|
478 |
+
"teller_validation": null
|
479 |
+
},
|
480 |
+
"withdrawal_amount": {
|
481 |
+
"amount_in_numbers": "USD 50,000"
|
482 |
+
},
|
483 |
+
"depositor_information": {
|
484 |
+
"signature_of_depositor": "present",
|
485 |
+
"date": null
|
486 |
+
},
|
487 |
+
"withdrawal_through_representative": {
|
488 |
+
"name_in_print": "Mark Garcia",
|
489 |
+
"signature_of_representative": "present",
|
490 |
+
"contact_no": "0918 251 0226",
|
491 |
+
"depositor_authorization_signatures": [
|
492 |
+
{"signature": "present", "date": "05/19/25"},
|
493 |
+
{"signature": "present", "date": "05/19/25"}
|
494 |
+
]
|
495 |
+
},
|
496 |
+
"payment_received_by": {
|
497 |
+
"signature": "present",
|
498 |
+
"name": "Marco Polo"
|
499 |
+
},
|
500 |
+
"bank_use_only": {
|
501 |
+
"remarks": null,
|
502 |
+
"verified_by": null,
|
503 |
+
"approved_by": null
|
504 |
+
},
|
505 |
+
"form_no": "BPI-WDL OTC-01222020"
|
506 |
+
}''',
|
507 |
+
|
508 |
+
"WB-Good.jpg": '''{
|
509 |
+
"document_type": "WITHDRAWAL SLIP BACK",
|
510 |
+
"denominations_breakdown": {
|
511 |
+
"items": [
|
512 |
+
{"no_of_pieces": 1, "denomination": 100, "amount": 100},
|
513 |
+
{"no_of_pieces": 2, "denomination": 500, "amount": 1000},
|
514 |
+
{"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
|
515 |
+
],
|
516 |
+
"total": null
|
517 |
+
},
|
518 |
+
"representative_information": {
|
519 |
+
"full_name": "Mark Garcia",
|
520 |
+
"contact_number": "0918 251 3372",
|
521 |
+
"address": "1F Tower 1, SMDC, Camarines, Sur",
|
522 |
+
"citizenship": "American",
|
523 |
+
"date_of_birth": "12/15/2001",
|
524 |
+
"place_of_birth": "Bicol",
|
525 |
+
"signature": "present"
|
526 |
+
}
|
527 |
+
}''',
|
528 |
+
|
529 |
+
"WB-bad.jpeg": '''{
|
530 |
+
"document_type": "WITHDRAWAL SLIP BACK",
|
531 |
+
"denominations_breakdown": {
|
532 |
+
"items": [
|
533 |
+
{"no_of_pieces": 1, "denomination": 100, "amount": 100},
|
534 |
+
{"no_of_pieces": 2, "denomination": 500, "amount": 1000},
|
535 |
+
{"no_of_pieces": 3, "denomination": 1000, "amount": 3000}
|
536 |
+
],
|
537 |
+
"total": null
|
538 |
+
},
|
539 |
+
"representative_information": {
|
540 |
+
"full_name": "Mark Garcia",
|
541 |
+
"contact_number": "0918 251 3372",
|
542 |
+
"address": "1F Tower 1, SMDC, Camarines, Sur",
|
543 |
+
"citizenship": "American",
|
544 |
+
"date_of_birth": "12/15/2001",
|
545 |
+
"place_of_birth": "Bicol",
|
546 |
+
"signature": "present"
|
547 |
+
}
|
548 |
+
}'''
|
549 |
+
}
|
550 |
+
|
551 |
+
# ─────────────────────────────────────────────────────────────
# 3) Evaluation + helper functions
# ─────────────────────────────────────────────────────────────

def compute_cer(gt, pred):
    """Compute Character Error Rate."""
    m, n = len(gt), len(pred)
    dp = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m+1):
        dp[i][0] = i
    for j in range(n+1):
        dp[0][j] = j
    for i in range(1,m+1):
        for j in range(1,n+1):
            cost = 0 if gt[i-1]==pred[j-1] else 1
            dp[i][j] = min(dp[i-1][j]+1, dp[i][j-1]+1, dp[i-1][j-1]+cost)
    return dp[m][n]/max(m,1)

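# Illustrative note (not used by the pipeline): compute_cer returns the
# Levenshtein edit distance normalised by the ground-truth length, e.g.
# compute_cer("12345", "12395") -> 0.2 (one substitution over five characters).
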
def extract_flat(o, parent=""):
    """Extract flat key-value pairs from nested JSON."""
    out = []
    if isinstance(o, dict):
        for k,v in o.items():
            key = f"{parent}.{k}" if parent else k
            out += extract_flat(v, key)
    elif isinstance(o, list):
        for i,v in enumerate(o):
            out += extract_flat(v, f"{parent}[{i}]")
    else:
        out.append((parent, str(o)))
    return out

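# Illustrative note: extract_flat flattens nested structures into dotted paths,
# e.g. extract_flat({"a": {"b": 1, "c": [2, 3]}}) ->
# [("a.b", "1"), ("a.c[0]", "2"), ("a.c[1]", "3")].
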
def compute_field_accuracy(gt_json, pred_json):
    """Compute strict field accuracy."""
    try:
        gt = dict(extract_flat(json.loads(gt_json)))
        pr = dict(extract_flat(json.loads(pred_json)))
    except:
        return 0.0
    total = len(gt)
    correct = sum(1 for k,v in gt.items() if pr.get(k)==v)
    return correct / total if total else 0.0

def field_matches(gt, pred, max_err_pct=0.1):
    """Check if fields match with fuzzy matching."""
    gt = re.sub(r'[^\w\s]', '', str(gt).lower().strip())
    pred = re.sub(r'[^\w\s]', '', str(pred).lower().strip())
    if not gt and not pred:
        return True
    return (1 - SequenceMatcher(None, gt, pred).ratio()) <= max_err_pct

def compute_fuzzy_field_accuracy(gt_json, pred_json):
    """Compute fuzzy field accuracy."""
    try:
        gt = dict(extract_flat(json.loads(gt_json)))
        pr = dict(extract_flat(json.loads(pred_json)))
    except:
        return 0.0
    total = len(gt)
    correct = sum(1 for k,v in gt.items() if field_matches(v, pr.get(k, "")))
    return correct / total if total else 0.0

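# Illustrative note: field_matches lower-cases both values, strips punctuation,
# and accepts them when the SequenceMatcher similarity is within the default 10%
# threshold, so a small OCR slip such as "PHILIPINE" for "PHILIPPINE" still
# counts as a match, while a genuinely different value does not.
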
def canonicalize(js):
    """Canonicalize JSON string."""
    return json.dumps(json.loads(js), sort_keys=True, separators=(',', ':'))

def clean_json_string(js):
    """Clean JSON string by removing markdown formatting."""
    return re.sub(r'```(?:json)?\s*|\s*```', '', js.strip(), flags=re.DOTALL)

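# Illustrative note: clean_json_string strips Markdown code fences that the
# model sometimes wraps around its output, e.g.
# clean_json_string('```json\n{"a": 1}\n```') -> '{"a": 1}'.
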
def extract_values_from_jsonlike(text):
    """Extract all string values from JSON-like text."""
    text = re.sub(r'[{}[\]",:]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_spelling_error_rate(text):
    """Compute spelling error rate using NLTK words corpus."""
    words = text.lower().split()
    if not words:
        return 0.0

    english_words = set(nltk_words.words())
    misspelled = sum(1 for word in words if word.isalpha() and word not in english_words)
    return misspelled / len(words)

def compute_perplexity(text):
    """Compute perplexity using GPT-2 model."""
    try:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')

        tokenizer.pad_token = tokenizer.eos_token
        inputs = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)

        with torch.no_grad():
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss

        return math.exp(loss.item())
    except Exception as e:
        print(f"Error computing perplexity: {e}")
        return float('inf')

def compute_refined_metrics(text):
    """Compute refined spelling error rate with common OCR digit/letter confusions mapped back."""
    words = text.lower().split()
    if not words:
        return 0.0

    english_words = set(nltk_words.words())

    errors = 0
    for word in words:
        # Map common OCR confusions (0 -> o, 1 -> l, 5 -> s) before the dictionary
        # check, so tokens like "dep0sit" are checked as "deposit" instead of skipped.
        corrected = word.replace('0', 'o').replace('1', 'l').replace('5', 's')
        if not corrected.isalpha():
            continue

        if word not in english_words and corrected not in english_words:
            errors += 1

    return errors / len(words)

# ─────────────────────────────────────────────────────────────
# 4) Main processing function
# ─────────────────────────────────────────────────────────────

def process_document_image(image_bytes, filename=None):
    """Process a document image and return extracted information and metrics."""
    try:
        # 1) load
        image = Image.open(BytesIO(image_bytes))
        img_format = image.format or "PNG"

        # 2) compress & resize loop → ensure <4 MB
        buf = BytesIO()
        image.save(buf, format=img_format, optimize=True, quality=85)
        while buf.getbuffer().nbytes > 4_000_000:
            w, h = image.size
            image = image.resize((int(w * 0.8), int(h * 0.8)), Image.LANCZOS)  # type: ignore
            buf = BytesIO()
            image.save(buf, format=img_format, optimize=True, quality=85)

        img_bytes = buf.getvalue()  # this is your final image payload

        ocr_prompt = "Extract all visible printed and handwritten text from this scanned bank document image."

        image_part = {
            "inlineData": {
                "mimeType": "image/png",
                "data": img_bytes
            }
        }

        response = client.models.generate_content(
            model=MODEL_ID,
            contents=[
                Part.from_bytes(data=img_bytes, mime_type="image/png"),
                ocr_prompt
            ])

        raw_text = response.text.strip()  # type: ignore
        print("--- Raw OCR Text ---\n", raw_text[:1000], "\n")

        # Extract JSON with Gemini from OCR
        schema_prompt = (
            "You are a JSON extractor for bank forms. Given the OCR text from a scanned image, "
            "output ONLY valid JSON matching the correct schema, using null for blanks.\n\n"
            "--- CIF Example:\n" + GROUND_TRUTHS["CIF-Good.png"] + "\n\n"
            "--- DF Example:\n" + GROUND_TRUTHS["DF-Good.jpg"] + "\n\n"
            "--- DB Example:\n" + GROUND_TRUTHS["DB-Good.jpg"] + "\n\n"
            "--- WF Example:\n" + GROUND_TRUTHS["WF-Good.jpg"] + "\n\n"
            "--- WB Example:\n" + GROUND_TRUTHS["WB-Good.jpg"] + "\n\n"
            "Now extract JSON from this OCR text:\n" + raw_text
        )

        final = client.models.generate_content(
            model=MODEL_ID,
            contents=[schema_prompt]
        )

        pred_json = clean_json_string(final.text)

        print("--- Extracted JSON ---\n", pred_json)

        # Parse the extracted JSON
        try:
            extracted_data = json.loads(pred_json)
        except json.JSONDecodeError:
            extracted_data = {
                "document_type": "unknown",
                "raw_text": pred_json
            }

        # Compute basic metrics
        clean_text = extract_values_from_jsonlike(pred_json)
        ser = compute_spelling_error_rate(clean_text)

        try:
            ppl = compute_perplexity(clean_text)
        except:
            ppl = float("inf")

        refined_ser = compute_refined_metrics(clean_text)

        # Evaluate against ground truth if available
        cer_score = 0.0
        strict_accuracy = 0.0
        fuzzy_accuracy = 0.0

        if filename and filename in GROUND_TRUTHS:
            gt_json = clean_json_string(GROUND_TRUTHS[filename])
            try:
                gt_can = canonicalize(gt_json)
                pred_can = canonicalize(pred_json)
                cer_score = compute_cer(gt_can, pred_can)
                strict_accuracy = compute_field_accuracy(gt_json, pred_json)
                fuzzy_accuracy = compute_fuzzy_field_accuracy(gt_json, pred_json)
            except Exception as e:
                print(f"Error in evaluation: {e}")
        else:
            print("⚠️ No ground truth available for this file.")

        # Prepare metrics with proper handling of infinite values
        metrics = {
            "ser": ser,
            "ppl": 999999.0 if ppl == float("inf") else ppl,  # Replace inf with large finite value
            "refined_ser": refined_ser,
            "cer": cer_score,
            "strict_field_accuracy": strict_accuracy,
            "fuzzy_field_accuracy": fuzzy_accuracy
        }

        return {
            "document_type": extracted_data.get("document_type", "unknown"),
            "extracted": extracted_data,
            "metrics": metrics,
            "raw_text": raw_text,
            "extracted_json": pred_json
        }

    except Exception as e:
        print(f"Error processing document: {e}")
        return {
            "error": str(e),
            "document_type": "unknown",
            "extracted": {},
            "metrics": {
                "ser": 0.0,
                "ppl": 999999.0,  # Replace inf with large finite value
                "refined_ser": 0.0,
                "cer": 0.0,
                "strict_field_accuracy": 0.0,
                "fuzzy_field_accuracy": 0.0
            }
        }
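
# Minimal usage sketch (illustrative only; assumes the module-level `client` and
# `MODEL_ID` used above are configured, and the local sample file path here is
# hypothetical):
#
#   with open("WF-Good.jpg", "rb") as f:
#       result = process_document_image(f.read(), filename="WF-Good.jpg")
#   print(result["document_type"], result["metrics"]["fuzzy_field_accuracy"])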
services/text_processor.py
ADDED
@@ -0,0 +1 @@
# PLACEHOLDER