melhiq_ocr / app /extraction.py
mussie1212's picture
fix:first commit on the ocr
c53b292
raw
history blame
9.98 kB
import json
import os
import sys
import time
# Extend sys.path so the `app` package imported below can be resolved.
# Both paths are derived from the current working directory (not from this
# file's location), so the result depends on where the process is launched.
cur_dir = os.getcwd()
# Parent of the working directory, with symlinks resolved.
parent_dir = os.path.realpath(os.path.join(os.path.dirname(cur_dir)))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    sys.path.append(cur_dir)
# Also make "." (the working directory at import time) searchable, at index 1.
sys.path.insert(1, ".")
from PIL import Image
import logging
# NOTE(review): helpers used further down (e.g. clean_json_response,
# create_connection) are presumably provided by this wildcard import --
# verify against app/utils.py.
from app.utils import *
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
## processing engine for PDF or Image
# Instruction prompt sent to the model together with the document image.
# Defined once at module level so it is not rebuilt on every retry attempt.
# The text is part of the runtime behaviour: do not reformat or "fix" it
# without re-validating the model output.
_EXTRACTION_PROMPT = """
Analyze the document provided in the image.
First, identify the type of document: 'ID', 'driving_license', or 'librea' (car ownership document).
Then, based on the document type, extract only the following specific information as key-value pairs.
The document may contain text in both English and Amharic. Extract the values corresponding to the specified labels.
If a field has an Amharic version distinct from the English one, include it with a suffix '_amharic' (e.g., full_name_amharic).
If a field is not found, use "N/A" as the value.
For 'ID':
- full_name (Extract from "Full Name" or equivalent Amharic label)
- date_of_birth (Extract from "Date of Birth" or equivalent)
- expiration_date (Extract from "Expiration Date" or equivalent)
- sex (Extract from "Sex" or equivalent)
- country_of_citizenship (Extract from "Country of Citizenship" or equivalent)
- fcn_number (Extract from "FCN Number" or equivalent)
- phone_number (Extract from "Phone Number" or equivalent)
- address (Extract from "Address" or equivalent)
- city (Extract from "City" or equivalent)
- woreda (Extract from "Woreda" or equivalent)
For 'driving_license':
- full_name (Extract from "Full Name" or "ስም")
- region(Extract form "ክልል")
- zone or city (Extract from "ዞን/ከተማ")
- nationality (Extracted from "ዜግነት")
- kebele (extracted from "ቀበሌ")
- license_number (Extract from "License Number" or "የፍቃድ ቁጥር")
- date_of_birth (Extract from "Date of Birth" or "የልደት ቀን")
- issue_date (Extract from "Date of Issue" or "የተሰጠበት ቀን")
- expiration_date (Extract from "Expiration Date" or "የሚያበቃበት ቀን")
- phone_number (Extract from "Phone Number" or "ስልክ ቁጥር" or "ስልክ")
- issuing_authority (Extract from "Issuing Authority" or "የኢትዮጵያ መንግስት የትራንስፖርት እና ትራፊክ ቁጥጥር ባለስልጣን")
- gender (Extract from "Gender" or "ፆታ")
- issuing_place (Extract from "Issuing Place" or "የተሰጠበት ቦታ")
For 'librea':
- owner_full_name (Extract from "Owner's Full Name" or "ስም")
- owner_address (Combine and extract from "Owner's Address" or "ክልል", "ከተማ", "ክ/ከተማ", "ቀበሌ/ወረዳ", "የቤት ቁጥር")
- gender (Extract from "Gender" or "ፆታ")
- nationality (Extract from "Nationality" or "ዜግነት")
- phone_number (Extract from "Phone Number" or "ስልክ")
- car type (Extract from "የተሽከርካሪዉ አይነት")
- registration_number (Extract from "Registration Number" or "የሠሌዳ ቁጥር")
- car_make (Extract from "Make" or "የተሰራበት ሀገር")
- car_model (Extract from "Model" or "የተሽ/ሞዴል")
- car_year (Extract from "Year" or "የተሰራበት አመት")
- engine_number (Extract from "Engine Number" or "የሞተር ቁጥር")
- vin_number (Extract from "Chassis Number" or "የሻንሲ ቁጥር")
- body_type (Extract from "Body Type" or "የአካሉ አይነት")
- color (Extract from "ቀለም")
- fuel type (Extract from " የነዳጅ አይነት")
- engine horsepower (Extract from "የሞተር የፈረስ ጉልበት")
- car horsepower (extract from " የተሽ/ጠቅ/ጉልበት")
- single weight (extract from " ነጠላ ክብደት")
- issue_date (Extract from "Issue Date" or "የተሰጠበት ቀን")
- expiration_date (Extract from "Expiration Date" or "የሚያበቃበት ቀን")
- motor capacity -cc (extracted from "የሞተር ችሎታ /ሲሲ/")
- cilinder capacity (extractd form "የሲሊንደርብዛት")
- Approved work (extract from " የተፈቀለት የስራ ፀባይ")
Output *only* valid JSON, with no additional text, comments, or explanations. Use this format:
{"document_type": "type_here", "extracted_data": {"key1": "value1", "key2": "value2", ...}}
Example output for an ID:
{"document_type": "ID", "extracted_data": {"full_name": "John Doe", "date_of_birth": "1990-01-01", "expiration_date": "2030-01-01", "sex": "Male", "country_of_citizenship": "Ethiopia", "fcn_number": "123456789", "phone_number": "+251912345678", "address": "123 Main St", "city": "Addis Ababa", "woreda": "Bole"}}
Example output for a driving_license:
{"document_type": "driving_license", "extracted_data": {"full_name": "Jane Smith", "full_name_amharic": "ጄን ስሚት", "license_number": "DL987654", "date_of_birth": "1985-03-15", "issue_date": "2020-05-10", "expiration_date": "2030-05-10", "class_type": "B", "restrictions": "None", "phone_number": "+251912345678", "blood_type": "O+", "address": "456 Oak St", "address_amharic": "456 ኦክ ስትሪት", "issuing_authority": "Ethiopian Transport Authority", "gender": "Female", "issuing_place": "Addis Ababa"}}
Example output for a librea:
{"document_type": "librea", "extracted_data": {"owner_full_name": "Alice Brown", "owner_full_name_amharic": "አሊስ ብራውን", "owner_address_amharic": "789 ፓይን ስትሪት, አዲስ አበባ", "phone_number": "+251987654321", "registration_number": "XYZ123", "car_make": "Toyota", "car_model": "Camry", "car_year": "2020", "engine_number": "ENG789", "vin_number": "1HGCM82633A123456", "body_type": "Sedan", "gender": "Female", "nationality": "Ethiopian", "passenger_capacity": "5", "loading_weight": "1500 kg"}}
"""


def process_document_file(file_path, model, max_retries=3, base_delay=15):
    """
    Send a document image to the model and return the extracted data as JSON text.

    The file is opened with PIL and passed, together with the extraction
    prompt, to ``model.generate_content``. The reply is first run through
    ``clean_json_response``; if that yields nothing, the stripped raw reply is
    accepted only when it parses as JSON.

    Retry behaviour:
      * an unparsable reply is retried immediately (same prompt, no delay);
      * an exception whose message contains "429" is retried after
        ``base_delay * 2 ** attempt`` seconds;
      * any other exception, or running out of attempts, ends with an error
        result.

    NOTE(review): the file is opened with ``PIL.Image.open``; Pillow presumably
    cannot decode PDFs, so a PDF path likely ends in the generic error result.
    Confirm, or convert PDFs to images before calling this function.

    Args:
        file_path (str): Path to the document file.
        model: Object exposing ``generate_content([prompt, image])``.
        max_retries (int): Total number of attempts to make.
        base_delay (int): Base delay in seconds for the exponential backoff
            applied to "429" errors.
    Returns:
        str: JSON text produced by the model, or a JSON object with a single
        ``"error"`` key describing the failure. Never ``None``.
    """
    # Without this guard the loop below would not run and the function would
    # silently return None instead of the documented JSON string.
    if max_retries < 1:
        return json.dumps({"error": "max_retries must be at least 1"})

    for attempt in range(max_retries):
        has_attempts_left = attempt < max_retries - 1
        try:
            # Context manager closes the underlying file handle after every
            # attempt, including the ones that fail.
            with Image.open(file_path) as pil_image:
                response = model.generate_content([_EXTRACTION_PROMPT, pil_image])
                response.resolve()

            # Log raw response for debugging
            logger.debug("Raw model response: %s", response.text)
            raw_text = response.text.strip()

            # Preferred path: helper-normalised JSON.
            cleaned_response = clean_json_response(raw_text)
            if cleaned_response:
                logger.info("Successfully processed file: %s", file_path)
                return cleaned_response

            # Fallback: accept the raw reply only if it is valid JSON as-is.
            try:
                json.loads(raw_text)
            except json.JSONDecodeError:
                logger.error("Invalid JSON response from model")
                if has_attempts_left:
                    # The same prompt is re-sent; only the log wording says
                    # "simplified".
                    logger.warning(
                        "Retrying with simplified prompt... (Attempt %s/%s)",
                        attempt + 1,
                        max_retries,
                    )
                    continue
                return json.dumps({"error": "Invalid JSON response from model after retries"})

            logger.info("Successfully processed file: %s", file_path)
            return raw_text
        except Exception as e:
            # Quota errors are detected by the "429" status code appearing in
            # the exception text and are retried with exponential backoff.
            if "429" in str(e) and has_attempts_left:
                delay = base_delay * (2 ** attempt)
                logger.warning(
                    "Quota exceeded, retrying in %s seconds... (Attempt %s/%s)",
                    delay,
                    attempt + 1,
                    max_retries,
                )
                time.sleep(delay)
                continue
            logger.error("Error processing file: %s", e)
            return json.dumps({"error": f"An error occurred: {str(e)}"})

    # Unreachable when max_retries >= 1 (every iteration returns or continues
    # with attempts left); kept so the function can never return None.
    return json.dumps({"error": "Invalid JSON response from model after retries"})
## Main function to process document
def process_document(file_path, api_key, model_name='gemini-1.5-flash'):
    """
    Entry point: connect to the model, check the file extension, run extraction.

    A model handle is obtained through ``create_connection`` first; the file
    is then dispatched to ``process_document_file`` if its extension
    (case-insensitive) is one of the accepted ones. Any exception raised along
    the way is logged and converted into an error result.

    NOTE(review): '.pdf' is accepted here, but ``process_document_file`` opens
    the file with PIL, which presumably cannot decode PDFs -- confirm.

    Args:
        file_path (str): Path to the document file (PDF or image).
        api_key (str): API key handed to ``create_connection``.
        model_name (str): Model name, default 'gemini-1.5-flash'.
    Returns:
        str: Whatever ``process_document_file`` returns for a supported file,
        otherwise a JSON object with a single ``"error"`` key.
    """
    accepted_suffixes = ('.pdf', '.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')
    try:
        model = create_connection(api_key, model_name)
        _, suffix = os.path.splitext(file_path)
        # Guard clause: reject anything that is not a known document/image type.
        if suffix.lower() not in accepted_suffixes:
            logger.error("Unsupported file type")
            return json.dumps({"error": "Unsupported file type. Please provide a PDF or image file."})
        return process_document_file(file_path, model)
    except Exception as err:
        logger.error(f"Error in process_document: {str(err)}")
        return json.dumps({"error": f"An error occurred: {str(err)}"})