# Spaces: Sleeping
# app.py
import gradio as gr | |
from PIL import Image | |
import torch | |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel | |
import re | |
import json | |
# Load the TrOCR processor and model once at import time; both come from the
# same checkpoint. The model is switched to evaluation mode for inference.
_TROCR_CHECKPOINT = "microsoft/trocr-base-stage1"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
model.eval()
def ocr_trocr(pil_image):
    """Run the module-level TrOCR model on one image and return its text.

    Args:
        pil_image: Image object exposing ``convert`` (the Gradio inputs in
            this file are configured with ``type="pil"``).

    Returns:
        The first decoded string produced by the model, with special tokens
        removed.
    """
    rgb = pil_image.convert("RGB")
    encoded = processor(images=rgb, return_tensors="pt")
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        token_ids = model.generate(encoded.pixel_values)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
def est_carte_identite_guineenne(texte):
    """Heuristically decide whether text looks like a Guinean identity card.

    The text is upper-cased, its diacritics are removed, and it is scanned
    for a fixed list of unaccented keywords. The document is accepted when at
    least three keywords occur as substrings.

    Args:
        texte: Text to inspect (str), typically raw OCR output.

    Returns:
        True when three or more keywords are found, False otherwise.
    """
    import unicodedata

    # The keywords below are unaccented, so accents must be folded in the
    # input too; otherwise "RÉPUBLIQUE", "GUINÉE", "IDENTITÉ"... never match.
    decomposed = unicodedata.normalize(
        "NFD", texte.upper().replace("’", "'")
    )
    texte = "".join(c for c in decomposed if not unicodedata.combining(c))

    # NOTE: matching is by substring, so overlapping keywords ("NOM" inside
    # "PRENOM", "GUINEE" inside "GUINEENNE") can each count for one word.
    mots_cles = (
        "CARTE", "IDENTITE", "GUINEE", "GUINEENNE", "REPUBLIQUE",
        "CEDEAO", "GIN", "DATE DE NAISSANCE", "NUMERO", "MSPC",
        "NOM", "PRENOM",
    )
    return sum(1 for mot in mots_cles if mot in texte) >= 3
def extraire_donnees(texte):
    """Extract structured fields from identity-card text with regexes.

    The text is upper-cased and stripped of diacritics, then each field's
    pattern is searched once; the first capture group of the first match is
    kept.

    Args:
        texte: Text to parse (str), typically raw OCR output.

    Returns:
        A dict mapping field names (``nom``, ``prenom``, ``sexe``,
        ``taille``, ``nationalite``, ``date_naissance``, ``numero_id``,
        ``nin``, ``date_emission``, ``date_expiration``, ``lieu``) to the
        matched strings. Fields that are not found are omitted.
    """
    import unicodedata

    # Labels in the patterns are unaccented; fold accents so that "PRÉNOM",
    # "NATIONALITÉ", "D'ÉMISSION", "LABÉ"... are recognised as well.
    decomposed = unicodedata.normalize("NFD", texte.upper())
    texte = "".join(c for c in decomposed if not unicodedata.combining(c))

    patterns = {
        # The look-behind stops the "NOM" inside "PRENOM" from being taken
        # as the surname label.
        "nom": r"(?<![A-Z])NOM\s*[:\-]?\s*([A-Z\-]+)",
        "prenom": r"PRENOM\s*[:\-]?\s*([A-Z\-]+)",
        "sexe": r"SEXE\s*[:\-]?\s*([MF])",
        "taille": r"TAILLE\s*[:\-]?\s*([0-9,.]+\s?M)",
        "nationalite": r"NATIONALITE\s*[:\-]?\s*([A-Z]+)",
        "date_naissance": r"(\d{2}\s(?:JAN|FEB|MAR|APR|MAI|JUN|JUL|AOU|SEP|OCT|NOV|DEC)\s\d{4})",
        # Digit guards keep the 15-digit pattern from matching a prefix of a
        # 16-digit number (and vice versa for longer digit runs).
        "numero_id": r"(?<!\d)(\d{16})(?!\d)",
        "nin": r"(?<!\d)(\d{15})(?!\d)",
        "date_emission": r"DATE D['’]?EMISSION\s*[:\-]?\s*(\d{2}\s\w+\s\d{4})",
        "date_expiration": r"DATE D['’]?EXPIRATION\s*[:\-]?\s*(\d{2}\s\w+\s\d{4})",
        # Must be a capture group: every field is read through group(1).
        "lieu": r"(CONAKRY|KANKAN|NZEREKORE|LABE|KINDIA|BOKE|FARANAH)",
    }

    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, texte)
        if match:
            data[key] = match.group(1)
    return data
def analyse_carte(recto_img, verso_img):
    """OCR both sides of a card, validate it and extract its fields.

    Args:
        recto_img: Image of the front side, or None when not provided.
        verso_img: Image of the back side, or None when not provided.

    Returns:
        A ``(text, fields)`` tuple. ``text`` is the OCR output of the
        provided sides joined by a newline, or a user-facing message when no
        image was given, the document is rejected, or processing failed.
        ``fields`` is the dict of extracted fields, empty in those message
        cases.
    """
    # NOTE(review): an image slot left empty is presumed to arrive as None;
    # skip it instead of failing with an AttributeError on ``.convert``.
    images = [img for img in (recto_img, verso_img) if img is not None]
    if not images:
        return "Veuillez fournir au moins une image (recto ou verso).", {}

    # UI boundary: any failure is reported to the user as text rather than
    # raised.
    try:
        texte_total = "\n".join(ocr_trocr(img) for img in images)
        if not est_carte_identite_guineenne(texte_total):
            return " Ce document ne semble pas être une carte d'identité guinéenne.", {}
        return texte_total, extraire_donnees(texte_total)
    except Exception as e:
        return f"Erreur de traitement : {e}", {}
# Gradio UI: two image inputs (front and back of the card) feed
# ``analyse_carte``; its text and field dict are shown in a textbox and a
# JSON viewer.
_entrees = [
    gr.Image(type="pil", label="Image Recto"),
    gr.Image(type="pil", label="Image Verso"),
]
_sorties = [
    gr.Textbox(label="Texte OCR extrait"),
    gr.JSON(label="Champs structurés extraits"),
]
interface = gr.Interface(
    fn=analyse_carte,
    inputs=_entrees,
    outputs=_sorties,
    title="OCRIA - Lecture intelligente de carte d'identité guinéenne",
    description="Scannez les deux faces d'une carte d'identité guinéenne. Le système vérifie et extrait automatiquement les informations clés.",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()