# Hugging Face Space: Aekanun/Thai-HandWriting-to-Text (Running on Zero)
# File size: 4,615 Bytes
# 531f528

import os
import warnings
import torch
import gc
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
from PIL import Image
import gradio as gr
from huggingface_hub import login

# Basic settings: silence warnings and disable CUDA usage.
warnings.filterwarnings('ignore')
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Global state shared by the loader and the Gradio callback; both start
# out empty.
model = processor = None

# Log in to the Hugging Face Hub only when a token is present in the
# environment; otherwise just print a warning.
if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
    print("คำเตือน: ไม่พบ HUGGING_FACE_HUB_TOKEN")
else:
    print("กำลังเข้าสู่ระบบ Hugging Face Hub...")
    login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])

def load_model_and_processor():
    """Load the processor, the base Vision2Seq model and the PEFT adapter.

    Everything is loaded onto the CPU in float32. The module-level ``model``
    and ``processor`` globals are assigned together, and only after every
    step has succeeded; if any step fails they keep their previous values,
    so a processor is never published without its model.

    Returns:
        bool: ``True`` when all three artifacts loaded, ``False`` when any
        step raised. The error is printed rather than re-raised.
    """
    global model, processor
    print("กำลังโหลดโมเดลและ processor...")
    try:
        # Model paths
        base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
        adapter_path = "Aekanun/thai-handwriting-llm"

        # ``token=True`` replaces the deprecated ``use_auth_token=True``:
        # both mean "use the locally stored Hub credentials".
        print("กำลังโหลด processor...")
        loaded_processor = AutoProcessor.from_pretrained(
            base_model_path,
            token=True,
        )

        print("กำลังโหลด base model...")
        base_model = AutoModelForVision2Seq.from_pretrained(
            base_model_path,
            device_map={"": "cpu"},  # run on CPU
            torch_dtype=torch.float32,  # float32 instead of bfloat16
            trust_remote_code=True,
            token=True,
        )

        print("กำลังโหลด adapter...")
        loaded_model = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            torch_dtype=torch.float32,
            device_map={"": "cpu"},
            token=True,
        )
    except Exception as e:
        # Top-level boundary: report the failure and let the caller decide
        # what to do based on the boolean result.
        print(f"เกิดข้อผิดพลาดในการโหลดโมเดล: {e}")
        return False

    # Publish both objects only once the whole pipeline is ready.
    processor = loaded_processor
    model = loaded_model
    print("โหลดโมเดลสำเร็จ!")
    return True

def process_handwriting(image):
    """Transcribe Thai handwriting from an image (Gradio callback).

    Reads the module-level ``model`` and ``processor``.

    Args:
        image: A PIL image, or any object accepted by ``Image.fromarray``.
            ``None`` means no image was uploaded.

    Returns:
        str: The text generated by the model, stripped of surrounding
        whitespace and without the echoed prompt. When ``image`` is ``None``
        or any step raises, a Thai-language message is returned instead
        (exceptions are not propagated to the UI).
    """
    if image is None:
        return "กรุณาอัพโหลดรูปภาพ"

    try:
        # Ensure image is in PIL format
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        prompt = (
            "Transcribe the Thai handwritten text from the provided image.\n"
            "Only return the transcription in Thai language."
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image", "image": image},
                ],
            }
        ]

        # add_generation_prompt=True ends the prompt with the assistant
        # header so the model answers instead of continuing the user turn.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already carries the BOS token, so the
        # tokenizer must not prepend another one.
        inputs = processor(
            text=text,
            images=image,
            add_special_tokens=False,
            return_tensors="pt",
        )

        # Move inputs to CPU
        inputs = {k: v.to('cpu') for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id,
            )

        # generate() returns prompt + continuation; decode only the newly
        # generated tokens so the prompt text is not part of the result.
        generated_tokens = outputs[0][prompt_length:]
        transcription = processor.decode(
            generated_tokens,
            skip_special_tokens=True,
        )
        return transcription.strip()
    except Exception as e:
        # UI boundary: surface the error as text rather than crashing.
        return f"เกิดข้อผิดพลาด: {e}"

# Application start-up: load the model first, and only build the UI when
# loading succeeded.
print("กำลังเริ่มต้นแอปพลิเคชัน...")
if not load_model_and_processor():
    print("ไม่สามารถเริ่มต้นแอปพลิเคชันได้")
else:
    # Gradio interface: one uploaded image in, one text box out.
    demo = gr.Interface(
        fn=process_handwriting,
        inputs=gr.Image(
            type="pil",
            label="อัพโหลดรูปลายมือเขียนภาษาไทย",
        ),
        outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
        title="Thai Handwriting Recognition",
        description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ",
        examples=[
            ["example1.jpg"],
            ["example2.jpg"],
        ],
    )

    # Start the server only when executed as a script.
    if __name__ == "__main__":
        demo.launch()