# Hugging Face Space: AxleToe/captcha-solving (status: Running)
# File size: 2,255 Bytes
# 066a23d  (presumably the commit hash shown by the file viewer)

# app.py

import gradio as gr
from PIL import Image
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import torch

print("--- Initializing Solver Service ---")

# Prefer CUDA when the runtime exposes it; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint shared by the processor and the model.
MODEL_ID = "anuashok/ocr-captcha-v3"

# --- One-time startup loading: runs once when the module is executed ---
print("1. Loading TrOCR processor...")
processor = TrOCRProcessor.from_pretrained(MODEL_ID, use_fast=True)
print("   - Processor loaded.")

print("2. Loading VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
# Move the weights to the selected device so they match the input tensors.
model = model.to(device)
print("   - Model loaded.")
print(f"--- Model is running on: {device.upper()} ---")
# --- End of startup loading ---


def solve_captcha(input_image: Image.Image) -> str:
    """Read the text of a CAPTCHA image with the pre-loaded TrOCR model.

    The image is flattened onto an opaque white background before OCR, so
    any transparent region is presented to the model as white.

    Args:
        input_image: The CAPTCHA as a PIL image. Gradio passes ``None``
            when the request carries no image; that case is rejected.

    Returns:
        The first decoded sequence, with special tokens stripped.

    Raises:
        ValueError: If ``input_image`` is ``None``.
    """
    print("--- Received image for solving ---")

    if input_image is None:
        # Fail with an actionable message instead of the opaque
        # AttributeError that calling ``.convert`` on None would raise.
        raise ValueError(
            "No image provided: upload a CAPTCHA image before submitting."
        )

    # 1. Normalise to RGBA so every input mode carries an alpha channel.
    image = input_image.convert("RGBA")

    # 2. Prepare a fully opaque white background of the same size.
    background = Image.new("RGBA", image.size, (255, 255, 255, 255))

    # 3. Composite the image over the background, then drop the alpha channel.
    combined = Image.alpha_composite(background, image).convert("RGB")
    print("   - Image pre-processing complete.")

    # 4. Turn the image into model input on the same device as the model.
    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
    print("   - Image prepared for model.")

    # 5. Run model inference.
    generated_ids = model.generate(pixel_values)
    print("   - Model inference complete.")

    # 6. Decode the token ids; a single image yields a batch of one.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"   - Decoding complete. Result: {generated_text}")

    return generated_text


# --- Build the Gradio interface (UI and API endpoint) and start serving ---
captcha_input = gr.Image(type="pil", label="Upload CAPTCHA Image")
result_output = gr.Textbox(label="Result")

demo = gr.Interface(
    fn=solve_captcha,
    inputs=captcha_input,
    outputs=result_output,
    title="TrOCR CAPTCHA Solver (Custom Logic)",
    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
)
demo.launch()