Hammedalmodel's picture
Update app.py
9f8dfc0 verified
raw
history blame
2.34 kB
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import requests
from io import BytesIO
# Single FastAPI application instance; the route decorator below attaches to it.
app = FastAPI()

# Hugging Face checkpoint id shared by the model weights and the processor.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16, then move the whole model onto the CUDA device.
# Both steps run at import time, so importing this module needs a CUDA GPU.
model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
model = model.to("cuda")

# Processor matching the checkpoint; the endpoint uses it to render the chat
# template, build the model inputs, and decode generated token ids.
processor = AutoProcessor.from_pretrained(ckpt)
# Request body accepted by POST /extract_text.
class ImageRequest(BaseModel):
    # Despite the name, this value is passed to requests.get(), so it must be
    # an HTTP(S) URL the server can reach rather than a local filesystem path.
    image_path: str
# Instruction sent to the model together with the image.
_OCR_PROMPT = (
    "Extract handwritten text from the image and output only the extracted "
    "text without any additional description or commentary in output"
)

# Upper bound, in seconds, on the image download so a stalled remote host
# cannot hang a request indefinitely.
_FETCH_TIMEOUT_SECONDS = 30


def _download_image(url):
    """Fetch *url* and return it as an RGB PIL image.

    Raises:
        HTTPException: 400 if the URL cannot be fetched, answers with a
            non-200 status, or does not contain a decodable image.
    """
    # NOTE(security): the server fetches a caller-supplied URL (SSRF surface).
    # Restrict allowed hosts/schemes before exposing this publicly.
    try:
        response = requests.get(url, timeout=_FETCH_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=400, detail="Failed to fetch image from URL"
        ) from exc
    if response.status_code != 200:
        raise HTTPException(status_code=400, detail="Failed to fetch image from URL")

    try:
        # UnidentifiedImageError is an OSError subclass, so this also covers
        # payloads that are not images at all.
        return Image.open(BytesIO(response.content)).convert("RGB")
    except OSError as exc:
        raise HTTPException(
            status_code=400, detail="URL did not return a readable image"
        ) from exc


def _transcribe(image):
    """Run the vision model on *image* and return only the generated text."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": _OCR_PROMPT},
                {"type": "image"},
            ],
        }
    ]
    prompt_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt_text, images=[image], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=250)

    # generate() returns the prompt tokens followed by the completion. Decode
    # only the completion instead of string-stripping "user"/"assistant" from
    # the full text, which would also delete those words from the handwriting.
    prompt_length = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return processor.decode(completion_ids, skip_special_tokens=True).strip()


@app.post("/extract_text")
async def extract_text(request: ImageRequest):
    """Download the image at ``request.image_path`` and transcribe its handwriting.

    Args:
        request: Body whose ``image_path`` is the URL of the image to read.

    Returns:
        ``{"text": "\\n<extracted text>\\n"}``.

    Raises:
        HTTPException: 400 when the image cannot be fetched or decoded;
            500 for any other failure (for example during generation).
    """
    # NOTE: the download and generation below are blocking calls inside an
    # async handler, so the event loop is held for the duration of a request.
    try:
        image = _download_image(request.image_path)
        result = _transcribe(image)
        return {"text": f"\n{result}\n"}
    except HTTPException:
        # Keep deliberate 4xx responses; without this they were caught by the
        # generic handler below and reported as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run as a script.
    import uvicorn

    # Listen on every interface at port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)