Image-to-Text / app.py
Futuretop's picture
Update app.py
093909c verified
raw
history blame
2.13 kB
import gradio as gr
import subprocess
import torch
from PIL import Image
from transformers import AutoProcessor, AutoConfig
import importlib, sys
# Make sure a sufficiently recent transformers release is installed.
# The command is given as an argument list (no shell) so the ">=" in the
# requirement specifier reaches pip intact: inside a shell string an unquoted
# ">" is an output redirection, which silently drops the version constraint
# and writes pip's output to a file named "=4.50.0".
# "sys.executable -m pip" targets the interpreter that is running this script.
# NOTE(review): transformers is already imported at the top of the file, so an
# upgrade performed here presumably only takes effect on the next start.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--upgrade", "transformers>=4.50.0"],
    check=True,
)
# Hugging Face Hub id of the Florence-2 checkpoint used for captioning.
model_id = "microsoft/Florence-2-base-ft"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The config class's module path is used to locate the sibling
# "modeling_florence2" module (presumably the checkpoint's dynamically loaded
# remote-code package) so the model class can be fetched from it by name.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
module_base = config.__module__.rsplit(".", 1)[0]
modeling_mod_path = module_base + ".modeling_florence2"
modeling_mod = importlib.import_module(modeling_mod_path)

# NOTE(review): generate_caption passes pixel_values to generate(); confirm
# that this class accepts image inputs (the remote code may expose a separate
# full vision-language class) -- TODO verify against the checkpoint's code.
FlorenceLM = getattr(
    modeling_mod,
    "Florence2LanguageForConditionalGeneration"
)

# Load the weights, move the model to the selected device and switch it to
# evaluation mode.
florence_model = FlorenceLM.from_pretrained(
    model_id,
    trust_remote_code=True,
).to(device).eval()

# Load the matching processor from the same checkpoint id. The original code
# passed an undefined name `model` here, which raised NameError at startup.
florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
def generate_caption(image):
    """Produce a detailed caption for *image* with the Florence-2 model.

    Args:
        image: A ``PIL.Image.Image``, or any other object that
            ``Image.fromarray`` can convert into one.

    Returns:
        The entry stored under ``"<MORE_DETAILED_CAPTION>"`` in the
        processor's post-processed output. The same value is printed to
        stdout before being returned.
    """
    task_token = "<MORE_DETAILED_CAPTION>"

    # Anything that is not already a PIL image goes through fromarray.
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

    # Build the model inputs and move each of them onto `device`.
    encoded = florence_processor(text=task_token, images=pil_image, return_tensors="pt")
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    # Beam search with 3 beams and sampling disabled.
    output_ids = florence_model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Decode without stripping special tokens, then let the processor parse
    # the first decoded sequence for this task.
    decoded_batch = florence_processor.batch_decode(output_ids, skip_special_tokens=False)
    raw_text = decoded_batch[0]
    parsed = florence_processor.post_process_generation(
        raw_text,
        task=task_token,
        image_size=(pil_image.width, pil_image.height),
    )

    caption = parsed[task_token]
    print("\n\nGeneration completed!:" + caption)
    return caption
# Gradio UI: a single image input wired to generate_caption, with the result
# shown in a three-line text box that has a copy button.
image_input = gr.Image(label="Input Image")
caption_output = gr.Textbox(
    label="Output Prompt",
    lines=3,
    show_copy_button=True,
)
demo = gr.Interface(
    fn=generate_caption,
    inputs=[image_input],
    outputs=[caption_output],
    theme="Yntec/HaleyCH_Theme_Orange",
)

# Start the app with Gradio's debug flag enabled.
demo.launch(debug=True)