Image-to-Text / app.py
Futuretop's picture
Update app.py
a8150fc verified
raw
history blame
2.46 kB
import gradio as gr
import subprocess
import torch
from PIL import Image
from transformers import AutoProcessor, AutoConfig
import importlib.util, sys, os
# Upgrade transformers at start-up.
# The requirement is passed as a single argv element (no shell) because in a
# shell command line the ">" in "transformers>=4.50.0" is an output
# redirection: the version constraint would be dropped and pip's output would
# be written to a file named "=4.50.0".
# sys.executable -m pip targets the interpreter that is running this script.
# NOTE: transformers is already imported at the top of this file, so an
# upgrade performed here only takes effect the next time the process starts.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--upgrade",
        "transformers>=4.50.0",
    ],
    check=True,
)
# Hugging Face Hub id of the checkpoint used for captioning.
model_id = "microsoft/Florence-2-base-ft"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading the config with trust_remote_code imports the repository's config
# module; its location on disk tells us where the sibling modeling file is.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config_mod_name = config.__class__.__module__
config_mod = sys.modules[config_mod_name]
code_dir = os.path.dirname(config_mod.__file__)
modeling_path = os.path.join(code_dir, "modeling_florence2.py")
if not os.path.exists(modeling_path):
    raise FileNotFoundError(f"Couldn’t find {modeling_path}")

# Import the modeling file directly from its path under a private module name.
# NOTE(review): if modeling_florence2.py uses package-relative imports, loading
# it as a top-level module like this may fail -- confirm.
spec = importlib.util.spec_from_file_location("florence2_modeling", modeling_path)
flor_mod = importlib.util.module_from_spec(spec)
sys.modules["florence2_modeling"] = flor_mod
spec.loader.exec_module(flor_mod)

# NOTE(review): generate_caption() passes pixel_values to generate(); confirm
# that this class (rather than Florence2ForConditionalGeneration) accepts them.
FlorenceLM = flor_mod.Florence2LanguageForConditionalGeneration
florence_model = FlorenceLM.from_pretrained(
    model_id,
    trust_remote_code=True,
).to(device).eval()

# The processor must be loaded from the same checkpoint id as the model. The
# previous code passed the undefined name `model`, which raised NameError.
florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
def generate_caption(image):
    """Generate a detailed caption for an image with the Florence model.

    Args:
        image: A ``PIL.Image.Image``, or array data accepted by
            ``Image.fromarray``. ``None`` is tolerated.

    Returns:
        The text stored under the ``<MORE_DETAILED_CAPTION>`` key of the
        processor's post-processed output, or an empty string when ``image``
        is ``None``.
    """
    # Nothing to caption; without this guard Image.fromarray(None) raises.
    if image is None:
        return ""

    # Task token: used as the prompt, the post-processing task and the
    # result key, so keep it in one place.
    task = "<MORE_DETAILED_CAPTION>"

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(text=task, images=image, return_tensors="pt")
    # Move every tensor produced by the processor onto the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients are needed, so do not track them.
    with torch.no_grad():
        generated_ids = florence_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = florence_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=task,
        image_size=(image.width, image.height),
    )
    prompt = parsed_answer[task]
    print("\n\nGeneration completed!:"+ prompt)
    return prompt
# Build the web UI: one image input, one copyable text output, wired to
# generate_caption, then start serving it in debug mode.
image_input = gr.Image(label="Input Image")
prompt_output = gr.Textbox(
    label="Output Prompt",
    lines=3,
    show_copy_button=True,
)
demo = gr.Interface(
    fn=generate_caption,
    inputs=[image_input],
    outputs=[prompt_output],
    theme="Yntec/HaleyCH_Theme_Orange",
)
demo.launch(debug=True)