"""Gradio demo that runs a Florence-2 checkpoint on an uploaded image.

On start-up the script upgrades ``transformers``, loads the Florence-2 model
and processor named by ``model_id``, and serves a one-input / one-output
Gradio interface whose handler is :func:`generate_caption`.
"""

import importlib
import subprocess
import sys

# Upgrade transformers BEFORE it is imported, so this process never mixes
# already-imported old modules with freshly installed files on disk.
# The command is an argument list (no shell): with a shell string the
# ">=4.50.0" part is parsed as an output redirection to a file named
# "=4.50.0" and the version constraint is silently dropped.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--upgrade",
        "transformers>=4.50.0",
    ],
    check=True,
)
# Make sure the import system notices anything pip just wrote.
importlib.invalidate_caches()

import gradio as gr  # noqa: E402
import torch  # noqa: E402
from PIL import Image  # noqa: E402
from transformers import AutoConfig, AutoProcessor  # noqa: E402

model_id = "microsoft/Florence-2-base-ft"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Task string handed to the processor, to post-processing, and used as the
# key of the parsed result.
# NOTE(review): Florence-2 is normally driven by a task token such as
# "<MORE_DETAILED_CAPTION>"; the empty string here looks like the token was
# stripped at some point. Behaviour is left unchanged -- confirm the intended
# token and set it here.
TASK_PROMPT = ""

# Resolve the remote-code package that AutoConfig loaded and import its
# ``modeling_florence2`` sibling module to reach the model class by name.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
module_base = config.__module__.rsplit(".", 1)[0]
modeling_mod_path = module_base + ".modeling_florence2"
modeling_mod = importlib.import_module(modeling_mod_path)
FlorenceLM = getattr(modeling_mod, "Florence2LanguageForConditionalGeneration")

florence_model = FlorenceLM.from_pretrained(
    model_id,
    trust_remote_code=True,
).to(device).eval()

# Fixed: the original passed the undefined name ``model`` here, which raised
# NameError at start-up; the checkpoint identifier is ``model_id``.
florence_processor = AutoProcessor.from_pretrained(
    model_id, trust_remote_code=True
)


def generate_caption(image):
    """Generate text for *image* with the loaded Florence-2 model.

    Args:
        image: A ``PIL.Image.Image``, or any object accepted by
            ``PIL.Image.fromarray`` (it is converted when it is not already
            a PIL image).

    Returns:
        The post-processed generated text stored under ``TASK_PROMPT`` in the
        processor's parsed answer.
    """
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(
        text=TASK_PROMPT, images=image, return_tensors="pt"
    )
    # Move every processor output onto the same device as the model.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Deterministic beam search: 3 beams, no sampling.
    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text and handed to the
    # processor's post-processing step as-is.
    generated_text = florence_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=TASK_PROMPT,
        image_size=(image.width, image.height),
    )

    prompt = parsed_answer[TASK_PROMPT]
    print("\n\nGeneration completed!:" + prompt)
    return prompt


demo = gr.Interface(
    generate_caption,
    inputs=[gr.Image(label="Input Image")],
    outputs=[
        gr.Textbox(label="Output Prompt", lines=3, show_copy_button=True),
    ],
    theme="Yntec/HaleyCH_Theme_Orange",
)

demo.launch(debug=True)