Spaces:

NeuralFalcon
/

Image-to-Prompt

Running

App Files Files Community

Image-to-Prompt / app.py

NeuralFalcon

Create app.py

4255784 verified 4 days ago

raw

history blame contribute delete

2.61 kB


	from transformers import AutoProcessor, AutoModelForImageTextToText
	import torch

	# https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct
	# https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
	# model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
	# model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"

	# Load the processor and model once at import time so every request reuses them.
	model_name = "SmolVLM2-2.2B-Instruct"
	model_path = f"HuggingFaceTB/{model_name}"
	processor = AutoProcessor.from_pretrained(model_path)

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Half precision only pays off on a GPU (e.g. a T4); on the CPU fallback keep
	# full precision, since FP16 CPU kernels are slow or missing for some ops.
	model_dtype = torch.float16 if device.type == "cuda" else torch.float32

	# Place the model with a single explicit `.to(device)`. The previous
	# combination of `device_map="auto"` and `.to(device)` was redundant when the
	# model fit on one device and is rejected by Accelerate when auto-placement
	# offloads part of the model.
	model = AutoModelForImageTextToText.from_pretrained(
	    model_path,
	    torch_dtype=model_dtype,
	).to(device)


	import torch
	import os
	def describe_image(image_path, user_prompt="Describe the image in detail.", system_role=""):
	    """Generate a text answer about an image using the module-level model.

	    Args:
	        image_path: Filesystem path of the image. May be ``None`` or empty,
	            e.g. when the form is submitted without an upload.
	        user_prompt: Instruction sent to the model together with the image.
	        system_role: Optional system message; skipped when empty.

	    Returns:
	        The generated text with surrounding whitespace removed, or ``None``
	        when ``image_path`` is missing or does not point to an existing file.
	    """
	    # Guard against a missing upload first: os.path.exists(None) raises
	    # TypeError instead of returning False.
	    if not image_path or not os.path.exists(image_path):
	        return None

	    messages = []
	    if system_role:
	        messages.append(
	            {
	                "role": "system",
	                "content": [{"type": "text", "text": system_role}],
	            }
	        )
	    messages.append(
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": user_prompt},
	                {"type": "image", "path": image_path},
	            ],
	        }
	    )

	    # Prepare input
	    inputs = processor.apply_chat_template(
	        messages,
	        add_generation_prompt=True,
	        tokenize=True,
	        return_dict=True,
	        return_tensors="pt",
	    ).to(model.device)

	    # Cast floating-point tensors (the image features) to the dtype the model
	    # was actually loaded with rather than a hard-coded float16; integer
	    # tensors such as token ids are left untouched.
	    for name, tensor in inputs.items():
	        if torch.is_floating_point(tensor):
	            inputs[name] = tensor.to(model.dtype)

	    # Generate response (greedy decoding)
	    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1024)

	    # Decode only the newly generated tokens. Splitting the full decoded text
	    # on "Assistant:" would cut off any answer that itself contains that
	    # string. Assumes generate() returns the prompt tokens followed by the
	    # continuation, as causal-LM style models in transformers do.
	    prompt_length = inputs["input_ids"].shape[1]
	    answer_ids = generated_ids[:, prompt_length:]
	    generated_texts = processor.batch_decode(answer_ids, skip_special_tokens=True)
	    return generated_texts[0].replace("\n\n\n\n\n\n", "").strip()

	import gradio as gr

	def ui():
	    """Build the Gradio interface that wraps ``describe_image``.

	    The form has an image upload (handed to the callback as a file path),
	    a pre-filled user prompt, and an optional system role; the result is
	    shown in a single text box.
	    """
	    image_input = gr.Image(type="filepath", label="Upload Image")
	    prompt_input = gr.Textbox(value="Describe the image in detail.", label="User Prompt")
	    role_input = gr.Textbox(value="", label="System Role (Optional)")
	    description_output = gr.Textbox(label="Image Description")

	    interface = gr.Interface(
	        fn=describe_image,
	        inputs=[image_input, prompt_input, role_input],
	        outputs=description_output,
	        title="Image Captioning App",
	        description="Upload an image and customize prompts to get a detailed description.",
	    )
	    return interface
	# Build the interface, enable request queuing, then start serving.
	demo = ui()
	demo.queue()
	demo.launch()