import gradio as gr
import numpy as np
from transformers import pipeline

# Image-captioning model (BLIP)
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-speech model (Bark)
synthesiser = pipeline("text-to-speech", model="suno/bark")
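# If memory or startup time is a concern, "suno/bark-small" is a lighter
# published checkpoint that can be swapped in as a drop-in replacement.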

def launch(input_image):
    # Step 1: generate a caption for the uploaded image
    caption = captioner(input_image)[0]["generated_text"]
    # Step 2: synthesise expressive speech from the caption
    speech = synthesiser(caption, forward_params={"do_sample": True})
    # Drop any extra batch/channel dimension so Gradio gets a 1-D waveform
    audio = np.squeeze(speech["audio"])
    rate = speech["sampling_rate"]
    # gr.Audio(type="numpy") expects a (sample_rate, data) tuple
    return (rate, audio), caption

# Gradio UI: image in, narrated audio and caption out
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech.",
)

iface.launch(share=True)
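
# Requires `gradio`, `transformers`, `torch`, and `numpy`. When hosted on
# Hugging Face Spaces the app is launched automatically, so share=True only
# matters when running this script locally.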