preston-cell's picture
Update app.py
eade8cd verified
raw
history blame
959 Bytes
import gradio as gr
import numpy as np

# Use a pipeline as a high-level helper
from transformers import pipeline
# Image-captioning pipeline: maps a PIL image to a caption string
# (read back below via result[0]['generated_text']).
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Text-to-speech pipeline: its output is read below via the
# "audio" and "sampling_rate" keys.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def launch(input):
    """Caption an image, then narrate the caption as speech.

    Args:
        input: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        A ``(numpy_array, sampling_rate)`` tuple, the format the Gradio
        ``Audio(type="numpy")`` output component expects.
    """
    # BUGFIX: the original body referenced `input_image`, a name that does
    # not exist — the parameter is `input` — so every call raised NameError.
    # Step 1: Describe the image with the captioning pipeline.
    caption = pipe(input)[0]['generated_text']
    # Step 2: Synthesize speech from the caption.
    audio_output = narrator(caption)
    # Step 3: Unpack the TTS result into Gradio's expected tuple form.
    audio_data = audio_output["audio"]
    sampling_rate = audio_output["sampling_rate"]
    # np.array is a no-op copy if audio_data is already an ndarray.
    return (np.array(audio_data), sampling_rate)
# Build and serve the UI: image in, narrated audio out.
# BUGFIX: the original passed `launch` both positionally and as `fn=launch`,
# which raises TypeError ("got multiple values for argument 'fn'") before the
# app ever starts — pass it exactly once.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil'),
    outputs=gr.Audio(type="numpy", label="Narrated Output"),
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)
iface.launch()