# Spaces: Sleeping  (page-status residue captured with this file; not part of the program)
import gradio as gr | |
from transformers import pipeline | |
# Image-to-text pipeline, built once when the module is imported so that
# every caption request reuses the same loaded model.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
def process_image(input_image):
    """Return a caption for ``input_image`` using the module-level captioner.

    Args:
        input_image: The image to caption, or ``None`` when the image
            component holds no value (e.g. the user cleared the upload).

    Returns:
        The generated caption text; an empty string when there is no image;
        or the exception message as text if captioning fails.
    """
    # The handler is bound to the image component's change event, so it can
    # be invoked with no image at all. Clear the caption instead of running
    # the model on None and displaying the resulting error text.
    if input_image is None:
        return ""

    try:
        # The pipeline yields a list of dicts; the caption is the first
        # entry's 'generated_text'.
        return captioner(input_image)[0]["generated_text"]
    except Exception as exc:
        # UI boundary: surface the failure in the caption box rather than
        # letting the exception propagate out of the event handler.
        return str(exc)
# Gradio UI: sidebar with sign-in, image-to-caption widgets, and the hosted
# CSM model interface.
# NOTE(review): block nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm the Row contents against the
# deployed app.
with gr.Blocks(fill_height=True) as demo:
    # Sidebar: title, description and the Hugging Face login button.
    with gr.Sidebar():
        gr.Markdown(value="# SeeSay - Powered by Sesame CSM")
        gr.Markdown(
            value="This Space extracts captions from images and generates expressive speech using CSM."
        )
        gr.Markdown(
            value="Sign in with your Hugging Face account to access the model."
        )
        button = gr.LoginButton(value="Sign in")

    # Caption section: uploaded image (delivered to the handler as a PIL
    # image) and the textbox that shows the generated caption.
    image_input = gr.Image(label="Upload Image", type="pil")
    caption_output = gr.Textbox(label="Generated Caption")

    # Speech section: the CSM model interface, loaded with the login button
    # passed as `accept_token`.
    with gr.Row():
        gr.Markdown(value="### Speech Generation")
        gr.load(
            name="models/sesame/csm-1b",
            accept_token=button,
            provider="hf-inference",
        )

    # Re-run captioning whenever the image component's value changes.
    image_input.change(
        fn=process_image,
        inputs=[image_input],
        outputs=[caption_output],
    )

demo.launch()