import gradio as gr
from transformers import pipeline

# Load the BLIP image-to-text model once at startup; the pipeline
# returns a list of dicts like [{"generated_text": "..."}]
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def process_image(input_image):
    # The change event also fires with None when the image is cleared
    if input_image is None:
        return ""
    try:
        # Generate a caption for the uploaded image
        caption = captioner(input_image)[0]["generated_text"]
        return caption
    except Exception as e:
        return f"Error generating caption: {e}"
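
# Note: the image-to-text pipeline accepts a PIL image directly, so the
# gr.Image component's type="pil" output can be passed straight through.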

# Set up Gradio app
with gr.Blocks(fill_height=True) as demo:
    with gr.Sidebar():
        gr.Markdown("# SeeSay - Powered by Sesame CSM")
        gr.Markdown("This Space extracts captions from images and generates expressive speech using CSM.")
        gr.Markdown("Sign in with your Hugging Face account to access the model.")
        button = gr.LoginButton("Sign in")

    # Image Upload and Caption Generation
    image_input = gr.Image(type="pil", label="Upload Image")
    caption_output = gr.Textbox(label="Generated Caption")

    # Speech generation: embed the hosted sesame/csm-1b model as a nested demo.
    # gr.load proxies requests through the "hf-inference" provider, using the
    # token obtained from the LoginButton above (accept_token=button).
    with gr.Row():
        gr.Markdown("### Speech Generation")
        gr.load("models/sesame/csm-1b", accept_token=button, provider="hf-inference")

    # Regenerate the caption whenever the uploaded image changes
    image_input.change(fn=process_image, inputs=image_input, outputs=caption_output)

demo.launch()
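
# Quick local test (a minimal sketch; "example.jpg" is a hypothetical path,
# not part of this Space). Runs the captioner directly, without the Gradio UI:
#
#   from transformers import pipeline
#   captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
#   print(captioner("example.jpg")[0]["generated_text"])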