Spaces:

preston-cell
/

image-text-to-text

Running

File size: 3,685 Bytes

f30b843
1b524ac
 
29f7833
a0edfdb
 
 
 
 
 
 
 
 
e6ea13d
a0edfdb
 
 
 
a483c36
a0edfdb
 
83cd235
a0edfdb
29f7833
a0edfdb
29f7833
ffbd81c
d0aa231
a0edfdb
 
 
 
 
 
8c3caa4
68bf04e
8c3caa4
 
 
1b524ac
8c3caa4
e9b130c
a0edfdb
 
66d96fc
a0edfdb
 
 
29f7833
602e80d
629e04f
a0edfdb
68bf04e
602e80d
a0edfdb
 
 
 
 
29f7833
 
a0edfdb
d0aa231
a0edfdb
ffbd81c
a0edfdb
 
8c3caa4
a0edfdb
 
 
29f7833
1b524ac
a0edfdb
 
 
 
e6ea13d
 
602e80d
e6ea13d
602e80d
629e04f
e9b130c
26dbd13
a0edfdb
26dbd13
602e80d
68bf04e
602e80d
 
efa273d
e9b130c
68bf04e
602e80d
a0edfdb
 
629e04f
5c86456
68bf04e

import gradio as gr
import torch
import numpy as np
from PIL import Image
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    TextStreamer,
)
from datasets import load_dataset

# Run on the GPU when CUDA is visible, otherwise fall back to the CPU.
# Half precision is only selected together with CUDA; the CPU path stays fp32.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
dtype = torch.float16 if cuda_available else torch.float32
print(f"Device set to use {device}")

# Hub checkpoints used by the stages below.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
DOGE_CHECKPOINT = "SmallDoge/Doge-320M-Instruct"
SPEECHT5_CHECKPOINT = "microsoft/speecht5_tts"
XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"

# Stage 1 - image captioning (BLIP) through the high-level pipeline API.
caption_model = pipeline(task="image-to-text", model=BLIP_CHECKPOINT, device=device)

# Stage 2 - OCR (Florence-2-base). The weights are loaded in `dtype`, so
# floating-point inputs fed to this model must use the same dtype.
ocr_model = AutoModelForCausalLM.from_pretrained(
    FLORENCE_CHECKPOINT,
    trust_remote_code=True,
    torch_dtype=dtype,
)
ocr_model = ocr_model.to(device)
ocr_processor = AutoProcessor.from_pretrained(FLORENCE_CHECKPOINT, trust_remote_code=True)

# Stage 3 - context generation (SmallDoge), sampled rather than greedy.
doge_tokenizer = AutoTokenizer.from_pretrained(DOGE_CHECKPOINT)
doge_model = AutoModelForCausalLM.from_pretrained(DOGE_CHECKPOINT, trust_remote_code=True)
doge_model = doge_model.to(device)
doge_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0,
)

# Stage 4 - text-to-speech (SpeechT5).
synthesiser = pipeline(task="text-to-speech", model=SPEECHT5_CHECKPOINT, device=device)

# Fixed speaker voice: one x-vector taken from the validation split.
# unsqueeze(0) adds the leading batch axis. The dataset card describes these
# x-vectors as 512-dimensional, i.e. a [1, 512] tensor; an earlier note here
# said 600, which does not match the card - re-check if the index or the
# dataset is ever changed.
embeddings_dataset = load_dataset(XVECTOR_DATASET, split="validation")
speaker_xvector = embeddings_dataset[7306]["xvector"]
speaker_embedding = torch.tensor(speaker_xvector).unsqueeze(0)

def process_image(image):
    """Caption an image, OCR it, infer a context, and speak that context.

    Args:
        image: PIL image handed over by the Gradio input component, or
            ``None`` when the user submits without uploading anything.

    Returns:
        A 4-tuple ``(audio, caption, extracted_text, context)``. ``audio`` is
        ``(sampling_rate, waveform)`` as accepted by ``gr.Audio``. On failure
        ``audio`` is ``None``, the caption slot carries ``"Error: ..."`` and
        the remaining two strings are empty.
    """
    # Nothing uploaded: report it directly instead of failing inside a model.
    if image is None:
        return None, "Error: no image provided", "", ""

    try:
        # Caption generation
        caption = caption_model(image)[0]['generated_text']

        # OCR extraction. The OCR model was loaded in `dtype` (fp16 on CUDA),
        # so the floating-point inputs (pixel_values) must be cast to match;
        # moving only the device leaves them in fp32 and fails on GPU.
        ocr_inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
        ocr_outputs = ocr_model.generate(
            input_ids=ocr_inputs["input_ids"],
            pixel_values=ocr_inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False,
        )
        extracted_text = ocr_processor.batch_decode(ocr_outputs, skip_special_tokens=True)[0]

        # Context generation using Doge
        prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="pt").to(device)
        doge_output = doge_model.generate(doge_inputs, generation_config=doge_config)

        # A decoder-only generate() returns prompt + completion. Drop the
        # prompt tokens so the context (and the audio) is only the answer.
        prompt_length = doge_inputs.shape[-1]
        completion_ids = doge_output[0][prompt_length:]
        context = doge_tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

        # Convert context to speech. If the model produced no text at all,
        # speak the caption rather than failing the whole request.
        spoken_text = context or caption
        speech = synthesiser(
            spoken_text,
            forward_params={"speaker_embeddings": speaker_embedding}
        )
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context

    except Exception as e:
        # UI boundary: surface the failure in the caption field instead of
        # crashing the request.
        return None, f"Error: {e}", "", ""

# Gradio UI: a single image goes in; the four outputs are listed in the same
# order as the tuple returned by process_image (audio, caption, OCR, context).
image_input = gr.Image(type='pil', label="Upload an Image")
result_outputs = [
    gr.Audio(label="Generated Audio"),
    gr.Textbox(label="Generated Caption"),
    gr.Textbox(label="Extracted Text (OCR)"),
    gr.Textbox(label="Generated Context"),
]

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=result_outputs,
    title="SeeSay Contextualizer with Doge + SpeechT5",
    description="Upload an image to generate a caption, extract OCR text, determine context with Doge-320M, and hear it with SpeechT5.",
)

# Start the app and request a public share link.
iface.launch(share=True)