import gradio as gr
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    TextStreamer
)
from datasets import load_dataset
import torch
import numpy as np

# Set device and dtype
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Image Captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-Speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Florence-2-base for OCR
ocr_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base",
    torch_dtype=torch_dtype,
    trust_remote_code=True
).to(device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)

# Doge model for context generation
doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
doge_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)

# Speaker embedding for SpeechT5 (x-vectors are 512-dimensional)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
embedding = None
for entry in embeddings_dataset:
    vector = torch.tensor(entry["xvector"]).unsqueeze(0)
    if vector.shape[1] == 512:
        embedding = vector
        break
if embedding is None:
    raise ValueError("No suitable 512-dimensional speaker embedding found.")

def process_image(image):
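    """Caption the image, extract text with OCR, infer its context, and synthesize speech."""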
    try:
        # Caption
        caption = caption_model(image)[0]['generated_text']

        # OCR
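        # "<OCR>" is the Florence-2 task prompt for plain text extraction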
        ocr_inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
        generated_ids = ocr_model.generate(
            input_ids=ocr_inputs["input_ids"],
            pixel_values=ocr_inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False
        )
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Doge context generation
        prompt = f"Determine the context of this image. Caption: {caption} Extracted text: {extracted_text}"
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(device)

        outputs = doge_model.generate(
            doge_inputs,
            generation_config=doge_config
        )
        # Decode only the newly generated tokens so the prompt is not read back aloud
        context = doge_tokenizer.decode(outputs[0][doge_inputs.shape[1]:], skip_special_tokens=True)

        # TTS
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": embedding}
        )
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context

    except Exception as e:
        return None, f"Error: {str(e)}", "", ""

# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer with Doge & BLIP",
    description="Upload an image to generate a caption, extract text, determine context, and convert it to audio."
)

iface.launch()