import gradio as gr
from transformers import pipeline, AutoProcessor, AutoModelForCausalLM, set_seed
from datasets import load_dataset
import torch
import numpy as np

# Set seed for reproducibility
set_seed(42)

# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
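# The image-to-text pipeline returns a list of dicts, e.g.
# [{'generated_text': 'a photo of ...'}]; we read the first entry below.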

# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
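# The TTS pipeline returns {'audio': np.ndarray, 'sampling_rate': int}; SpeechT5
# additionally needs a speaker x-vector, passed below via forward_params.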

# Load Florence-2 model for OCR
ocr_device = "cuda:0" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", torch_dtype=ocr_dtype, trust_remote_code=True).to(ocr_device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
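# Florence-2 is prompted with task tokens; "<OCR>" requests plain-text OCR
# (the model card also lists tasks such as "<CAPTION>" and "<OCR_WITH_REGION>").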

# Load GPT-2 (124M) model for text generation
gpt2_generator = pipeline('text-generation', model='gpt2')

# Load speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
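# Index 7306 is the x-vector used in the official SpeechT5 examples
# (a US English female speaker from the CMU ARCTIC set).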


def process_image(image):
    try:
        # Generate caption from the image
        caption = caption_model(image)[0]['generated_text']

        # Extract text (OCR) using Florence-2
        # BatchFeature.to() casts only floating-point tensors, so input_ids stay integer.
        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False
        )
        # Decode with special tokens intact, then run the task-specific
        # post-processor recommended in the Florence-2 model card.
        generated_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        extracted_text = ocr_processor.post_process_generation(
            generated_text, task="<OCR>", image_size=(image.width, image.height)
        )["<OCR>"]

        # Generate context using GPT-2 (124M). Truncate the OCR text so the prompt
        # fits GPT-2's 1,024-token window; max_new_tokens budgets the continuation,
        # and return_full_text=False keeps the prompt out of the spoken output.
        prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text[:500]}. Context:"
        context_output = gpt2_generator(prompt, max_new_tokens=60, num_return_sequences=1, return_full_text=False)
        context = context_output[0]['generated_text'].strip()

        # Convert context to speech
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": speaker_embedding}
        )

        # Prepare audio data
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        # Return audio, caption, extracted text, and context
        return (rate, audio), caption, extracted_text, context

    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
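
# Minimal local sanity check (hypothetical image path), bypassing the Gradio UI:
#   from PIL import Image
#   (rate, audio), caption, text, context = process_image(Image.open("sample.jpg"))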


# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer with GPT-2 (124M)",
    description="Upload an image to generate a caption, extract text with OCR, infer the surrounding context with GPT-2, and hear that context read aloud."
)

iface.launch()