import gradio as gr
from transformers import pipeline, AutoProcessor, AutoModelForCausalLM, set_seed
from datasets import load_dataset
import torch
import numpy as np
# Set seed for reproducibility
set_seed(42)
# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
# Load Florence-2 model for OCR
ocr_device = "cuda:0" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
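# Florence-2 requires trust_remote_code=True because its modeling code is distributed with the checkpoint.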
ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", torch_dtype=ocr_dtype, trust_remote_code=True).to(ocr_device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
# Load GPT-2 (124M) model for text generation
gpt2_generator = pipeline('text-generation', model='gpt2')
# Load speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
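# Index 7306 is the speaker x-vector used in the official SpeechT5 text-to-speech examples.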
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
def process_image(image):
    try:
        # Generate caption from the image
        caption = caption_model(image)[0]['generated_text']
        # Extract text (OCR) using Florence-2
        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=4096,
            num_beams=3,
            do_sample=False
        )
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        # Generate context using GPT-2 (124M)
        prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
        context_output = gpt2_generator(prompt, max_length=100, num_return_sequences=1)
        context = context_output[0]['generated_text']
        # Convert context to speech
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": speaker_embedding}
        )
        # Prepare audio data
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]
        # Return audio, caption, extracted text, and context
        return (rate, audio), caption, extracted_text, context
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer with GPT-2 (124M)",
    description="Upload an image to generate a caption, extract text, determine the context with GPT-2, and listen to the context as speech."
)
iface.launch()
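# A minimal sketch for testing process_image() outside the Gradio UI; it assumes
# Pillow is installed and that a local file named "example.jpg" exists:
#
#   from PIL import Image
#   audio_out, caption, extracted_text, context = process_image(Image.open("example.jpg"))
#   print(caption, extracted_text, context, sep="\n")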