import gradio as gr
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    set_seed
)
import torch
import numpy as np
import pytesseract
from PIL import Image
# Fix random seeds so the sampled Doge generations are reproducible across runs
set_seed(42)
# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Image Captioning (BLIP)
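# The image-to-text pipeline returns a list of dicts, one per image, each with a 'generated_text' field.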
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Text-to-Speech without speaker embeddings
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
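# Note: SpeechT5 can optionally be conditioned on speaker x-vectors via
# forward_params={"speaker_embeddings": ...} at call time; this app deliberately
# runs without them and accepts the model's default (unconditioned) voice.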
# Doge-320M-Instruct for Context Generation
doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
doge_model = AutoModelForCausalLM.from_pretrained(
    "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
).to(device)
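# trust_remote_code=True is required because the Doge checkpoint ships its own modeling code on the Hub.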
doge_generation_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)
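# Sampling (temperature 0.8, top_p 0.9) keeps the generated context varied but on-topic;
# max_new_tokens=100 bounds generation time per request.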
def extract_text_with_tesseract(image):
    """Run Tesseract OCR on a PIL image and return the raw extracted text."""
    return pytesseract.image_to_string(image)
def process_image(image):
    try:
        # 1. Caption the image with BLIP
        caption = caption_model(image)[0]['generated_text']

        # 2. OCR with Tesseract
        extracted_text = extract_text_with_tesseract(image)

        # 3. Context with Doge (truncate inputs to keep the prompt short)
        prompt = (
            f"Determine the context of this image.\n"
            f"Caption: {caption[:200]}\nExtracted text: {extracted_text[:200]}\nContext:"
        )
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(device)
        doge_output = doge_model.generate(
            input_ids=doge_inputs,
            generation_config=doge_generation_config
        )
        # Decode only the newly generated tokens so the spoken context does not echo the prompt
        context = doge_tokenizer.decode(
            doge_output[0][doge_inputs.shape[-1]:], skip_special_tokens=True
        ).strip()

        # 4. Text-to-Speech (no speaker embeddings)
        speech = synthesiser(context)
        audio = np.asarray(speech["audio"]).squeeze()
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
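# process_image returns ((sampling_rate, waveform), caption, ocr_text, context) on success,
# matching the four Gradio outputs below; on failure the audio slot is None and the error
# message is shown in the caption textbox.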
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer (Optimized)",
    description="Upload an image to generate a caption, extract text (OCR), generate context, and hear it spoken."
)
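# share=True asks Gradio for a temporary public gradio.live link when the app is run locally;
# on Hugging Face Spaces the interface is already served publicly.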
iface.launch(share=True)