import gradio as gr
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    set_seed
)
import torch
import numpy as np
import pytesseract
from PIL import Image

set_seed(42)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Image Captioning (BLIP)
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-Speech without speaker embeddings
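# (the pipeline can also take forward_params={"speaker_embeddings": ...} for a specific voice; omitted here by design)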
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Doge-320M-Instruct for Context Generation
doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
doge_model = AutoModelForCausalLM.from_pretrained(
    "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
).to(device)

doge_generation_config = GenerationConfig(
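    # sampling settings: at most 100 new tokens, temperature 0.8 with nucleus (top-p) sampling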
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)

def extract_text_with_tesseract(image):
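    """Run Tesseract OCR on a PIL image and return the raw extracted text."""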
    return pytesseract.image_to_string(image)

def process_image(image):
    try:
        # 1. Caption
        caption = caption_model(image)[0]['generated_text']

        # 2. OCR
        extracted_text = extract_text_with_tesseract(image)

        # 3. Context with Doge (truncate input)
        prompt = (
            f"Determine the context of this image.\n"
            f"Caption: {caption[:200]}\nExtracted text: {extracted_text[:200]}\nContext:"
        )
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=True,
            return_tensors="pt"
        ).to(device)

        doge_output = doge_model.generate(
            input_ids=doge_inputs,
            generation_config=doge_generation_config
        )
        # generate() returns the prompt plus the continuation; keep only the newly generated tokens
        context = doge_tokenizer.decode(
            doge_output[0][doge_inputs.shape[-1]:], skip_special_tokens=True
        ).strip()

        # 4. Text-to-Speech (no embeddings)
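        # the TTS pipeline returns {"audio": ndarray, "sampling_rate": int}; gr.Audio takes a (rate, array) tuple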
        speech = synthesiser(context)
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context

    except Exception as e:
        return None, f"Error: {str(e)}", "", ""

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer (Optimized)",
    description="Upload an image to generate a caption, extract text (OCR), generate context, and hear it spoken."
)

iface.launch(share=True)