Update app.py
app.py
CHANGED
@@ -4,13 +4,13 @@ from transformers import (
     AutoProcessor,
     AutoModelForCausalLM,
     AutoTokenizer,
-    GenerationConfig,
-    TextStreamer
+    GenerationConfig
 )
 import torch
 import numpy as np
 from PIL import Image
 import requests
+import io
 
 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,16 +21,17 @@ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captionin
 
 # Load TTS model (SpeechT5) and static speaker embedding
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-
 SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-
+response = requests.get(SPEAKER_EMBEDDING_URL)
+buffer = io.BytesIO(response.content)
+speaker_embedding = torch.load(buffer)
 speaker_embedding = speaker_embedding.unsqueeze(0)
 
 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 
-# Load Doge-320M-Instruct
+# Load Doge-320M-Instruct for contextual reasoning
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
 
@@ -45,10 +46,10 @@ generation_config = GenerationConfig(
 
 def process_image(image):
     try:
-        #
+        # Captioning
         caption = caption_model(image)[0]['generated_text']
 
-        #
+        # OCR
         inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
@@ -59,7 +60,7 @@ def process_image(image):
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        #
+        # Context generation
         prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
@@ -75,7 +76,7 @@ def process_image(image):
 
         context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-        #
+        # Text-to-speech
        speech = synthesiser(
             context,
             forward_params={"speaker_embeddings": speaker_embedding}
@@ -89,7 +90,7 @@ def process_image(image):
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
 
-# Gradio
+# Gradio UI
 iface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -100,7 +101,7 @@ iface = gr.Interface(
         gr.Textbox(label="Generated Context")
     ],
     title="SeeSay Contextualizer with Doge-320M & Florence-2-base",
-    description="Upload an image to generate a caption, extract text,
+    description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )
 
-iface.launch(share=True)
+iface.launch(share=True)
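Editor's notes on the change follow. The download of spkemb_female.pt is the heart of this commit. A minimal sketch of a slightly hardened version is below, with the x-vector fallback used in the Hugging Face SpeechT5 example; the timeout, raise_for_status(), map_location, and the Matthijs/cmu-arctic-xvectors fallback are editorial suggestions, not part of the committed code.

# Hedged sketch, not part of the commit: hardened download of the speaker
# embedding, with the dataset-based fallback from the SpeechT5 documentation.
import io
import requests
import torch

SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"

try:
    response = requests.get(SPEAKER_EMBEDDING_URL, timeout=30)
    response.raise_for_status()  # fail loudly on a 404 instead of torch.load erroring later
    speaker_embedding = torch.load(io.BytesIO(response.content), map_location="cpu")
except Exception:
    # Fallback used in the Hugging Face SpeechT5 example: a 512-dim x-vector
    from datasets import load_dataset
    ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(ds[7306]["xvector"])

speaker_embedding = speaker_embedding.unsqueeze(0)  # shape (1, 512) expected by SpeechT5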
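The OCR hunk only shows part of the ocr_model.generate() call. For reference, a sketch of the usage pattern from the Florence-2-base model card is below; it reuses the ocr_processor, ocr_model, device, and torch_dtype names from this file, and the pixel_values / post_process_generation steps are the model card's pattern, not necessarily what the unchanged parts of this Space do.

# Hedged sketch following the Florence-2-base model card, not this Space's exact code.
def run_ocr(image):
    # The OCR task prompt needs both input_ids and pixel_values
    inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = ocr_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # The model card decodes with special tokens kept, then post-processes per task
    generated_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed = ocr_processor.post_process_generation(
        generated_text, task="<OCR>", image_size=(image.width, image.height)
    )
    return parsed["<OCR>"]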
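Finally, the diff does not show how process_image returns the synthesised audio. A hedged sketch of the usual way a SpeechT5 pipeline result is handed to a gr.Audio output is below; the helper name and the (sample_rate, waveform) ordering are assumptions, not code from this commit.

# Hedged sketch: converting the text-to-speech pipeline output into the
# (sample_rate, waveform) tuple that gr.Audio accepts.
def synthesise_for_gradio(text: str):
    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # The pipeline returns {"audio": np.ndarray, "sampling_rate": int}
    return speech["sampling_rate"], speech["audio"]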
|