Update app.py
app.py CHANGED
@@ -19,13 +19,8 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 # Load image captioning model (BLIP)
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

-# Load TTS model (SpeechT5)
+# Load TTS model (SpeechT5)
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-response = requests.get(SPEAKER_EMBEDDING_URL)
-buffer = io.BytesIO(response.content)
-speaker_embedding = torch.load(buffer)
-speaker_embedding = speaker_embedding.unsqueeze(0)

 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
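
Assembled from the context and added lines above, the model-loading section reads roughly as follows after this commit. This is a sketch, not the full file: the import statements and the exact form of device are outside the diff and assumed here, while the torch_dtype line is taken verbatim from the hunk header.

import torch
from transformers import pipeline, AutoModelForCausalLM

# Assumed from the surrounding app.py; only the torch_dtype line appears in the hunk header.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load image captioning model (BLIP)
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load TTS model (SpeechT5); the manual speaker-embedding download is gone
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Load Florence-2-base for OCR
ocr_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True
).to(device)
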
@@ -76,11 +71,8 @@ def process_image(image):

     context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)

-    # Text-to-speech
-    speech = synthesiser(
-        context,
-        forward_params={"speaker_embeddings": speaker_embedding}
-    )
+    # Text-to-speech (no speaker embedding required)
+    speech = synthesiser(context)

     audio = np.array(speech["audio"])
     rate = speech["sampling_rate"]
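
If a fixed voice is still wanted, the forward_params route used by the removed lines remains available. A minimal sketch, reusing the synthesiser pipeline and context string from app.py; the zero tensor is only a placeholder for whatever 512-dimensional x-vector-style embedding is actually loaded (the old code downloaded one and unsqueezed it to a batch dimension):

import torch

# Placeholder only: substitute a real (1, 512) speaker embedding here.
speaker_embedding = torch.zeros((1, 512))

speech = synthesiser(
    context,
    forward_params={"speaker_embeddings": speaker_embedding},
)
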
@@ -100,8 +92,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay
+    title="SeeSay",
     description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )

-iface.launch(share=True)
+iface.launch(share=True)
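
The previous title="SeeSay line was an unterminated string literal, so this hunk also repairs a syntax error. With the closing quote and trailing comma restored, the tail of the gr.Interface(...) call parses again; the sketch below uses only the pieces visible in this hunk, while fn, inputs, and any earlier outputs entries sit outside the diff and are assumptions:

import gradio as gr

iface = gr.Interface(
    fn=process_image,             # assumed wiring; not shown in this hunk
    inputs=gr.Image(type="pil"),  # assumed input; not shown in this hunk
    outputs=[
        # ... earlier output components defined elsewhere in app.py ...
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
)

iface.launch(share=True)
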