Update app.py
app.py CHANGED
@@ -19,13 +19,8 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 # Load image captioning model (BLIP)
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

-# Load TTS model (SpeechT5)
+# Load TTS model (SpeechT5)
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-response = requests.get(SPEAKER_EMBEDDING_URL)
-buffer = io.BytesIO(response.content)
-speaker_embedding = torch.load(buffer)
-speaker_embedding = speaker_embedding.unsqueeze(0)

 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
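
Assembled from the context and added lines above, the model-loading section reads roughly as follows after this commit. This is a sketch, not the full file: the import statements and the exact form of device are outside the diff and assumed here, while the torch_dtype line is taken verbatim from the hunk header.

import torch
from transformers import pipeline, AutoModelForCausalLM

# Assumed from the surrounding app.py; only the torch_dtype line appears in the hunk header.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load image captioning model (BLIP)
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load TTS model (SpeechT5); the manual speaker-embedding download is gone
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Load Florence-2-base for OCR
ocr_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True
).to(device)
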
@@ -76,11 +71,8 @@ def process_image(image):

     context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)

-    # Text-to-speech
-    speech = synthesiser(
-        context,
-        forward_params={"speaker_embeddings": speaker_embedding}
-    )
+    # Text-to-speech (no speaker embedding required)
+    speech = synthesiser(context)

     audio = np.array(speech["audio"])
     rate = speech["sampling_rate"]
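
If a fixed voice is still wanted, the forward_params route used by the removed lines remains available. A minimal sketch, reusing the synthesiser pipeline and context string from app.py; the zero tensor is only a placeholder for whatever 512-dimensional x-vector-style embedding is actually loaded (the old code downloaded one and unsqueezed it to a batch dimension):

import torch

# Placeholder only: substitute a real (1, 512) speaker embedding here.
speaker_embedding = torch.zeros((1, 512))

speech = synthesiser(
    context,
    forward_params={"speaker_embeddings": speaker_embedding},
)
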
@@ -100,8 +92,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay
+    title="SeeSay",
     description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )

-iface.launch(share=True)
+iface.launch(share=True)
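
The previous title="SeeSay line was an unterminated string literal, so this hunk also repairs a syntax error. With the closing quote and trailing comma restored, the tail of the gr.Interface(...) call parses again; the sketch below uses only the pieces visible in this hunk, while fn, inputs, and any earlier outputs entries sit outside the diff and are assumptions:

import gradio as gr

iface = gr.Interface(
    fn=process_image,             # assumed wiring; not shown in this hunk
    inputs=gr.Image(type="pil"),  # assumed input; not shown in this hunk
    outputs=[
        # ... earlier output components defined elsewhere in app.py ...
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
)

iface.launch(share=True)
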