Spaces:

bobpopboom
/

chaty

Sleeping

hashhac committed on Mar 15

Commit

35e9187

1 Parent(s): 3f5ef1f

have int16 outputs

Files changed (1) hide show

app.py CHANGED Viewed

@@ -90,18 +90,22 @@ def text_to_speech(text):
     # Generate speech with SpeechT5
     with torch.no_grad():
         # Generate speech
         speech = tts_model.generate_speech(
             inputs["input_ids"].to(device),
-            speaker_embeddings.to(device),
             vocoder=tts_vocoder
         )
     # Convert to numpy array
-    audio_array = speech.cpu().numpy()
-    # Normalize and convert to int16
-    audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
     # Reshape for fastrtc
     audio_array = audio_array.reshape(1, -1)

     # Generate speech with SpeechT5
     with torch.no_grad():
+        # Convert speaker embeddings to correct dtype and move to device
+        speaker_embeddings_device = speaker_embeddings.to(device).to(torch_dtype)
         # Generate speech
         speech = tts_model.generate_speech(
             inputs["input_ids"].to(device),
+            speaker_embeddings_device,
             vocoder=tts_vocoder
         )
     # Convert to numpy array
+    # Make sure speech is float32 before any conversion to avoid the error
+    audio_array = speech.cpu().numpy().astype(np.float32)
+    # Normalize and convert to int16 for output
+    audio_array = (audio_array / np.max(np.abs(audio_array) + 1e-6) * 32767).astype(np.int16)
     # Reshape for fastrtc
     audio_array = audio_array.reshape(1, -1)