Spaces:

bobpopboom
/

audio1test

Sleeping

hashhac committed on Mar 15

Commit

be00791

1 Parent(s): a70a34d

fixes

Files changed (1) hide show

app.py CHANGED Viewed

@@ -15,17 +15,23 @@ asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 # Function to convert speech to text
-def speech_to_text(audio):
-    inputs = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to(device)
     with torch.no_grad():
         logits = asr_model(inputs).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
 # Function to convert text to speech
 def text_to_speech(text):
-    inputs = processor(text, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         speech = tts_model.generate_speech(inputs)
     return speech

 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 # Function to convert speech to text
+def speech_to_text(audio_dict):
+    # Extract the audio array from the dictionary
+    audio_array = audio_dict["array"]
+    # Pass the audio array directly to the processor
+    inputs = processor(audio=audio_array, sampling_rate=16000, return_tensors="pt").input_values.to(device)
     with torch.no_grad():
         logits = asr_model(inputs).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
 # Function to convert text to speech
 def text_to_speech(text):
+    inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         speech = tts_model.generate_speech(inputs)
     return speech