bluenevus committed on
Commit
c10cafd
·
verified ·
1 Parent(s): aa10e55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -4,6 +4,7 @@ import numpy as np
4
  import re
5
  import torch
6
  import torchaudio
 
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
8
  from huggingface_hub import snapshot_download, login
9
  import logging
@@ -124,12 +125,16 @@ def text_to_speech(text, voice):
124
  # Convert output tensor to mel spectrogram
125
  mel = output[0].cpu()
126
 
 
 
 
 
 
127
  # Normalize the mel spectrogram
128
- mel = (mel - mel.min()) / (mel.max() - mel.min())
129
 
130
  # Convert mel spectrogram to audio using torchaudio
131
- griffin_lim = torchaudio.transforms.GriffinLim(n_fft=2048, n_iter=10)
132
- audio = griffin_lim(mel.unsqueeze(0))
133
 
134
  # Convert to numpy array and ensure it's in the correct format
135
  audio_np = audio.squeeze().numpy()
@@ -139,7 +144,6 @@ def text_to_speech(text, voice):
139
  except Exception as e:
140
  logger.error(f"Error in text_to_speech: {str(e)}")
141
  raise
142
-
143
  @spaces.GPU()
144
  def render_podcast(api_key, script, voice1, voice2, num_hosts):
145
  try:
 
4
  import re
5
  import torch
6
  import torchaudio
7
+ import torchaudio.functional as F
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
  from huggingface_hub import snapshot_download, login
10
  import logging
 
125
  # Convert output tensor to mel spectrogram
126
  mel = output[0].cpu()
127
 
128
+ # Reshape mel to match expected dimensions
129
+ n_mels = 80 # Typical number of mel bands
130
+ time_dim = mel.shape[0]
131
+ mel_reshaped = mel.view(n_mels, -1)
132
+
133
  # Normalize the mel spectrogram
134
+ mel_reshaped = (mel_reshaped - mel_reshaped.min()) / (mel_reshaped.max() - mel_reshaped.min())
135
 
136
  # Convert mel spectrogram to audio using torchaudio
137
+ audio = F.griffinlim(mel_reshaped.unsqueeze(0), n_iter=10, n_fft=2048, hop_length=512, win_length=2048)
 
138
 
139
  # Convert to numpy array and ensure it's in the correct format
140
  audio_np = audio.squeeze().numpy()
 
144
  except Exception as e:
145
  logger.error(f"Error in text_to_speech: {str(e)}")
146
  raise
 
147
  @spaces.GPU()
148
  def render_podcast(api_key, script, voice1, voice2, num_hosts):
149
  try: