Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

shukdevdatta123 committed 14 days ago

Commit

3220f5e

verified ·

1 Parent(s): 7efcfa3

Create app.py

Browse files

Files changed (1) hide show

app.py +446 -0

app.py ADDED Viewed

	@@ -0,0 +1,446 @@

+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+import tempfile
+import os
+from pathlib import Path
+import librosa
+import soundfile as sf
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import warnings
+import gc
+warnings.filterwarnings("ignore")
+class VoiceCloningTTS:
+    def __init__(self):
+        """Initialize the TTS system with SpeechT5 model"""
+        # Use CPU for HF Spaces to avoid memory issues
+        self.device = torch.device("cpu")
+        print(f"Using device: {self.device}")
+        try:
+            # Load SpeechT5 models with memory optimization
+            print("Loading SpeechT5 processor...")
+            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            print("Loading SpeechT5 TTS model...")
+            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+            self.model.to(self.device)
+            self.model.eval()  # Set to evaluation mode
+            print("Loading SpeechT5 vocoder...")
+            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+            self.vocoder.to(self.device)
+            self.vocoder.eval()
+            # Load default speaker embeddings
+            print("Loading speaker embeddings...")
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            self.default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
+            self.user_speaker_embeddings = None
+            self.sample_rate = 16000
+            print("✅ TTS system initialized successfully!")
+        except Exception as e:
+            print(f"❌ Error initializing TTS system: {str(e)}")
+            raise e
+    def extract_speaker_embedding(self, audio_path):
+        """Extract speaker embedding from uploaded audio"""
+        try:
+            print(f"Processing audio file: {audio_path}")
+            # Load and preprocess audio
+            waveform, sample_rate = torchaudio.load(audio_path)
+            print(f"Original audio shape: {waveform.shape}, sample rate: {sample_rate}")
+            # Resample if necessary
+            if sample_rate != self.sample_rate:
+                print(f"Resampling from {sample_rate} to {self.sample_rate}")
+                resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
+                waveform = resampler(waveform)
+            # Convert to mono if stereo
+            if waveform.shape[0] > 1:
+                waveform = torch.mean(waveform, dim=0, keepdim=True)
+                print("Converted to mono")
+            # Ensure minimum length (at least 1 second)
+            min_length = self.sample_rate
+            if waveform.shape[1] < min_length:
+                # Pad with zeros if too short
+                padding = min_length - waveform.shape[1]
+                waveform = torch.nn.functional.pad(waveform, (0, padding))
+                print(f"Padded audio to minimum length")
+            # Limit maximum length (30 seconds max for memory efficiency)
+            max_length = 30 * self.sample_rate
+            if waveform.shape[1] > max_length:
+                waveform = waveform[:, :max_length]
+                print("Truncated audio to 30 seconds")
+            # Normalize audio
+            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
+            # Convert to numpy for librosa processing
+            audio_numpy = waveform.squeeze().numpy()
+            print("Extracting audio features...")
+            # Extract comprehensive audio features
+            try:
+                # MFCC features (mel-frequency cepstral coefficients)
+                mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
+                mfcc_mean = np.mean(mfccs, axis=1)
+                mfcc_std = np.std(mfccs, axis=1)
+                # Spectral features
+                spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
+                spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
+                spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
+                zero_crossing_rate = librosa.feature.zero_crossing_rate(audio_numpy)
+                # Pitch features
+                pitches, magnitudes = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
+                pitch_mean = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
+                # Chroma features
+                chroma = librosa.feature.chroma_stft(y=audio_numpy, sr=self.sample_rate)
+                chroma_mean = np.mean(chroma, axis=1)
+                # Combine all features
+                features = np.concatenate([
+                    mfcc_mean,
+                    mfcc_std,
+                    [np.mean(spectral_centroids)],
+                    [np.mean(spectral_rolloff)],
+                    [np.mean(spectral_bandwidth)],
+                    [np.mean(zero_crossing_rate)],
+                    [pitch_mean],
+                    chroma_mean
+                ])
+                print(f"Extracted {len(features)} audio features")
+            except Exception as e:
+                print(f"Error extracting features: {e}")
+                # Simple fallback feature extraction
+                features = np.array([
+                    np.mean(audio_numpy),
+                    np.std(audio_numpy),
+                    np.max(audio_numpy),
+                    np.min(audio_numpy)
+                ])
+            # Create speaker embedding by modifying the default embedding
+            base_embedding = self.default_speaker_embeddings.clone()
+            # Normalize features
+            features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
+            # Create modification vector (pad or truncate to match embedding size)
+            embedding_size = base_embedding.shape[1]  # Should be 512
+            if len(features_normalized) > embedding_size:
+                modification_vector = features_normalized[:embedding_size]
+            else:
+                modification_vector = np.pad(features_normalized,
+                                           (0, embedding_size - len(features_normalized)),
+                                           'constant', constant_values=0)
+            modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
+            # Apply modifications to create unique speaker embedding
+            # Use a smaller modification factor for stability
+            speaker_embedding = base_embedding + 0.05 * modification_tensor.unsqueeze(0)
+            # Normalize the final embedding
+            speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
+            print("✅ Speaker embedding created successfully!")
+            return speaker_embedding, "✅ Voice profile extracted successfully! You can now generate speech in this voice."
+        except Exception as e:
+            print(f"❌ Error in extract_speaker_embedding: {str(e)}")
+            return None, f"❌ Error processing audio: {str(e)}"
+    def synthesize_speech(self, text, use_cloned_voice=True):
+        """Convert text to speech using the specified voice"""
+        try:
+            if not text.strip():
+                return None, "❌ Please enter some text to convert."
+            # Limit text length for memory efficiency
+            if len(text) > 500:
+                text = text[:500]
+                print("Text truncated to 500 characters for memory efficiency")
+            print(f"Synthesizing speech for text: '{text[:50]}...'")
+            # Choose speaker embedding
+            if use_cloned_voice and self.user_speaker_embeddings is not None:
+                speaker_embeddings = self.user_speaker_embeddings
+                voice_type = "your cloned voice"
+                print("Using cloned voice")
+            else:
+                speaker_embeddings = self.default_speaker_embeddings
+                voice_type = "default voice"
+                print("Using default voice")
+            # Tokenize text
+            inputs = self.processor(text=text, return_tensors="pt")
+            input_ids = inputs["input_ids"].to(self.device)
+            print("Generating speech...")
+            # Generate speech with memory optimization
+            with torch.no_grad():
+                # Clear cache before generation
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                speech = self.model.generate_speech(
+                    input_ids,
+                    speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+            # Convert to numpy
+            speech_numpy = speech.cpu().numpy()
+            print(f"Generated audio shape: {speech_numpy.shape}")
+            # Create temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                sf.write(tmp_file.name, speech_numpy, self.sample_rate)
+                print(f"Audio saved to: {tmp_file.name}")
+                # Clean up memory
+                del speech, input_ids
+                gc.collect()
+                return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
+        except Exception as e:
+            print(f"❌ Error in synthesize_speech: {str(e)}")
+            return None, f"❌ Error generating speech: {str(e)}"
+# Initialize the TTS system.
+# This runs at import time and builds the single VoiceCloningTTS instance that
+# every handler function below reads and writes.
+# NOTE(review): the instance (including user_speaker_embeddings) is module-level,
+# so the stored voice profile appears to be shared by everyone using the app - confirm.
+print("🚀 Initializing Voice Cloning TTS System...")
+tts_system = VoiceCloningTTS()
+def process_voice_upload(audio_file):
+    """Process uploaded voice file"""
+    if audio_file is None:
+        return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
+    try:
+        speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
+        if speaker_embedding is not None:
+            tts_system.user_speaker_embeddings = speaker_embedding
+            return message, gr.update(interactive=True), gr.update(interactive=True)
+        else:
+            return message, gr.update(interactive=False), gr.update(interactive=False)
+    except Exception as e:
+        error_msg = f"❌ Error processing audio: {str(e)}"
+        return error_msg, gr.update(interactive=False), gr.update(interactive=False)
+def generate_speech(text, use_cloned_voice):
+    """Generate speech from text"""
+    if not text.strip():
+        return None, "❌ Please enter some text to convert."
+    try:
+        audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
+        return audio_file, message
+    except Exception as e:
+        error_msg = f"❌ Error generating speech: {str(e)}"
+        return None, error_msg
+def clear_voice_profile():
+    """Clear the uploaded voice profile"""
+    tts_system.user_speaker_embeddings = None
+    return ("🔄 Voice profile cleared. Upload a new audio file to clone a voice.",
+            gr.update(interactive=False),
+            gr.update(interactive=False))
+def update_generate_button(text, use_cloned):
+    """Update generate button state based on inputs"""
+    text_ready = bool(text.strip())
+    voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
+    return gr.update(interactive=text_ready and voice_ready)
+# Create Gradio interface optimized for HF Spaces.
+# Layout: header, a two-column row (voice upload | text + options), the output
+# row, a static tips panel, and finally the event wiring.
+with gr.Blocks(
+    title="🎤 Voice Cloning TTS System",
+    theme=gr.themes.Soft(),
+    # Custom CSS for the classes used by the gr.HTML snippets below
+    # (.header, .step-box, .tips-box) plus a width cap on the page container.
+    css="""
+    .gradio-container {
+        max-width: 1000px !important;
+        margin: auto !important;
+    }
+    .header {
+        text-align: center;
+        margin-bottom: 30px;
+        padding: 20px;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 15px;
+        color: white;
+    }
+    .step-box {
+        border: 2px solid #e1e5e9;
+        border-radius: 12px;
+        padding: 20px;
+        margin: 15px 0;
+        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .tips-box {
+        background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+        border-radius: 12px;
+        padding: 20px;
+        margin: 20px 0;
+        border-left: 5px solid #ff6b6b;
+    }
+    """
+) as demo:
+    # Page header banner.
+    gr.HTML("""
+    <div class="header">
+        <h1>🎤 AI Voice Cloning TTS System</h1>
+        <p>🚀 Upload your voice sample and convert any text to speech in YOUR voice!</p>
+        <p>✨ Powered by Microsoft SpeechT5 & Advanced Voice Analysis</p>
+    </div>
+    """)
+    with gr.Row():
+        # Left column: voice sample input, analysis status and the clear button.
+        with gr.Column(scale=1):
+            gr.HTML('<div class="step-box"><h3>🎙️ Step 1: Upload Your Voice Sample</h3><p>Record or upload 10-30 seconds of clear English speech</p></div>')
+            # type="filepath" hands the handler a path on disk rather than raw samples.
+            voice_upload = gr.Audio(
+                label="📤 Voice Sample (English)",
+                type="filepath",
+                sources=["upload", "microphone"],
+                format="wav"
+            )
+            upload_status = gr.Textbox(
+                label="📊 Voice Analysis Status",
+                interactive=False,
+                value="⏳ Please upload an audio file to extract your voice profile.",
+                lines=2
+            )
+            clear_btn = gr.Button("🗑️ Clear Voice Profile", variant="secondary", size="sm")
+        # Right column: text to speak, voice choice and the generate button.
+        with gr.Column(scale=1):
+            gr.HTML('<div class="step-box"><h3>✍️ Step 2: Enter Your Text</h3><p>Type the text you want to convert to speech</p></div>')
+            text_input = gr.Textbox(
+                label="📝 Text to Convert (Max 500 characters)",
+                placeholder="Enter the text you want to convert to speech using your cloned voice...",
+                lines=5,
+                max_lines=8
+            )
+            # Starts checked but locked; process_voice_upload unlocks it after a
+            # successful extraction.
+            # NOTE(review): because it is locked while checked, the default voice
+            # looks unreachable until a sample has been uploaded - confirm intent.
+            use_cloned_voice = gr.Checkbox(
+                label="🎭 Use My Cloned Voice",
+                value=True,
+                interactive=False,
+                info="Uncheck to use default voice"
+            )
+            # Starts disabled; enabled by the handlers wired at the bottom.
+            generate_btn = gr.Button(
+                "🎵 Generate Speech",
+                variant="primary",
+                interactive=False,
+                size="lg"
+            )
+    gr.HTML('<div class="step-box"><h3>🔊 Step 3: Your Generated Speech</h3></div>')
+    # Output area: generated audio plus a status line.
+    with gr.Row():
+        with gr.Column():
+            output_audio = gr.Audio(
+                label="🎧 Generated Speech Audio",
+                type="filepath",
+                interactive=False
+            )
+            generation_status = gr.Textbox(
+                label="⚡ Generation Status",
+                interactive=False,
+                lines=2
+            )
+    # Tips and information section (static HTML, no behavior attached)
+    gr.HTML("""
+    <div class="tips-box">
+        <h3>💡 Pro Tips for Best Results:</h3>
+        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
+            <div>
+                <h4>🎤 Voice Sample Quality:</h4>
+                <ul>
+                    <li>Use clear, natural English speech</li>
+                    <li>10-30 seconds duration is optimal</li>
+                    <li>Minimize background noise</li>
+                    <li>Speak at normal pace and volume</li>
+                </ul>
+            </div>
+            <div>
+                <h4>📝 Text Guidelines:</h4>
+                <ul>
+                    <li>English text works best</li>
+                    <li>Keep sentences natural and clear</li>
+                    <li>Avoid very long paragraphs</li>
+                    <li>Punctuation helps with intonation</li>
+                </ul>
+            </div>
+        </div>
+        <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.7); border-radius: 8px;">
+            <strong>🔬 How it works:</strong> The system analyzes your voice's unique characteristics (pitch, tone, formants)
+            and creates a personalized voice profile that's used to generate speech that sounds like you!
+        </div>
+    </div>
+    """)
+    # Event handlers.
+    # A new or removed voice sample re-runs extraction and updates the status,
+    # the checkbox and the generate button.
+    voice_upload.change(
+        fn=process_voice_upload,
+        inputs=[voice_upload],
+        outputs=[upload_status, use_cloned_voice, generate_btn]
+    )
+    # Editing the text or toggling the checkbox re-evaluates whether the
+    # generate button may be pressed.
+    text_input.change(
+        fn=update_generate_button,
+        inputs=[text_input, use_cloned_voice],
+        outputs=[generate_btn]
+    )
+    use_cloned_voice.change(
+        fn=update_generate_button,
+        inputs=[text_input, use_cloned_voice],
+        outputs=[generate_btn]
+    )
+    # Generate: synthesize the text and show the audio file and status message.
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[text_input, use_cloned_voice],
+        outputs=[output_audio, generation_status]
+    )
+    # Clear: drop the stored profile and lock the checkbox and generate button.
+    clear_btn.click(
+        fn=clear_voice_profile,
+        outputs=[upload_status, use_cloned_voice, generate_btn]
+    )
+# Launch configuration for Hugging Face Spaces
+if __name__ == "__main__":
+    print("🌟 Starting Voice Cloning TTS System on Hugging Face Spaces...")
+    demo.launch(
+        share=True  # HF Spaces handles sharing automatically
+    )