# tts_engine.py - TTS engine wrapper for CPU-friendly SpeechT5
import logging
import os
from typing import Optional

import soundfile as sf
import torch

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset  # Pre-computed speaker x-vectors (CMU ARCTIC)

logger = logging.getLogger(__name__)

class CPUMultiSpeakerTTS:
    def __init__(self):
        self.processor = None
        self.model = None
        self.vocoder = None
        self.speaker_embeddings = {} # Will store speaker embeddings for S1, S2 etc.
        self._initialize_model()
    
    def _initialize_model(self):
        """Initialize the SpeechT5 model and vocoder on CPU."""
        try:
            logger.info("Initializing SpeechT5 model for CPU...")
            
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            
            # Ensure all components are on CPU explicitly
            self.model.to("cpu")
            self.vocoder.to("cpu")
            logger.info("SpeechT5 model and vocoder initialized successfully on CPU.")

            # Load speaker embeddings for multiple voices
            logger.info("Loading CMU ARCTIC x-vectors for speaker embeddings...")
            # "Matthijs/cmu-arctic-xvectors" holds pre-computed x-vector speaker
            # embeddings for the CMU ARCTIC speakers and is the dataset used in
            # the official SpeechT5 examples. Each row is one utterance and rows
            # are ordered by speaker, so adjacent indices (e.g. 0 and 1) would
            # give two nearly identical voices.
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

            # Map 'S1' and 'S2' to two distinct speakers. Index 0 falls in the
            # 'awb' (male) range; index 7306 is the 'slt' (female) embedding used
            # in the SpeechT5 documentation. The speaker behind any index can be
            # read from that row's "filename" field.
            self.speaker_embeddings["S1"] = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)     # 'awb', male
            self.speaker_embeddings["S2"] = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)  # 'slt', female

            # Ensure embeddings are also on CPU
            self.speaker_embeddings["S1"] = self.speaker_embeddings["S1"].to("cpu")
            self.speaker_embeddings["S2"] = self.speaker_embeddings["S2"].to("cpu")

            logger.info("Speaker embeddings loaded for S1 and S2.")
            
        except Exception as e:
            logger.error(f"Failed to initialize TTS model (SpeechT5): {e}", exc_info=True)
            self.processor = None
            self.model = None
            self.vocoder = None
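
    # A small convenience helper (an illustrative sketch, not part of the original
    # interface): it registers an extra named voice by dataset row index, following
    # the same x-vector pattern used for S1/S2 in _initialize_model. The method
    # name and signature are assumptions; it also reloads the dataset on each call,
    # which is fine for a one-off but worth caching in real use.
    def add_speaker(self, name: str, dataset_index: int) -> None:
        """Register an additional voice from the CMU ARCTIC x-vector dataset."""
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        xvector = torch.tensor(embeddings_dataset[dataset_index]["xvector"]).unsqueeze(0)
        self.speaker_embeddings[name] = xvector.to("cpu")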
    
    def synthesize_segment(
        self, 
        text: str, 
        speaker: str, # This will be 'S1' or 'S2' from segmenter
        output_path: str
    ) -> Optional[str]:
        """
        Synthesize speech for a text segment using SpeechT5.
        
        Args:
            text: Text to synthesize
            speaker: Speaker identifier ('S1' or 'S2' expected from segmenter)
            output_path: Path to save the audio file
            
        Returns:
            Path to the generated audio file, or None if failed
        """
        if not self.model or not self.processor or not self.vocoder:
            logger.error("SpeechT5 model, processor, or vocoder not initialized. Cannot synthesize speech.")
            return None
        
        try:
            # Get the correct speaker embedding
            speaker_embedding = self.speaker_embeddings.get(speaker)
            if speaker_embedding is None:
                logger.warning(f"Speaker '{speaker}' not found in pre-loaded embeddings. Defaulting to S1.")
                speaker_embedding = self.speaker_embeddings["S1"] # Fallback to S1
            
            logger.info(f"Synthesizing text for speaker {speaker}: {text[:100]}...")
            
            # Prepare inputs
            inputs = self.processor(text=text, return_tensors="pt")
            
            # Ensure inputs are on CPU
            inputs = {k: v.to("cpu") for k, v in inputs.items()}

            with torch.no_grad():
                # generate_speech produces a mel spectrogram; passing the vocoder
                # makes it return the final waveform tensor directly.
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    speaker_embedding,  # This x-vector selects the voice
                    vocoder=self.vocoder
                )
                
                audio_waveform = speech.cpu().numpy().squeeze()
            
            # Sampling rate from the vocoder config (16 kHz for SpeechT5's HiFi-GAN)
            sampling_rate = getattr(self.vocoder.config, "sampling_rate", 16000)
            
            sf.write(output_path, audio_waveform, sampling_rate)
            
            logger.info(f"Generated audio for {speaker}: {len(text)} characters to {output_path}")
            return output_path
            
        except Exception as e:
            logger.error(f"Failed to synthesize segment with SpeechT5: {e}", exc_info=True)
            return None
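

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal way to drive the wrapper end to end, assuming an 'output/' directory
# may be created in the working directory; the segment texts are made up.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    os.makedirs("output", exist_ok=True)

    tts = CPUMultiSpeakerTTS()
    segments = [
        ("S1", "Hello, and welcome to the show."),
        ("S2", "Thanks! It's great to be here."),
    ]
    for i, (speaker, text) in enumerate(segments):
        path = tts.synthesize_segment(text, speaker, f"output/segment_{i}.wav")
        print(f"Segment {i} ({speaker}): {path}")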