|  | import http.client | 
					
						
						|  | import json | 
					
						
						|  | import os | 
					
						
						|  | import tempfile | 
					
						
						|  | import urllib.request | 
					
						
						|  | from typing import Tuple | 
					
						
						|  |  | 
					
						
						|  | import numpy as np | 
					
						
						|  | import requests | 
					
						
						|  | from scipy.io import wavfile | 
					
						
						|  |  | 
					
						
						|  | from TTS.utils.audio.numpy_transforms import save_wav | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class Speaker(object): | 
					
						
						|  | """Convert dict to object.""" | 
					
						
						|  |  | 
					
						
						|  | def __init__(self, d, is_voice=False): | 
					
						
						|  | self.is_voice = is_voice | 
					
						
						|  | for k, v in d.items(): | 
					
						
						|  | if isinstance(k, (list, tuple)): | 
					
						
						|  | setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v]) | 
					
						
						|  | else: | 
					
						
						|  | setattr(self, k, Speaker(v) if isinstance(v, dict) else v) | 
					
						
						|  |  | 
					
						
						|  | def __repr__(self): | 
					
						
						|  | return str(self.__dict__) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class CS_API: | 
					
						
						|  | """🐸Coqui Studio API Wrapper. | 
					
						
						|  |  | 
					
						
						|  | 🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice | 
					
						
						|  | interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different | 
					
						
						|  | characteristics. You can use these voices to generate new audio files or use them in your applications. | 
					
						
						|  | You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token. | 
					
						
						|  | You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from | 
					
						
						|  | https://app.coqui.ai/account. We can either enter the token as an environment variable as | 
					
						
						|  | `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`. | 
					
						
						|  | Visit https://app.coqui.ai/api for more information. | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable | 
					
						
						|  | `COQUI_STUDIO_TOKEN`. | 
					
						
						|  | model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | Example listing all available speakers: | 
					
						
						|  | >>> from TTS.api import CS_API | 
					
						
						|  | >>> tts = CS_API() | 
					
						
						|  | >>> tts.speakers | 
					
						
						|  |  | 
					
						
						|  | Example listing all emotions: | 
					
						
						|  | >>> # emotions are only available for `V1` model | 
					
						
						|  | >>> from TTS.api import CS_API | 
					
						
						|  | >>> tts = CS_API(model="V1") | 
					
						
						|  | >>> tts.emotions | 
					
						
						|  |  | 
					
						
						|  | Example with a built-in 🐸 speaker: | 
					
						
						|  | >>> from TTS.api import CS_API | 
					
						
						|  | >>> tts = CS_API() | 
					
						
						|  | >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name) | 
					
						
						|  | >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav") | 
					
						
						|  |  | 
					
						
						|  | Example with multi-language model: | 
					
						
						|  | >>> from TTS.api import CS_API | 
					
						
						|  | >>> tts = CS_API(model="XTTS") | 
					
						
						|  | >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | MODEL_ENDPOINTS = { | 
					
						
						|  | "V1": { | 
					
						
						|  | "list_speakers": "https://app.coqui.ai/api/v2/speakers", | 
					
						
						|  | "synthesize": "https://app.coqui.ai/api/v2/samples", | 
					
						
						|  | "list_voices": "https://app.coqui.ai/api/v2/voices", | 
					
						
						|  | }, | 
					
						
						|  | "XTTS": { | 
					
						
						|  | "list_speakers": "https://app.coqui.ai/api/v2/speakers", | 
					
						
						|  | "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", | 
					
						
						|  | "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", | 
					
						
						|  | }, | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] | 
					
						
						|  |  | 
					
						
						|  | def __init__(self, api_token=None, model="XTTS"): | 
					
						
						|  | self.api_token = api_token | 
					
						
						|  | self.model = model | 
					
						
						|  | self.headers = None | 
					
						
						|  | self._speakers = None | 
					
						
						|  | self._check_token() | 
					
						
						|  |  | 
					
						
						|  | @staticmethod | 
					
						
						|  | def ping_api(): | 
					
						
						|  | URL = "https://coqui.gateway.scarf.sh/tts/api" | 
					
						
						|  | _ = requests.get(URL) | 
					
						
						|  |  | 
					
						
						|  | @property | 
					
						
						|  | def speakers(self): | 
					
						
						|  | if self._speakers is None: | 
					
						
						|  | self._speakers = self.list_all_speakers() | 
					
						
						|  | return self._speakers | 
					
						
						|  |  | 
					
						
						|  | @property | 
					
						
						|  | def emotions(self): | 
					
						
						|  | """Return a list of available emotions. | 
					
						
						|  |  | 
					
						
						|  | TODO: Get this from the API endpoint. | 
					
						
						|  | """ | 
					
						
						|  | if self.model == "V1": | 
					
						
						|  | return ["Neutral", "Happy", "Sad", "Angry", "Dull"] | 
					
						
						|  | else: | 
					
						
						|  | raise ValueError(f"❗ Emotions are not available for {self.model}.") | 
					
						
						|  |  | 
					
						
						|  | def _check_token(self): | 
					
						
						|  | if self.api_token is None: | 
					
						
						|  | self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") | 
					
						
						|  | self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} | 
					
						
						|  | if not self.api_token: | 
					
						
						|  | raise ValueError( | 
					
						
						|  | "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n" | 
					
						
						|  | "Visit 🔗https://app.coqui.ai/account to get one.\n" | 
					
						
						|  | "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n" | 
					
						
						|  | "" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | def list_all_speakers(self): | 
					
						
						|  | """Return both built-in Coqui Studio speakers and custom voices created by the user.""" | 
					
						
						|  | return self.list_speakers() + self.list_voices() | 
					
						
						|  |  | 
					
						
						|  | def list_speakers(self): | 
					
						
						|  | """List built-in Coqui Studio speakers.""" | 
					
						
						|  | self._check_token() | 
					
						
						|  | conn = http.client.HTTPSConnection("app.coqui.ai") | 
					
						
						|  | url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] | 
					
						
						|  | conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) | 
					
						
						|  | res = conn.getresponse() | 
					
						
						|  | data = res.read() | 
					
						
						|  | return [Speaker(s) for s in json.loads(data)["result"]] | 
					
						
						|  |  | 
					
						
						|  | def list_voices(self): | 
					
						
						|  | """List custom voices created by the user.""" | 
					
						
						|  | conn = http.client.HTTPSConnection("app.coqui.ai") | 
					
						
						|  | url = self.MODEL_ENDPOINTS[self.model]["list_voices"] | 
					
						
						|  | conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) | 
					
						
						|  | res = conn.getresponse() | 
					
						
						|  | data = res.read() | 
					
						
						|  | return [Speaker(s, True) for s in json.loads(data)["result"]] | 
					
						
						|  |  | 
					
						
						|  | def list_speakers_as_tts_models(self): | 
					
						
						|  | """List speakers in ModelManager format.""" | 
					
						
						|  | models = [] | 
					
						
						|  | for speaker in self.speakers: | 
					
						
						|  | model = f"coqui_studio/multilingual/{speaker.name}/{self.model}" | 
					
						
						|  | models.append(model) | 
					
						
						|  | return models | 
					
						
						|  |  | 
					
						
						|  | def name_to_speaker(self, name): | 
					
						
						|  | for speaker in self.speakers: | 
					
						
						|  | if speaker.name == name: | 
					
						
						|  | return speaker | 
					
						
						|  | raise ValueError(f"Speaker {name} not found in {self.speakers}") | 
					
						
						|  |  | 
					
						
						|  | def id_to_speaker(self, speaker_id): | 
					
						
						|  | for speaker in self.speakers: | 
					
						
						|  | if speaker.id == speaker_id: | 
					
						
						|  | return speaker | 
					
						
						|  | raise ValueError(f"Speaker {speaker_id} not found.") | 
					
						
						|  |  | 
					
						
						|  | @staticmethod | 
					
						
						|  | def url_to_np(url): | 
					
						
						|  | tmp_file, _ = urllib.request.urlretrieve(url) | 
					
						
						|  | rate, data = wavfile.read(tmp_file) | 
					
						
						|  | return data, rate | 
					
						
						|  |  | 
					
						
						|  | @staticmethod | 
					
						
						|  | def _create_payload(model, text, speaker, speed, emotion, language): | 
					
						
						|  | payload = {} | 
					
						
						|  |  | 
					
						
						|  | payload["voice_id"] = speaker.id | 
					
						
						|  |  | 
					
						
						|  | payload["speaker_id"] = speaker.id | 
					
						
						|  |  | 
					
						
						|  | if model == "V1": | 
					
						
						|  | payload.update( | 
					
						
						|  | { | 
					
						
						|  | "emotion": emotion, | 
					
						
						|  | "name": speaker.name, | 
					
						
						|  | "text": text, | 
					
						
						|  | "speed": speed, | 
					
						
						|  | } | 
					
						
						|  | ) | 
					
						
						|  | elif model == "XTTS": | 
					
						
						|  | payload.update( | 
					
						
						|  | { | 
					
						
						|  | "name": speaker.name, | 
					
						
						|  | "text": text, | 
					
						
						|  | "speed": speed, | 
					
						
						|  | "language": language, | 
					
						
						|  | } | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  | raise ValueError(f"❗ Unknown model {model}") | 
					
						
						|  | return payload | 
					
						
						|  |  | 
					
						
						|  | def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language): | 
					
						
						|  | assert text is not None, "❗ text is required for V1 model." | 
					
						
						|  | assert speaker_name is not None, "❗ speaker_name is required for V1 model." | 
					
						
						|  | if self.model == "V1": | 
					
						
						|  | if emotion is None: | 
					
						
						|  | emotion = "Neutral" | 
					
						
						|  | assert language is None, "❗ language is not supported for V1 model." | 
					
						
						|  | elif self.model == "XTTS": | 
					
						
						|  | assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." | 
					
						
						|  | assert language is not None, "❗ Language is required for XTTS model." | 
					
						
						|  | assert ( | 
					
						
						|  | language in self.SUPPORTED_LANGUAGES | 
					
						
						|  | ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." | 
					
						
						|  | return text, speaker_name, speaker_id, emotion, speed, language | 
					
						
						|  |  | 
					
						
						|  | def tts( | 
					
						
						|  | self, | 
					
						
						|  | text: str, | 
					
						
						|  | speaker_name: str = None, | 
					
						
						|  | speaker_id=None, | 
					
						
						|  | emotion=None, | 
					
						
						|  | speed=1.0, | 
					
						
						|  | language=None, | 
					
						
						|  | ) -> Tuple[np.ndarray, int]: | 
					
						
						|  | """Synthesize speech from text. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | text (str): Text to synthesize. | 
					
						
						|  | speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and | 
					
						
						|  | voices (user generated speakers) with `list_voices()`. | 
					
						
						|  | speaker_id (str): Speaker ID. If None, the speaker name is used. | 
					
						
						|  | emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only | 
					
						
						|  | supported by `V1` model. Defaults to None. | 
					
						
						|  | speed (float): Speed of the speech. 1.0 is normal speed. | 
					
						
						|  | language (str): Language of the text. If None, the default language of the speaker is used. Language is only | 
					
						
						|  | supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. | 
					
						
						|  | """ | 
					
						
						|  | self._check_token() | 
					
						
						|  | self.ping_api() | 
					
						
						|  |  | 
					
						
						|  | if speaker_name is None and speaker_id is None: | 
					
						
						|  | raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") | 
					
						
						|  | if speaker_id is None: | 
					
						
						|  | speaker = self.name_to_speaker(speaker_name) | 
					
						
						|  | else: | 
					
						
						|  | speaker = self.id_to_speaker(speaker_id) | 
					
						
						|  |  | 
					
						
						|  | text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args( | 
					
						
						|  | text, speaker_name, speaker_id, emotion, speed, language | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | conn = http.client.HTTPSConnection("app.coqui.ai") | 
					
						
						|  | payload = self._create_payload(self.model, text, speaker, speed, emotion, language) | 
					
						
						|  | url = self.MODEL_ENDPOINTS[self.model]["synthesize"] | 
					
						
						|  | conn.request("POST", url, json.dumps(payload), self.headers) | 
					
						
						|  | res = conn.getresponse() | 
					
						
						|  | data = res.read() | 
					
						
						|  | try: | 
					
						
						|  | wav, sr = self.url_to_np(json.loads(data)["audio_url"]) | 
					
						
						|  | except KeyError as e: | 
					
						
						|  | raise ValueError(f" [!] 🐸 API returned error: {data}") from e | 
					
						
						|  | return wav, sr | 
					
						
						|  |  | 
					
						
						|  | def tts_to_file( | 
					
						
						|  | self, | 
					
						
						|  | text: str, | 
					
						
						|  | speaker_name: str, | 
					
						
						|  | speaker_id=None, | 
					
						
						|  | emotion=None, | 
					
						
						|  | speed=1.0, | 
					
						
						|  | pipe_out=None, | 
					
						
						|  | language=None, | 
					
						
						|  | file_path: str = None, | 
					
						
						|  | ) -> str: | 
					
						
						|  | """Synthesize speech from text and save it to a file. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | text (str): Text to synthesize. | 
					
						
						|  | speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and | 
					
						
						|  | voices (user generated speakers) with `list_voices()`. | 
					
						
						|  | speaker_id (str): Speaker ID. If None, the speaker name is used. | 
					
						
						|  | emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". | 
					
						
						|  | speed (float): Speed of the speech. 1.0 is normal speed. | 
					
						
						|  | pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. | 
					
						
						|  | language (str): Language of the text. If None, the default language of the speaker is used. Language is only | 
					
						
						|  | supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". | 
					
						
						|  | file_path (str): Path to save the file. If None, a temporary file is created. | 
					
						
						|  | """ | 
					
						
						|  | if file_path is None: | 
					
						
						|  | file_path = tempfile.mktemp(".wav") | 
					
						
						|  | wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) | 
					
						
						|  | save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out) | 
					
						
						|  | return file_path | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | import time | 
					
						
						|  |  | 
					
						
						|  | api = CS_API() | 
					
						
						|  | print(api.speakers) | 
					
						
						|  | print(api.list_speakers_as_tts_models()) | 
					
						
						|  |  | 
					
						
						|  | ts = time.time() | 
					
						
						|  | wav, sr = api.tts( | 
					
						
						|  | "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name | 
					
						
						|  | ) | 
					
						
						|  | print(f" [i] XTTS took {time.time() - ts:.2f}s") | 
					
						
						|  |  | 
					
						
						|  | filepath = api.tts_to_file( | 
					
						
						|  | text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav" | 
					
						
						|  | ) | 
					
						
						|  |  |