Spaces:

rakhlin
/

Coqui.ai

Sleeping

App Files Community

rakhlin committed on Jun 5, 2023

Commit

f016728

1 Parent(s): ad8ece8

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

.ipynb_checkpoints/Coqui.ai-checkpoint.ipynb +381 -0
Coqui.ai.ipynb +329 -0
README.md +1 -7
app.py +160 -0
examples/arctic_a0023_bdl.wav +0 -0
examples/arctic_a0023_clb.wav +0 -0
examples/arctic_a0023_rms.wav +0 -0
examples/arctic_a0023_slt.wav +0 -0
examples/arctic_a0366_bdl.wav +0 -0
examples/arctic_a0366_rms.wav +0 -0
examples/arctic_a0407_bdl.wav +0 -0
examples/arctic_a0407_clb.wav +0 -0
examples/arctic_a0407_rms.wav +0 -0
examples/arctic_a0407_slt.wav +0 -0
examples/arctic_b0496_clb.wav +0 -0
examples/arctic_b0496_slt.wav +0 -0
examples/henry5.mp3 +0 -0
examples/hmm_i_dont_know.wav +0 -0
examples/see_in_eyes.wav +0 -0
examples/yearn_for_time.mp3 +0 -0
requirements.txt +18 -0

.ipynb_checkpoints/Coqui.ai-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,381 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6065d339",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from TTS.api import TTS\n",
+    "from TTS.utils.manage import ModelManager"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1e64dfd7",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7863\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+      " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > Using model: Tacotron2\n",
+      " > Setting up Audio Processor...\n",
+      " | > sample_rate:22050\n",
+      " | > resample:False\n",
+      " | > num_mels:80\n",
+      " | > log_func:np.log10\n",
+      " | > min_level_db:-100\n",
+      " | > frame_shift_ms:None\n",
+      " | > frame_length_ms:None\n",
+      " | > ref_level_db:20\n",
+      " | > fft_size:1024\n",
+      " | > power:1.5\n",
+      " | > preemphasis:0.0\n",
+      " | > griffin_lim_iters:60\n",
+      " | > signal_norm:True\n",
+      " | > symmetric_norm:True\n",
+      " | > mel_fmin:50.0\n",
+      " | > mel_fmax:7600.0\n",
+      " | > pitch_fmin:0.0\n",
+      " | > pitch_fmax:640.0\n",
+      " | > spec_gain:1.0\n",
+      " | > stft_pad_mode:reflect\n",
+      " | > max_norm:4.0\n",
+      " | > clip_norm:True\n",
+      " | > do_trim_silence:True\n",
+      " | > trim_db:60\n",
+      " | > do_sound_norm:False\n",
+      " | > do_amp_to_db_linear:True\n",
+      " | > do_amp_to_db_mel:True\n",
+      " | > do_rms_norm:False\n",
+      " | > db_level:None\n",
+      " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
+      " | > base:10\n",
+      " | > hop_length:256\n",
+      " | > win_length:1024\n",
+      " > Model's reduction rate `r` is set to: 2\n",
+      " > Vocoder Model: univnet\n",
+      " > Setting up Audio Processor...\n",
+      " | > sample_rate:22050\n",
+      " | > resample:False\n",
+      " | > num_mels:80\n",
+      " | > log_func:np.log10\n",
+      " | > min_level_db:-100\n",
+      " | > frame_shift_ms:None\n",
+      " | > frame_length_ms:None\n",
+      " | > ref_level_db:20\n",
+      " | > fft_size:1024\n",
+      " | > power:1.5\n",
+      " | > preemphasis:0.0\n",
+      " | > griffin_lim_iters:60\n",
+      " | > signal_norm:True\n",
+      " | > symmetric_norm:True\n",
+      " | > mel_fmin:50.0\n",
+      " | > mel_fmax:7600.0\n",
+      " | > pitch_fmin:1.0\n",
+      " | > pitch_fmax:640.0\n",
+      " | > spec_gain:1.0\n",
+      " | > stft_pad_mode:reflect\n",
+      " | > max_norm:4.0\n",
+      " | > clip_norm:True\n",
+      " | > do_trim_silence:True\n",
+      " | > trim_db:60\n",
+      " | > do_sound_norm:False\n",
+      " | > do_amp_to_db_linear:True\n",
+      " | > do_amp_to_db_mel:True\n",
+      " | > do_rms_norm:False\n",
+      " | > db_level:None\n",
+      " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
+      " | > base:10\n",
+      " | > hop_length:256\n",
+      " | > win_length:1024\n",
+      " > Generator Model: univnet_generator\n",
+      " > Discriminator Model: univnet_discriminator\n",
+      "Passing through TTS model tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+      "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+      "language: \n",
+      "speaker: \n",
+      "Using original voice\n",
+      " > Text splitted to sentences.\n",
+      "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
+      "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
+      " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
+      " > Processing time: 3.316999912261963\n",
+      " > Real-time factor: 0.38182763983344614\n",
+      "Loading TTS model from tts_models/en/ek1/tacotron2\n",
+      " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > Using model: Tacotron2\n",
+      " > Setting up Audio Processor...\n",
+      " | > sample_rate:22050\n",
+      " | > resample:False\n",
+      " | > num_mels:80\n",
+      " | > log_func:np.log10\n",
+      " | > min_level_db:-10\n",
+      " | > frame_shift_ms:None\n",
+      " | > frame_length_ms:None\n",
+      " | > ref_level_db:0\n",
+      " | > fft_size:1024\n",
+      " | > power:1.8\n",
+      " | > preemphasis:0.99\n",
+      " | > griffin_lim_iters:60\n",
+      " | > signal_norm:True\n",
+      " | > symmetric_norm:True\n",
+      " | > mel_fmin:0\n",
+      " | > mel_fmax:8000.0\n",
+      " | > pitch_fmin:1.0\n",
+      " | > pitch_fmax:640.0\n",
+      " | > spec_gain:1.0\n",
+      " | > stft_pad_mode:reflect\n",
+      " | > max_norm:4.0\n",
+      " | > clip_norm:True\n",
+      " | > do_trim_silence:True\n",
+      " | > trim_db:60\n",
+      " | > do_sound_norm:False\n",
+      " | > do_amp_to_db_linear:True\n",
+      " | > do_amp_to_db_mel:True\n",
+      " | > do_rms_norm:False\n",
+      " | > db_level:None\n",
+      " | > stats_path:None\n",
+      " | > base:10\n",
+      " | > hop_length:256\n",
+      " | > win_length:1024\n",
+      " > Model's reduction rate `r` is set to: 2\n",
+      " > Vocoder Model: wavegrad\n",
+      "Passing through TTS model tts_models/en/ek1/tacotron2\n",
+      "model: tts_models/en/ek1/tacotron2\n",
+      "language: \n",
+      "speaker: \n",
+      "Using original voice\n",
+      " > Text splitted to sentences.\n",
+      "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n"
+     ]
+    }
+   ],
+   "source": [
+    "title = \"\"\n",
+    "description = \"\"\"\"\"\"\n",
+    "article = \"\"\"\"\"\"\n",
+    "\n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "GPU = device == \"cuda\"\n",
+    "INT16MAX = np.iinfo(np.int16).max\n",
+    "\n",
+    "model_ids = ModelManager(verbose=False).list_models()\n",
+    "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+    "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+    "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+    "examples_pt = 'examples'\n",
+    "allowed_extentions = ['.mp3', '.wav']\n",
+    "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
+    "verse = \"\"\"Mary had a little lamb,\n",
+    "Its fleece was white as snow.\n",
+    "Everywhere the child went,\n",
+    "The little lamb was sure to go.\"\"\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "def on_model_tts_select(model_name, tts_var):\n",
+    "    if tts_var is None or tts_var.model_name != model_name:\n",
+    "        print(f'Loading TTS model from {model_name}')\n",
+    "        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+    "    else:\n",
+    "        print(f'Passing through TTS model {tts_var.model_name}')\n",
+    "    languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
+    "    speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
+    "    language = languages[0]\n",
+    "    speaker = speakers[0]\n",
+    "    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
+    "                gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
+    "\n",
+    "\n",
+    "def on_model_vc_select(model_name, vc_var):\n",
+    "    if vc_var is None or vc_var.model_name != model_name:\n",
+    "        print(f'Loading voice conversion model from {model_name}')\n",
+    "        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+    "    else:\n",
+    "        print(f'Passing through voice conversion model {vc_var.model_name}')\n",
+    "    return vc_var\n",
+    "\n",
+    "\n",
+    "def on_voicedropdown(x):\n",
+    "    return examples[x]\n",
+    "\n",
+    "\n",
+    "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
+    "    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
+    "        return (16000, np.zeros(0).astype(np.int16))\n",
+    "    \n",
+    "    sample_rate = tts_model.synthesizer.output_sample_rate\n",
+    "    if tts_model.is_multi_speaker:\n",
+    "        speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
+    "    print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
+    "    \n",
+    "    language = None if language == '' else language\n",
+    "    speaker = None if speaker == '' else speaker\n",
+    "    if use_original_voice:\n",
+    "        print('Using original voice')\n",
+    "        speech = tts_model.tts(text, language=language, speaker=speaker)       \n",
+    "    elif tts_model.synthesizer.tts_model.speaker_manager:\n",
+    "        print('voice cloning with the tts')\n",
+    "        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
+    "    else:\n",
+    "        print('voice cloning with the voice conversion model')\n",
+    "        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
+    "\n",
+    "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+    "    return (sample_rate, speech)\n",
+    "\n",
+    "\n",
+    "def voice_clone(vc_model, source_wav, target_wav):\n",
+    "    print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
+    "    sample_rate = vc_model.voice_converter.output_sample_rate\n",
+    "    if vc_model is None or source_wav is None or target_wav is None:\n",
+    "        return (sample_rate, np.zeros(0).astype(np.int16))\n",
+    "\n",
+    "    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
+    "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+    "    return (sample_rate, speech)\n",
+    "\n",
+    "\n",
+    "with gr.Blocks() as demo:\n",
+    "    tts_model = gr.State(None)\n",
+    "    vc_model = gr.State(None)\n",
+    "    def activate(*args):\n",
+    "        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
+    "    def deactivate(*args):\n",
+    "        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
+    "\n",
+    "    gr.Markdown(description)\n",
+    "\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        with gr.Column(scale=5, min_width=50):\n",
+    "            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
+    "        with gr.Column(scale=1, min_width=10):\n",
+    "                language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
+    "        with gr.Column(scale=1, min_width=10):\n",
+    "                speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
+    "        with gr.Column(scale=5, min_width=50):\n",
+    "            with gr.Row(equal_height=True):\n",
+    "#                 model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
+    "                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
+    "                \n",
+    "    with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
+    "        gr.Markdown(\"Upload target voice...\")\n",
+    "        with gr.Row(equal_height=True):\n",
+    "            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
+    "            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
+    "\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        with gr.Column(scale=2):\n",
+    "            with gr.Row(equal_height=True):\n",
+    "                with gr.Column():\n",
+    "                    text_to_convert = gr.Textbox(verse)\n",
+    "                    orig_voice = gr.Checkbox(label='Use original voice')\n",
+    "                voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
+    "            with gr.Row(equal_height=True):\n",
+    "                button_text = gr.Button('Text to speech', interactive=True)\n",
+    "                button_audio = gr.Button('Convert audio', interactive=True)\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
+    "        \n",
+    "    # actions\n",
+    "    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    \n",
+    "    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+    "        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
+    "             outputs=speech).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "\n",
+    "    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+    "        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    \n",
+    "    gr.HTML(article)\n",
+    "demo.launch(share=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Coqui.ai.ipynb ADDED Viewed

	@@ -0,0 +1,329 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "57fc627d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from TTS.api import TTS\n",
+    "from TTS.utils.manage import ModelManager"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "a5789dee",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7864\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+      " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
+      " > Model's license - apache 2.0\n",
+      " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+      " > Using model: Tacotron2\n",
+      " > Setting up Audio Processor...\n",
+      " | > sample_rate:22050\n",
+      " | > resample:False\n",
+      " | > num_mels:80\n",
+      " | > log_func:np.log10\n",
+      " | > min_level_db:-100\n",
+      " | > frame_shift_ms:None\n",
+      " | > frame_length_ms:None\n",
+      " | > ref_level_db:20\n",
+      " | > fft_size:1024\n",
+      " | > power:1.5\n",
+      " | > preemphasis:0.0\n",
+      " | > griffin_lim_iters:60\n",
+      " | > signal_norm:True\n",
+      " | > symmetric_norm:True\n",
+      " | > mel_fmin:50.0\n",
+      " | > mel_fmax:7600.0\n",
+      " | > pitch_fmin:0.0\n",
+      " | > pitch_fmax:640.0\n",
+      " | > spec_gain:1.0\n",
+      " | > stft_pad_mode:reflect\n",
+      " | > max_norm:4.0\n",
+      " | > clip_norm:True\n",
+      " | > do_trim_silence:True\n",
+      " | > trim_db:60\n",
+      " | > do_sound_norm:False\n",
+      " | > do_amp_to_db_linear:True\n",
+      " | > do_amp_to_db_mel:True\n",
+      " | > do_rms_norm:False\n",
+      " | > db_level:None\n",
+      " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
+      " | > base:10\n",
+      " | > hop_length:256\n",
+      " | > win_length:1024\n",
+      " > Model's reduction rate `r` is set to: 2\n",
+      " > Vocoder Model: univnet\n",
+      " > Setting up Audio Processor...\n",
+      " | > sample_rate:22050\n",
+      " | > resample:False\n",
+      " | > num_mels:80\n",
+      " | > log_func:np.log10\n",
+      " | > min_level_db:-100\n",
+      " | > frame_shift_ms:None\n",
+      " | > frame_length_ms:None\n",
+      " | > ref_level_db:20\n",
+      " | > fft_size:1024\n",
+      " | > power:1.5\n",
+      " | > preemphasis:0.0\n",
+      " | > griffin_lim_iters:60\n",
+      " | > signal_norm:True\n",
+      " | > symmetric_norm:True\n",
+      " | > mel_fmin:50.0\n",
+      " | > mel_fmax:7600.0\n",
+      " | > pitch_fmin:1.0\n",
+      " | > pitch_fmax:640.0\n",
+      " | > spec_gain:1.0\n",
+      " | > stft_pad_mode:reflect\n",
+      " | > max_norm:4.0\n",
+      " | > clip_norm:True\n",
+      " | > do_trim_silence:True\n",
+      " | > trim_db:60\n",
+      " | > do_sound_norm:False\n",
+      " | > do_amp_to_db_linear:True\n",
+      " | > do_amp_to_db_mel:True\n",
+      " | > do_rms_norm:False\n",
+      " | > db_level:None\n",
+      " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
+      " | > base:10\n",
+      " | > hop_length:256\n",
+      " | > win_length:1024\n",
+      " > Generator Model: univnet_generator\n",
+      " > Discriminator Model: univnet_discriminator\n",
+      "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+      "language: \n",
+      "speaker: \n",
+      "Using original voice\n",
+      " > Text splitted to sentences.\n",
+      "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
+      "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
+      " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
+      " > Processing time: 24.694000244140625\n",
+      " > Real-time factor: 2.8425842872081772\n"
+     ]
+    }
+   ],
+   "source": [
+    "title = \"\"\n",
+    "description = \"\"\"\"\"\"\n",
+    "article = \"\"\"\"\"\"\n",
+    "\n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "GPU = device == \"cuda\"\n",
+    "INT16MAX = np.iinfo(np.int16).max\n",
+    "\n",
+    "model_ids = ModelManager(verbose=False).list_models()\n",
+    "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+    "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+    "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+    "examples_pt = 'examples'\n",
+    "allowed_extentions = ['.mp3', '.wav']\n",
+    "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
+    "verse = \"\"\"Mary had a little lamb,\n",
+    "Its fleece was white as snow.\n",
+    "Everywhere the child went,\n",
+    "The little lamb was sure to go.\"\"\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "def on_model_tts_select(model_name, tts_var):\n",
+    "    if tts_var is None or tts_var.model_name != model_name:\n",
+    "        print(f'Loading TTS model from {model_name}')\n",
+    "        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+    "    else:\n",
+    "        print(f'Passing through TTS model {tts_var.model_name}')\n",
+    "    languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
+    "    speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
+    "    language = languages[0]\n",
+    "    speaker = speakers[0]\n",
+    "    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
+    "                gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
+    "\n",
+    "\n",
+    "def on_model_vc_select(model_name, vc_var):\n",
+    "    if vc_var is None or vc_var.model_name != model_name:\n",
+    "        print(f'Loading voice conversion model from {model_name}')\n",
+    "        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+    "    else:\n",
+    "        print(f'Passing through voice conversion model {vc_var.model_name}')\n",
+    "    return vc_var\n",
+    "\n",
+    "\n",
+    "def on_voicedropdown(x):\n",
+    "    return examples[x]\n",
+    "\n",
+    "\n",
+    "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
+    "    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
+    "        return (16000, np.zeros(0).astype(np.int16))\n",
+    "    \n",
+    "    sample_rate = tts_model.synthesizer.output_sample_rate\n",
+    "    if tts_model.is_multi_speaker:\n",
+    "        speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
+    "    print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
+    "    \n",
+    "    language = None if language == '' else language\n",
+    "    speaker = None if speaker == '' else speaker\n",
+    "    if use_original_voice:\n",
+    "        print('Using original voice')\n",
+    "        speech = tts_model.tts(text, language=language, speaker=speaker)       \n",
+    "    elif tts_model.synthesizer.tts_model.speaker_manager:\n",
+    "        print('voice cloning with the tts')\n",
+    "        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
+    "    else:\n",
+    "        print('voice cloning with the voice conversion model')\n",
+    "        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
+    "\n",
+    "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+    "    return (sample_rate, speech)\n",
+    "\n",
+    "\n",
+    "def voice_clone(vc_model, source_wav, target_wav):\n",
+    "    print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
+    "    sample_rate = vc_model.voice_converter.output_sample_rate\n",
+    "    if vc_model is None or source_wav is None or target_wav is None:\n",
+    "        return (sample_rate, np.zeros(0).astype(np.int16))\n",
+    "\n",
+    "    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
+    "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+    "    return (sample_rate, speech)\n",
+    "\n",
+    "\n",
+    "with gr.Blocks() as demo:\n",
+    "    tts_model = gr.State(None)\n",
+    "    vc_model = gr.State(None)\n",
+    "    def activate(*args):\n",
+    "        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
+    "    def deactivate(*args):\n",
+    "        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
+    "\n",
+    "    gr.Markdown(description)\n",
+    "\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        with gr.Column(scale=5, min_width=50):\n",
+    "            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
+    "        with gr.Column(scale=1, min_width=10):\n",
+    "                language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
+    "        with gr.Column(scale=1, min_width=10):\n",
+    "                speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
+    "        with gr.Column(scale=5, min_width=50):\n",
+    "            with gr.Row(equal_height=True):\n",
+    "#                 model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
+    "                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
+    "                \n",
+    "    with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
+    "        gr.Markdown(\"Upload target voice...\")\n",
+    "        with gr.Row(equal_height=True):\n",
+    "            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
+    "            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
+    "\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        with gr.Column(scale=2):\n",
+    "            with gr.Row(equal_height=True):\n",
+    "                with gr.Column():\n",
+    "                    text_to_convert = gr.Textbox(verse)\n",
+    "                    orig_voice = gr.Checkbox(label='Use original voice')\n",
+    "                voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
+    "            with gr.Row(equal_height=True):\n",
+    "                button_text = gr.Button('Text to speech', interactive=True)\n",
+    "                button_audio = gr.Button('Convert audio', interactive=True)\n",
+    "    with gr.Row(equal_height=True):\n",
+    "        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
+    "        \n",
+    "    # actions\n",
+    "    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    \n",
+    "    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+    "        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
+    "             outputs=speech).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "\n",
+    "    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+    "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+    "        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
+    "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+    "    \n",
+    "    gr.HTML(article)\n",
+    "demo.launch(share=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
 title: Coqui.ai
-emoji: 🏆
-colorFrom: blue
-colorTo: blue
 sdk: gradio
 sdk_version: 3.33.1
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Coqui.ai
+app_file: app.py
 sdk: gradio
 sdk_version: 3.33.1
 ---

app.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import gradio as gr
+import numpy as np
+import torch
+import torch.nn.functional as F
+from pathlib import Path
+from TTS.api import TTS
+from TTS.utils.manage import ModelManager
+# Static page text. `description` is rendered with gr.Markdown and `article` with gr.HTML
+# in the UI block of this file; all three strings are currently empty.
+title = ""
+description = """"""
+article = """"""
+# Use CUDA when torch reports it as available, otherwise the CPU; GPU is the boolean form
+# passed as `gpu=` when models are constructed.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+GPU = device == "cuda"
+# Largest int16 value; float waveforms are multiplied by it before the cast to int16.
+INT16MAX = np.iinfo(np.int16).max
+# All model ids reported by the model manager, then split by kind. Only ids containing
+# '/multilingual/' or '/en/' ('/universal/' or '/en/' for vocoders) are kept.
+model_ids = ModelManager(verbose=False).list_models()
+model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
+model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
+model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
+# Example voices: maps file name -> Path for every .mp3/.wav file directly inside `examples_pt`.
+examples_pt = 'examples'
+allowed_extentions = ['.mp3', '.wav']
+examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
+# Initial content of the text-to-speech textbox.
+verse = """Mary had a little lamb,
+Its fleece was white as snow.
+Everywhere the child went,
+The little lamb was sure to go."""
+def on_model_tts_select(model_name, tts_var):
+    if tts_var is None or tts_var.model_name != model_name:
+        print(f'Loading TTS model from {model_name}')
+        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+    else:
+        print(f'Passing through TTS model {tts_var.model_name}')
+    languages = tts_var.languages if tts_var.is_multi_lingual else ['']
+    speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting
+    language = languages[0]
+    speaker = speakers[0]
+    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
+                gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
+def on_model_vc_select(model_name, vc_var):
+    if vc_var is None or vc_var.model_name != model_name:
+        print(f'Loading voice conversion model from {model_name}')
+        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+    else:
+        print(f'Passing through voice conversion model {vc_var.model_name}')
+    return vc_var
+def on_voicedropdown(x):
+    return examples[x]
+def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
+    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
+        return (16000, np.zeros(0).astype(np.int16))
+    sample_rate = tts_model.synthesizer.output_sample_rate
+    if tts_model.is_multi_speaker:
+        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting
+    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
+    language = None if language == '' else language
+    speaker = None if speaker == '' else speaker
+    if use_original_voice:
+        print('Using original voice')
+        speech = tts_model.tts(text, language=language, speaker=speaker)
+    elif tts_model.synthesizer.tts_model.speaker_manager:
+        print('voice cloning with the tts')
+        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
+    else:
+        print('voice cloning with the voice conversion model')
+        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
+    speech = (np.array(speech) * INT16MAX).astype(np.int16)
+    return (sample_rate, speech)
+def voice_clone(vc_model, source_wav, target_wav):
+    print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
+    sample_rate = vc_model.voice_converter.output_sample_rate
+    if vc_model is None or source_wav is None or target_wav is None:
+        return (sample_rate, np.zeros(0).astype(np.int16))
+    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+    speech = (np.array(speech) * INT16MAX).astype(np.int16)
+    return (sample_rate, speech)
+# Build the Gradio UI: model pickers, target-voice inputs, text/audio inputs, two action
+# buttons and one audio output, followed by the event wiring.
+with gr.Blocks() as demo:
+    # State holders for the loaded models; both start as None and are written by
+    # on_model_tts_select / on_model_vc_select through the event chains below.
+    tts_model = gr.State(None)
+    vc_model = gr.State(None)
+    # Helpers that enable / disable components. With exactly one argument they return a
+    # single gr.update; otherwise a list holding one update per argument.
+    def activate(*args):
+        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
+    def deactivate(*args):
+        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
+    gr.Markdown(description)
+    # Row 1: TTS model, language, speaker and voice conversion model dropdowns.
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=5, min_width=50):
+            # NOTE(review): the default is the fourth entry of model_tts_ids, presumably to
+            # preselect a specific model; this raises IndexError if fewer than four ids
+            # pass the filter. TODO confirm the intended default.
+            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
+        # Language and speaker start empty and non-interactive; on_model_tts_select
+        # supplies their choices once a TTS model is loaded.
+        with gr.Column(scale=1, min_width=10):
+                language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
+        with gr.Column(scale=1, min_width=10):
+                speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
+        with gr.Column(scale=5, min_width=50):
+            with gr.Row(equal_height=True):
+#                 model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
+                # Defaults to the first voice conversion id (IndexError if the list is empty).
+                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
+    # Collapsed section for the target voice: upload a file or pick one of the examples.
+    with gr.Accordion("Target voice", open=False) as accordion:
+        gr.Markdown("Upload target voice...")
+        with gr.Row(equal_height=True):
+            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
+            # `examples` is a dict; presumably its keys (file names) become the choices - TODO confirm.
+            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
+    # Inputs: text (pre-filled with `verse`) plus the "original voice" toggle on one side,
+    # the recording to convert on the other, and the two action buttons underneath.
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            with gr.Row(equal_height=True):
+                with gr.Column():
+                    text_to_convert = gr.Textbox(verse)
+                    orig_voice = gr.Checkbox(label='Use original voice')
+                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
+            with gr.Row(equal_height=True):
+                button_text = gr.Button('Text to speech', interactive=True)
+                button_audio = gr.Button('Convert audio', interactive=True)
+    # Output shared by both actions.
+    with gr.Row(equal_height=True):
+        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)
+    # actions
+    # Every chain has the same shape: disable both buttons, run the handler(s), then
+    # re-enable both buttons.
+    # Changing the TTS model loads it and refreshes the language/speaker dropdowns.
+    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+        then(activate, [button_text, button_audio], [button_text, button_audio])
+    # Changing the voice conversion model loads it into vc_model.
+    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+        then(activate, [button_text, button_audio], [button_text, button_audio])
+    # Picking an example copies its file into the target-voice upload component.
+    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
+        then(activate, [button_text, button_audio], [button_text, button_audio])
+    # "Text to speech": on_model_tts_select runs first so a model is loaded even when the
+    # model dropdown was never changed (tts_model starts as None), then the text is synthesized.
+    # NOTE(review): that step also writes the first language/speaker choice back into the
+    # dropdowns, which looks like it resets the user's selection on every click - confirm.
+    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
+             outputs=speech).\
+        then(activate, [button_text, button_audio], [button_text, button_audio])
+    # "Convert audio": make sure the voice conversion model is loaded, then convert the
+    # uploaded recording towards the target voice.
+    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
+        then(activate, [button_text, button_audio], [button_text, button_audio])
+    gr.HTML(article)
+# Start the app without creating a public share link.
+demo.launch(share=False)

examples/arctic_a0023_bdl.wav ADDED Viewed

Binary file (168 kB). View file

examples/arctic_a0023_clb.wav ADDED Viewed

Binary file (189 kB). View file

examples/arctic_a0023_rms.wav ADDED Viewed

Binary file (172 kB). View file

examples/arctic_a0023_slt.wav ADDED Viewed

Binary file (153 kB). View file

examples/arctic_a0366_bdl.wav ADDED Viewed

Binary file (166 kB). View file

examples/arctic_a0366_rms.wav ADDED Viewed

Binary file (184 kB). View file

examples/arctic_a0407_bdl.wav ADDED Viewed

Binary file (183 kB). View file

examples/arctic_a0407_clb.wav ADDED Viewed

Binary file (200 kB). View file

examples/arctic_a0407_rms.wav ADDED Viewed

Binary file (216 kB). View file

examples/arctic_a0407_slt.wav ADDED Viewed

Binary file (171 kB). View file

examples/arctic_b0496_clb.wav ADDED Viewed

Binary file (192 kB). View file

examples/arctic_b0496_slt.wav ADDED Viewed

Binary file (171 kB). View file

examples/henry5.mp3 ADDED Viewed

Binary file (375 kB). View file

examples/hmm_i_dont_know.wav ADDED Viewed

Binary file (203 kB). View file

examples/see_in_eyes.wav ADDED Viewed

Binary file (65.2 kB). View file

examples/yearn_for_time.mp3 ADDED Viewed

Binary file (56.3 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+TTS
+numpy==1.21.6;python_version<"3.10"
+numpy;python_version=="3.10"
+cython==0.29.28
+scipy>=1.4.0
+torch>=1.7
+torchaudio
+soundfile
+librosa==0.10.0.*
+numba==0.55.1;python_version<"3.9"
+numba==0.56.4;python_version>="3.9"
+inflect==5.6.0
+tqdm
+anyascii
+pyyaml
+fsspec>=2021.04.0
+aiohttp
+packaging