"""Gradio front end for Simple RVC inference, TTS, YouTube audio extraction and model search/publish."""

# NOTE: import order is deliberate. `from models.model import *` can rebind any
# name, so everything the original file imported after it is still imported
# after it here (including `os`, which the original imported both before and
# after the wildcard).
import gradio as gr
from constants import VOICE_METHODS, BARK_VOICES, EDGE_VOICES
import platform
# NOTE(review): the UI callbacks used below (infer, tts_infer,
# update_tts_methods_voice, search_model, post_model) are not defined in this
# file; presumably they come from this wildcard import -- confirm.
from models.model import *
from tts.conversion import COQUI_LANGUAGES
import pytube
import os
import traceback
from pydub import AudioSegment
# from audio_enhance.functions import audio_enhance


def convert_yt_to_wav(url):
    """Download the audio track of a YouTube video and convert it to WAV.

    Args:
        url: Link to the YouTube video. An empty or missing value is rejected.

    Returns:
        A ``(message, path)`` tuple. On success it is ``("Success", wav_path)``;
        on any failure it is ``(error_message, None)``. Errors are reported
        through the message instead of being raised because both values are
        bound directly to Gradio output components.
    """
    if not url:
        return "Please enter the video link first", None

    video_output_folder = "yt_videos"  # Temporary download destination
    audio_output_folder = "audios"  # Destination of the converted WAV files
    video_file_path = None

    try:
        print(f"Converting video {url}...")

        # Download the audio-only stream using pytube
        video = pytube.YouTube(url)
        stream = video.streams.filter(only_audio=True).first()
        if stream is None:
            # .first() yields None when the query matched nothing; report it
            # clearly instead of failing later with an AttributeError.
            return "No audio stream is available for this video", None

        print("Downloading video")
        video_file_path = stream.download(output_path=video_output_folder)
        print(video_file_path)

        # The export below does not create missing folders.
        os.makedirs(audio_output_folder, exist_ok=True)

        # Swap only the real extension; a plain str.replace would also rewrite
        # ".mp4" occurring inside the title and would skip other containers.
        base_name, _ = os.path.splitext(os.path.basename(video_file_path))
        audio_file_path = os.path.join(audio_output_folder, base_name + ".wav")

        # Convert mp4 to wav
        print("Converting to wav")
        sound = AudioSegment.from_file(video_file_path, format="mp4")
        sound.export(audio_file_path, format="wav")

        return "Success", audio_file_path
    except ConnectionResetError:
        return "Connection lost, please refresh or try again later.", None
    except Exception as e:
        # Keep the stack trace in the server log; the UI only shows the message.
        traceback.print_exc()
        return str(e), None
    finally:
        # Remove the temporary download on success *and* on failure.
        if video_file_path and os.path.exists(video_file_path):
            try:
                os.remove(video_file_path)
            except OSError:
                # Best-effort cleanup: never let it mask the real result.
                traceback.print_exc()


# NOTE(review): the nesting of the layout below was reconstructed from a
# whitespace-collapsed copy of this file -- confirm it against the intended UI.
with gr.Blocks() as app:
    # NOTE(review): the heading strings passed to gr.HTML look like their markup
    # tags were stripped in the reviewed copy; the visible text is kept as-is.
    gr.HTML("\n\nSimple RVC Inference - by Juuxn 💻\n\n")
    gr.HTML("\n\nThis space uses CPU only, so it's for inference only. It's recommended to duplicate the space to avoid issues with processing queues.\n\n")
    gr.Markdown("Simple RVC GPU Inference on colab: [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/drive/1NKqqTR04HujeBxzwe7jbYEvNi8LbxD_N?usp=sharing)")
    gr.Markdown(
        "[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/juuxn/SimpleRVC?duplicate=true)\n\n"
    )
    gr.Markdown("Collection of models you can use: RVC + Kits ai. **[RVC Community Models](https://docs.google.com/spreadsheets/d/1owfUtQuLW9ReiIwg6U9UkkDmPOTkuNHf0OKQtWu1iaI)**")

    # --- Voice conversion of an uploaded/recorded audio file -----------------
    with gr.Tab("Inference"):
        model_url = gr.Textbox(placeholder="https://huggingface.co/AIVER-SE/BillieEilish/resolve/main/BillieEilish.zip", label="Model URL", show_label=True)
        with gr.Row():
            with gr.Column():
                audio_path = gr.Audio(label="Audio file", show_label=True, type="filepath")
                index_rate = gr.Slider(minimum=0, maximum=1, label="Search feature ratio:", value=0.75, interactive=True)
                filter_radius1 = gr.Slider(minimum=0, maximum=7, label="Filter (reduce breath harshness)", value=3, step=1, interactive=True)
            with gr.Column():
                f0_method = gr.Dropdown(
                    choices=["harvest", "pm", "crepe", "crepe-tiny", "mangio-crepe", "mangio-crepe-tiny", "rmvpe"],
                    value="rmvpe",
                    label="Algorithm",
                    show_label=True,
                )
                vc_transform0 = gr.Slider(minimum=-12, label="Number of semitones, raise an octave: 12, lower an octave: -12", value=0, maximum=12, step=1)
                protect0 = gr.Slider(
                    minimum=0,
                    maximum=0.5,
                    label="Protect voiceless consonants and breath sounds. 0.5 to disable.",
                    value=0.33,
                    step=0.01,
                    interactive=True,
                )
                resample_sr1 = gr.Slider(
                    minimum=0,
                    maximum=48000,
                    label="Resample output audio to the final sampling frequency. \n0 to disable resampling.",
                    value=0,
                    step=1,
                    interactive=True,
                )

        # Output
        with gr.Row():
            vc_output1 = gr.Textbox(label="Output")
            vc_output2 = gr.Audio(label="Output audio")

        btn = gr.Button(value="Convert")
        btn.click(
            infer,
            inputs=[model_url, f0_method, audio_path, index_rate, vc_transform0, protect0, resample_sr1, filter_radius1],
            outputs=[vc_output1, vc_output2],
        )

    # --- Text-to-speech followed by voice conversion -------------------------
    with gr.TabItem("TTS"):
        with gr.Row():
            tts_text = gr.Textbox(
                label="Text:",
                placeholder="Text you want to convert to speech...",
                lines=6,
            )

        with gr.Column():
            with gr.Row():
                tts_model_url = gr.Textbox(placeholder="https://huggingface.co/AIVER-SE/BillieEilish/resolve/main/BillieEilish.zip", label="RVC Model URL", show_label=True)

            with gr.Row():
                tts_method = gr.Dropdown(choices=VOICE_METHODS, value="Edge-tts", label="TTS Method:", visible=True)
                tts_model = gr.Dropdown(choices=EDGE_VOICES, label="TTS Model:", visible=True, interactive=True)
                tts_api_key = gr.Textbox(label="ElevenLabs API key", show_label=True, placeholder="4a4afce72349680c8e8b6fdcfaf2b65a", interactive=True, visible=False)

            tts_coqui_languages = gr.Radio(
                label="Language",
                choices=COQUI_LANGUAGES,
                value="en",
                visible=False,
            )

        tts_btn = gr.Button(value="Convert")

        with gr.Row():
            tts_vc_output1 = gr.Textbox(label="Output")
            tts_vc_output2 = gr.Audio(label="Output audio")

        tts_btn.click(
            fn=tts_infer,
            inputs=[tts_text, tts_model_url, tts_method, tts_model, tts_api_key, tts_coqui_languages],
            outputs=[tts_vc_output1, tts_vc_output2],
        )

        # Created hidden; it is one of the outputs of the tts_method.change
        # handler registered right below.
        tts_msg = gr.Markdown(
            "**I recommend creating an Eleven Labs account and adding your API key. It’s free, and you get a 10k character limit per month.**\n"
            "![Imgur](https://imgur.com/HH6YTu0.png) ",
            visible=False,
        )

        tts_method.change(
            fn=update_tts_methods_voice,
            inputs=[tts_method],
            outputs=[tts_model, tts_msg, tts_api_key, tts_coqui_languages],
        )

    # --- YouTube audio extraction --------------------------------------------
    with gr.TabItem("YouTube"):
        gr.Markdown("## Convert YouTube video to audio")
        with gr.Row():
            yt_url = gr.Textbox(
                label="Video URL:",
                placeholder="https://www.youtube.com/watch?v=3vEiqil5d3Q",
            )
        yt_btn = gr.Button(value="Convert")

        with gr.Row():
            yt_output1 = gr.Textbox(label="Output")
            yt_output2 = gr.Audio(label="Output audio")

        yt_btn.click(fn=convert_yt_to_wav, inputs=[yt_url], outputs=[yt_output1, yt_output2])

    # --- Model search and publishing -----------------------------------------
    with gr.Tab("Models"):
        gr.HTML("\n\nSearch Models\n\n")
        search_name = gr.Textbox(placeholder="Billie Eillish (RVC v2 - 100 epoch)", label="Name", show_label=True)

        # Output
        with gr.Row():
            sarch_output = gr.Markdown(label="Output")

        btn_search_model = gr.Button(value="Search")
        btn_search_model.click(fn=search_model, inputs=[search_name], outputs=[sarch_output])

        gr.HTML("\n\nPublish Your Model\n\n")
        post_name = gr.Textbox(placeholder="Billie Eillish (RVC v2 - 100 epoch)", label="Name", show_label=True)
        post_model_url = gr.Textbox(placeholder="https://huggingface.co/AIVER-SE/BillieEilish/resolve/main/BillieEilish.zip", label="Model URL", show_label=True)
        post_creator = gr.Textbox(placeholder="Discord ID or creator profile link", label="Creator", show_label=True)
        post_version = gr.Dropdown(choices=["RVC v1", "RVC v2"], value="RVC v1", label="Version", show_label=True)

        # Output
        with gr.Row():
            post_output = gr.Markdown(label="Output")

        btn_post_model = gr.Button(value="Publish")
        btn_post_model.click(
            fn=post_model,
            inputs=[post_name, post_model_url, post_version, post_creator],
            outputs=[post_output],
        )

    gr.Markdown(
        """For commercial use of the models and spaces, consider purchasing a license, or negotiate one with the voice creators."""
    )

if __name__ == "__main__":
    app.queue().launch(debug=True)