Spaces · Runtime error
Commit 25fce83 (parent: 5af09d3) · Update app_multi.py

app_multi.py (CHANGED, +380 −26)
@@ -1,21 +1,34 @@
 from typing import Union
 
 from argparse import ArgumentParser
+from pathlib import Path
+import subprocess
+import librosa
+import os
+import time
+import random
+
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+from moviepy.editor import *
+from moviepy.video.io.VideoFileClip import VideoFileClip
 
 import asyncio
 import json
 import hashlib
 from os import path, getenv
+from pydub import AudioSegment
 
 import gradio as gr
 
 import torch
 
-import numpy as np
-import librosa
-
 import edge_tts
 
+from datetime import datetime
+from scipy.io.wavfile import write
+
 import config
 import util
 from infer_pack.models import (

@@ -27,6 +40,8 @@ from vc_infer_pipeline import VC
 # Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21  # noqa
 in_hf_space = getenv('SYSTEM') == 'spaces'
 
+high_quality = True
+
 # Argument parsing
 arg_parser = ArgumentParser()
 arg_parser.add_argument(
@@ -127,7 +142,303 @@ print(f'Models loaded: {len(loaded_models)}')
 # Edge TTS speakers
 tts_speakers_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())  # noqa
 
+# Make MV
+def make_bars_image(height_values, index, new_height):
+
+    # Define the size of the image
+    width = 512
+    height = new_height
+
+    # Create a new image with a transparent background
+    image = Image.new('RGBA', (width, height), color=(0, 0, 0, 0))
+
+    # Get the image drawing context
+    draw = ImageDraw.Draw(image)
+
+    # Define the rectangle width and spacing
+    rect_width = 2
+    spacing = 2
+
+    # Define the list of height values for the rectangles
+    #height_values = [20, 40, 60, 80, 100, 80, 60, 40]
+    num_bars = len(height_values)
+    # Calculate the total width of the rectangles and the spacing
+    total_width = num_bars * rect_width + (num_bars - 1) * spacing
+
+    # Calculate the starting position for the first rectangle
+    start_x = int((width - total_width) / 2)
+    # Define the buffer size
+    buffer_size = 80
+    # Draw the rectangles from left to right
+    x = start_x
+    for i, height in enumerate(height_values):
+
+        # Define the rectangle coordinates
+        y0 = buffer_size
+        y1 = height + buffer_size
+        x0 = x
+        x1 = x + rect_width
+
+        # Draw the rectangle
+        draw.rectangle([x0, y0, x1, y1], fill='white')
+
+        # Move to the next rectangle position
+        if i < num_bars - 1:
+            x += rect_width + spacing
+
+    # Rotate the image by 180 degrees
+    image = image.rotate(180)
+
+    # Mirror the image
+    image = image.transpose(Image.FLIP_LEFT_RIGHT)
+
+    # Save the image
+    image.save('audio_bars_' + str(index) + '.png')
+
+    return 'audio_bars_' + str(index) + '.png'
+
+def db_to_height(db_value):
+    # Scale the dB value to a range between 0 and 1
+    scaled_value = (db_value + 80) / 80
+
+    # Convert the scaled value to a height between 0 and 100
+    height = scaled_value * 50
+
+    return height
+
+def infer(title, audio_in, image_in):
+    # Load the audio file
+    audio_path = audio_in
+    audio_data, sr = librosa.load(audio_path)
+
+    # Get the duration in seconds
+    duration = librosa.get_duration(y=audio_data, sr=sr)
+
+    # Extract the audio data for the desired time
+    start_time = 0  # start time in seconds
+    end_time = duration  # end time in seconds
+
+    start_index = int(start_time * sr)
+    end_index = int(end_time * sr)
+
+    audio_data = audio_data[start_index:end_index]
+
+    # Compute the short-time Fourier transform
+    hop_length = 512
+
+    stft = librosa.stft(audio_data, hop_length=hop_length)
+    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
+
+    # Get the frequency values
+    freqs = librosa.fft_frequencies(sr=sr, n_fft=stft.shape[0])
+
+    # Select the indices of the frequency values that correspond to the desired frequencies
+    n_freqs = 114
+    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)
+
+    # Extract the dB values for the desired frequencies
+    db_values = []
+    for i in range(spectrogram.shape[1]):
+        db_values.append(list(zip(freqs[freq_indices], spectrogram[freq_indices, i])))
+
+    # Print the dB values for the first time frame
+    print(db_values[0])
+
+    proportional_values = []
+
+    for frame in db_values:
+        proportional_frame = [db_to_height(db) for f, db in frame]
+        proportional_values.append(proportional_frame)
+
+    print(proportional_values[0])
+    print("AUDIO CHUNK: " + str(len(proportional_values)))
+
+    # Open the background image
+    background_image = Image.open(image_in)
+
+    # Resize the image while keeping its aspect ratio
+    bg_width, bg_height = background_image.size
+    aspect_ratio = bg_width / bg_height
+    new_width = 512
+    new_height = int(new_width / aspect_ratio)
+    resized_bg = background_image.resize((new_width, new_height))
+
+    # Apply black cache for better visibility of the white text
+    bg_cache = Image.open('black_cache.png')
+    resized_bg.paste(bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache)
+
+    # Create a new ImageDraw object
+    draw = ImageDraw.Draw(resized_bg)
+
+    # Define the text to be added
+    text = title
+    font = ImageFont.truetype("NotoSansSC-Regular.otf", 16)
+    text_color = (255, 255, 255)  # white color
+
+    # Calculate the position of the text
+    text_width, text_height = draw.textsize(text, font=font)
+    x = 30
+    y = new_height - 70
+
+    # Draw the text on the image
+    draw.text((x, y), text, fill=text_color, font=font)
+
+    # Save the resized image
+    resized_bg.save('resized_background.jpg')
+
+    generated_frames = []
+    for i, frame in enumerate(proportional_values):
+        bars_img = make_bars_image(frame, i, new_height)
+        bars_img = Image.open(bars_img)
+        # Paste the audio bars image on top of the background image
+        fresh_bg = Image.open('resized_background.jpg')
+        fresh_bg.paste(bars_img, (0, 0), mask=bars_img)
+        # Save the image
+        fresh_bg.save('audio_bars_with_bg' + str(i) + '.jpg')
+        generated_frames.append('audio_bars_with_bg' + str(i) + '.jpg')
+    print(generated_frames)
+
+    # Create a video clip from the images
+    clip = ImageSequenceClip(generated_frames, fps=len(generated_frames) / (end_time - start_time))
+    audio_clip = AudioFileClip(audio_in)
+    clip = clip.set_audio(audio_clip)
+    # Set the output codec
+    codec = 'libx264'
+    audio_codec = 'aac'
+    # Save the video to a file
+    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)
+
+    retimed_clip = VideoFileClip("my_video.mp4")
+
+    # Set the desired frame rate
+    new_fps = 25
+
+    # Create a new clip with the new frame rate
+    new_clip = retimed_clip.set_fps(new_fps)
+
+    # Save the new clip as a new video file
+    new_clip.write_videofile("my_video_retimed.mp4", codec=codec, audio_codec=audio_codec)
+
+    return "my_video_retimed.mp4"
+
+# mix vocal and non-vocal
+def mix(audio1, audio2):
+    sound1 = AudioSegment.from_file(audio1)
+    sound2 = AudioSegment.from_file(audio2)
+    length = len(sound1)
+    mixed = sound1[:length].overlay(sound2)
+
+    mixed.export("song.wav", format="wav")
+
+    return "song.wav"
+
+# Bilibili
+def youtube_downloader(
+    video_identifier,
+    start_time,
+    end_time,
+    output_filename="track.wav",
+    num_attempts=5,
+    url_base="",
+    quiet=False,
+    force=True,
+):
+    output_path = Path(output_filename)
+    if output_path.exists():
+        if not force:
+            return output_path
+        else:
+            output_path.unlink()
+
+    quiet = "--quiet --no-warnings" if quiet else ""
+    command = f"""
+        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+    """.strip()
+
+    attempts = 0
+    while True:
+        try:
+            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            attempts += 1
+            if attempts == num_attempts:
+                return None
+        else:
+            break
+
+    if output_path.exists():
+        return output_path
+    else:
+        return None
+
+def audio_separated(audio_input, progress=gr.Progress()):
+    # start progress
+    progress(progress=0, desc="Starting...")
+    time.sleep(0.1)
+
+    # check file input
+    if audio_input is None:
+        # show progress
+        for i in progress.tqdm(range(100), desc="Please wait..."):
+            time.sleep(0.01)
+
+        return (None, None, 'Please input audio.')
+
+    # create filename
+    filename = str(random.randint(10000, 99999)) + datetime.now().strftime("%d%m%Y%H%M%S")
+
+    # progress
+    progress(progress=0.10, desc="Please wait...")
+
+    # make dir output
+    os.makedirs("output", exist_ok=True)
+
+    # progress
+    progress(progress=0.20, desc="Please wait...")
+
+    # write
+    if high_quality:
+        write(filename + ".wav", audio_input[0], audio_input[1])
+    else:
+        write(filename + ".mp3", audio_input[0], audio_input[1])
+
+    # progress
+    progress(progress=0.50, desc="Please wait...")
+
+    # demucs process
+    if high_quality:
+        command_demucs = "python3 -m demucs --two-stems=vocals -d cpu " + filename + ".wav -o output"
+    else:
+        command_demucs = "python3 -m demucs --two-stems=vocals --mp3 --mp3-bitrate 128 -d cpu " + filename + ".mp3 -o output"
+
+    os.system(command_demucs)
+
+    # progress
+    progress(progress=0.70, desc="Please wait...")
+
+    # remove file audio
+    if high_quality:
+        command_delete = "rm -v ./" + filename + ".wav"
+    else:
+        command_delete = "rm -v ./" + filename + ".mp3"
+
+    os.system(command_delete)
+
+    # progress
+    progress(progress=0.80, desc="Please wait...")
+
+    # progress
+    for i in progress.tqdm(range(80, 100), desc="Please wait..."):
+        time.sleep(0.1)
+
+    if high_quality:
+        return "./output/htdemucs/" + filename + "/vocals.wav", "./output/htdemucs/" + filename + "/no_vocals.wav", "Successfully..."
+    else:
+        return "./output/htdemucs/" + filename + "/vocals.mp3", "./output/htdemucs/" + filename + "/no_vocals.mp3", "Successfully..."
+
+
 # https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118  # noqa
 def vc_func(
     input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
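Taken together, the helpers added in this hunk form a pipeline: youtube_downloader fetches a clip as WAV via yt-dlp, audio_separated splits it into vocal and instrumental stems with demucs, mix overlays a vocal onto the instrumental, and infer renders the bar-visualizer video. Below is a minimal sketch of that chain outside the Gradio UI; it assumes yt-dlp, demucs and ffmpeg are installed, that it runs from the Space's root (so black_cache.png and NotoSansSC-Regular.otf resolve), and that the BV id and image path are placeholders. Note that audio_separated's gr.Progress tracker may only report correctly when called from a Gradio event.

from scipy.io import wavfile

# Placeholder Bilibili id; url_base turns it into a full URL for yt-dlp.
clip = youtube_downloader("BV1xx411c7mD", 0, 15,
                          url_base="https://www.bilibili.com/video/")

# audio_separated() expects the (sample_rate, ndarray) tuple that gr.Audio
# produces, so read the downloaded WAV back in that shape.
sr, data = wavfile.read(str(clip))
vocals, no_vocals, msg = audio_separated((sr, data))

# The raw vocal stem stands in here for the RVC-converted vocal.
song = mix(vocals, no_vocals)                   # writes "song.wav"
video = infer("Demo title", song, "cover.jpg")  # writes "my_video_retimed.mp4"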
@@ -146,8 +457,8 @@ def vc_func(
 
     # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
     # Can be change well, we will see
-    if (audio_npy.shape[0] / audio_samp) > …
-        return (None, 'Input audio is longer than …
+    if (audio_npy.shape[0] / audio_samp) > 600 and in_hf_space:
+        return (None, 'Input audio is longer than 600 secs.')
 
     # Bloody hell: https://stackoverflow.com/questions/26921836/
     if audio_npy.dtype != np.float32:  # :thonk:
@@ -293,22 +604,42 @@ async def _example_edge_tts(
 
 
 with app:
-    gr.…
-    …
-    …
-    …
-    )
-
+    gr.HTML("<center>"
+            "<h1>🥳🎶🎡 - AI Singer: RVC Singing Voice Conversion + AI Voice Changer</h1>"
+            "</center>")
+    gr.Markdown("### <center>🦄 - Automatically extracts the audio from a video and removes the background music; Powered by [RVC-Project](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)</center>")
+    gr.Markdown("### <center>For more exciting apps, follow [滔滔AI](http://www.talktalkai.com); 滔滔AI, made with love!💕</center>")
+
+    with gr.Tab("🤗 - Extract audio from a Bilibili video"):
+        with gr.Row():
+            with gr.Column():
+                ydl_url_input = gr.Textbox(label="Bilibili video URL (you can also enter just the BV id)", value="https://www.bilibili.com/video/BV...")
+                start = gr.Number(value=0, label="Start time (seconds)")
+                end = gr.Number(value=15, label="End time (seconds)")
+                ydl_url_submit = gr.Button("Extract the audio file", variant="primary")
+                as_audio_submit = gr.Button("Remove the background music", variant="primary")
+            with gr.Column():
+                ydl_audio_output = gr.Audio(label="Audio from Bilibili")
+                as_audio_input = ydl_audio_output
+                as_audio_vocals = gr.Audio(label="Vocals of the song")
+                as_audio_no_vocals = gr.Audio(label="Music only", type="filepath", visible=False)
+                as_audio_message = gr.Textbox(label="Message", visible=False)
+
+    ydl_url_submit.click(fn=youtube_downloader, inputs=[ydl_url_input, start, end], outputs=[ydl_audio_output])
+    as_audio_submit.click(fn=audio_separated, inputs=[as_audio_input], outputs=[as_audio_vocals, as_audio_no_vocals, as_audio_message], show_progress=True, queue=True)
+
     with gr.Row():
         with gr.Column():
-            with gr.Tab('…
-                input_audio = …
-
-
-
-
-
-
+            with gr.Tab('🎶 - Singing voice conversion'):
+                input_audio = as_audio_vocals
+                vc_convert_btn = gr.Button('Convert the singing voice!', variant='primary')
+                full_song = gr.Button("Add the instrumental back!", variant="primary")
+                new_song = gr.Audio(label="AI singer + instrumental", type="filepath")
+
+            with gr.Tab('🎙️ - Text to speech'):
+                tts_input = gr.Textbox(
+                    label='Enter the text you want to convert (Chinese or English both work)',
+                    lines=3
                 )
                 tts_speaker = gr.Dropdown(
                     [
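To make the wiring above concrete: ydl_url_submit feeds the textbox URL and the two time fields straight into youtube_downloader, which (with quiet=False and the default output name) shells out to roughly the following single command, where the URL is the textbox placeholder:

yt-dlp -x --audio-format wav -f bestaudio -o "track.wav" --download-sections "*0-15" "https://www.bilibili.com/video/BV..."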
@@ -318,12 +649,24 @@ with app:
                        )
                        for s in tts_speakers_list
                    ],
-                    label='…
+                    label='Pick a speaker of the matching language',
                    type='index'
                )
 
-            tts_convert_btn = gr.Button('…
-
+                tts_convert_btn = gr.Button('Convert with the AI voice', variant='primary')
+
+            with gr.Tab("📺 - Music video"):
+                with gr.Row():
+                    with gr.Column():
+                        inp1 = gr.Textbox(label="Add a caption for your video (optional; in English)")
+                        inp2 = new_song
+                        inp3 = gr.Image(source='upload', type='filepath', label="Upload a background image")
+                        btn = gr.Button("Generate your own music video", variant="primary")
+
+                    with gr.Column():
+                        out1 = gr.Video(label='Your own music video')
+                btn.click(fn=infer, inputs=[inp1, inp2, inp3], outputs=[out1])
+
             pitch_adjust = gr.Slider(
                 label='Pitch',
                 minimum=-24,
@@ -338,7 +681,7 @@ with app:
                 interactive=True
             )
 
-            with gr.Accordion('…
+            with gr.Accordion('More settings', open=False):
                 feat_ratio = gr.Slider(
                     label='Feature ratio',
                     minimum=0,
@@ -382,19 +725,19 @@ with app:
                )
                for m in loaded_models
            ],
-            label='…
+            label='Pick your AI singer (required)',
            type='index'
        )
 
        # Model info
        with gr.Box():
            model_info = gr.Markdown(
-                '### …
+                '### AI singer info\n'
                'Please select a model from dropdown above.',
                elem_id='model_info'
            )
 
-        output_audio = gr.Audio(label='…
+        output_audio = gr.Audio(label='AI singer (a cappella)', type="filepath")
        output_msg = gr.Textbox(label='Output message')
 
        multi_examples = multi_cfg.get('examples')
@@ -454,6 +797,8 @@ with app:
        api_name='tts_conversion'
    )
 
+    full_song.click(fn=mix, inputs=[output_audio, as_audio_no_vocals], outputs=[new_song])
+
    model_index.change(
        update_model_info,
        inputs=[model_index],
@@ -461,6 +806,15 @@ with app:
        show_progress=False,
        queue=False
    )
+
+    gr.Markdown("### <center>Note❗: Please do not generate content that could harm any individual or organization; this program is intended for research, study, and personal entertainment only.</center>")
+    gr.Markdown("### <center>🧸 - How to use: fill in the video URL and the start/end times, then click the four buttons in order: “Extract the audio file”, “Remove the background music”, “Convert the singing voice!”, and “Add the instrumental back!”.</center>")
+    gr.HTML('''
+    <div class="footer">
+        <p>🌊🏞️🎶 - "The river rushes ever east, its surging voice without end." (Gu Lin, Ming dynasty)
+        </p>
+    </div>
+    ''')
 
 app.queue(
     concurrency_count=1,
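As a quick sanity check on the visualizer math added in this commit: db_to_height maps the spectrogram's [-80, 0] dB range linearly onto bar heights of [0, 50] pixels via (db + 80) / 80 * 50, and make_bars_image then draws those bars above an 80-pixel buffer on a 512-pixel-wide transparent canvas. A small standalone sketch with made-up frame values:

# Hypothetical dB readings for one STFT frame.
frame_db = [-80, -60, -40, -20, 0]
heights = [db_to_height(db) for db in frame_db]
print(heights)  # [0.0, 12.5, 25.0, 37.5, 50.0]

# Render a single 512x300 transparent PNG of centered 2px-wide bars.
print(make_bars_image(heights, index=0, new_height=300))  # audio_bars_0.png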