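# Gradio demo: generate a talking-head video from a single portrait image and a line
# of text. The text is synthesized to speech (gTTS for the female voice, a fairseq
# FastSpeech2 model for the male voice), phoneme-aligned with pocketsphinx, and fed
# to the one-shot-talking-face inference script.
# NOTE: paths assume the original Space/Colab layout, with the repo checked out at
# /content/one-shot-talking-face and scratch files written under /content.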
import os
import subprocess
import tempfile

import gradio as gr
import torch
import torchaudio
import soundfile
from PIL import Image
from gtts import gTTS
from pydub import AudioSegment
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# Not referenced directly in this file; kept from the original imports.
import dlib
import cv2
import imageio
import ffmpeg



block = gr.Blocks()
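
# NOTE: pad_image() is called in calculate() but is not defined in this file (it
# presumably lives elsewhere in the original Space). A minimal sketch, assuming it
# pads the portrait to a square canvas with black borders:
def pad_image(image):
    w, h = image.size
    if w == h:
        return image
    elif w > h:
        padded = Image.new(image.mode, (w, w), (0, 0, 0))
        padded.paste(image, (0, (w - h) // 2))
        return padded
    else:
        padded = Image.new(image.mode, (h, h), (0, 0, 0))
        padded.paste(image, ((h - w) // 2, 0))
        return padded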

def calculate(image_in, audio_in):
    print("in calculate")
    # Downmix the audio to mono and save it as 16-bit PCM wav for pocketsphinx.
    waveform, sample_rate = torchaudio.load(audio_in)
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
    # Pad the portrait to a square and save it where test_script.py expects it.
    image = Image.open(image_in)
    image = pad_image(image)
    image.save("/content/image.png")

    # Phoneme-align the audio with pocketsphinx, then reshape its JSON output with jq
    # into the word/phoneme timing format that one-shot-talking-face expects.
    pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
    jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
    with open("test.json", "w") as f:
        f.write(jq_run.stdout.decode('utf-8').strip())
    # Run the one-shot-talking-face inference script to render the talking-head video.
    os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
    return "/content/train/image_audio.mp4"
    
    
def one_shot(image, input_text, gender):
    # Female voice: synthesize speech with gTTS and convert the mp3 to wav.
    if gender.lower() == 'female':
        tts = gTTS(input_text)
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            tts.write_to_fp(f)
            f.seek(0)
            sound = AudioSegment.from_file(f.name, format="mp3")
            sound.export("/content/audio.wav", format="wav")
        return calculate(image, "/content/audio.wav")
    # Male voice: synthesize speech with a fairseq FastSpeech2 model and HiFi-GAN
    # vocoder, slow it down with ffmpeg, then lip-sync.
    elif gender.lower() == 'male':
        print(gender)
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "Voicemod/fastspeech2-en-male1",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        model = models[0].cuda()
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        generator = task.build_generator([model], cfg)

        sample = TTSHubInterface.get_model_input(task, input_text)
        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
        sample["speaker"] = sample["speaker"].cuda()

        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
        # Slow the synthesized speech to 0.7x tempo before lip-syncing.
        cmd = 'ffmpeg -y -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
        os.system(cmd)
        return calculate(image, "/content/audio.wav")
 


# Placeholder OCR handler; not wired into the UI below.
def generate_ocr(method, image, gender):
    return "Hello"

# Build the Gradio UI: portrait image, input text, and voice gender in; generated video out.
def run():
  with block:
  
    with gr.Group():
      with gr.Box():
        with gr.Row().style(equal_height=True):
          image_in = gr.Image(show_label=False, type="filepath")
          # audio_in = gr.Audio(show_label=False, type='filepath')
          input_text=gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
          gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
          video_out = gr.Video(label="output")
          # video_out = gr.Video(show_label=False)
        with gr.Row().style(equal_height=True):
          btn = gr.Button("Generate")          

    btn.click(one_shot, inputs=[image_in, input_text,gender], outputs=[video_out])
    # block.queue()
    block.launch(server_name="0.0.0.0", server_port=7860)

if __name__ == "__main__":
    run()