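"""Gradio demo: one-shot talking-face generation.

Takes a portrait image, a line of text, and a voice gender; synthesises the
speech (gTTS for the female voice, fairseq FastSpeech2 for the male voice),
phoneme-aligns it with pocketsphinx, and drives the one-shot-talking-face
model to animate the portrait.

External tools assumed on PATH: pocketsphinx, jq, and ffmpeg.
"""
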
import json
import os
import subprocess
import tempfile

import cv2
import dlib
import ffmpeg
import gradio as gr
import imageio
import soundfile
import torchaudio
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from gtts import gTTS
from PIL import Image
from pydub import AudioSegment
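

# `pad_image` is used below but was not defined in this file. A minimal
# sketch, assuming the intent is to pad the portrait to a square canvas
# before it is fed to the talking-face model; the original helper may differ.
def pad_image(image):
    w, h = image.size
    if w == h:
        return image
    side = max(w, h)
    # Centre the portrait on a black square canvas.
    padded = Image.new("RGB", (side, side), (0, 0, 0))
    padded.paste(image, ((side - w) // 2, (side - h) // 2))
    return padded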



block = gr.Blocks()
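

# `one_shot_talking` is called by both voice branches below but was not
# defined in this file; this body is reconstructed from the inline pipeline
# that previously sat in the female branch, so the actual helper may differ.
# It pads the portrait, phoneme-aligns the audio with pocketsphinx, reshapes
# the alignment with jq into the JSON layout test_script.py expects, then
# runs the one-shot-talking-face model.
def one_shot_talking(image_path, audio_path):
    try:
        image = pad_image(Image.open(image_path))
        image.save("/content/image_pre.png")
        # Word/phone alignment of the synthesised audio.
        pocketsphinx_run = subprocess.run(
            ['pocketsphinx', '-phone_align', 'yes', 'single', audio_path],
            check=True, capture_output=True)
        # Reshape into [{word, phones: [{ph, bg, ed}]}] with times in
        # hundredths of a second.
        jq_run = subprocess.run(
            ['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'],
            input=pocketsphinx_run.stdout, capture_output=True)
        phoneme_json = jq_run.stdout.decode('utf-8').strip()
        with open("/content/test.json", "w") as f:
            f.write(phoneme_json)
        # Animate the portrait.
        os.system(
            "cd /content/one-shot-talking-face && python3 -B test_script.py"
            " --img_path /content/image_pre.png"
            f" --audio_path {audio_path}"
            " --phoneme_path /content/test.json"
            " --save_dir /content/train")
        # The UI output is currently a Textbox, so surface the phoneme JSON
        # (as the original debugging code did).
        return json.loads(phoneme_json)
    except Exception as e:
        print(e)
        # Return None so the Textbox stays empty on failure.
        return None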


    
def one_shot(image, input_text, gender):
    if gender.lower() == 'female':
        # Female voice via gTTS: synthesise to mp3 through a temp file, then
        # re-encode as 16-bit PCM wav, which pocketsphinx expects.
        tts = gTTS(input_text)
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            tts.write_to_fp(f)
            f.seek(0)
            sound = AudioSegment.from_file(f.name, format="mp3")
            sound.export("/content/audio.wav", format="wav")
        waveform, sample_rate = torchaudio.load("/content/audio.wav")
        torchaudio.save("/content/audio.wav", waveform, sample_rate,
                        encoding="PCM_S", bits_per_sample=16)
        return one_shot_talking(image, "/content/audio.wav")
       
    elif gender.lower() == 'male':
        # Male voice via fairseq FastSpeech2 with a HiFi-GAN vocoder.
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "Voicemod/fastspeech2-en-male1",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        model = models[0].cuda()
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        generator = task.build_generator([model], cfg)

        sample = TTSHubInterface.get_model_input(task, input_text)
        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
        sample["speaker"] = sample["speaker"].cuda()

        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
        # Slow the synthesised speech slightly; -y overwrites a wav left over
        # from an earlier run.
        os.system('ffmpeg -y -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav')
        return one_shot_talking(image, "/content/audio.wav")


# Stub handler; not wired to the UI below.
def generate_ocr(method, image, gender):
    return "Hello"
    
def run():
    with block:
        with gr.Group():
            with gr.Box():
                with gr.Row().style(equal_height=True):
                    image_in = gr.Image(show_label=False, type="filepath")
                    # audio_in = gr.Audio(show_label=False, type='filepath')
                    input_text = gr.Textbox(lines=3, value="Hello, how are you?", label="Input Text")
                    gender = gr.Radio(["Female", "Male"], value="Female", label="Gender")
                    video_out = gr.Textbox(label="output")
                    # video_out = gr.Video(show_label=False)
                with gr.Row().style(equal_height=True):
                    btn = gr.Button("Generate")

        btn.click(one_shot, inputs=[image_in, input_text, gender], outputs=[video_out])
        # block.queue()
        block.launch(server_name="0.0.0.0", server_port=7860)

if __name__ == "__main__":
    run()