pragnakalp committed
Commit 293dca7 · 1 Parent(s): 68776fc

Update app.py

Files changed (1):
  app.py +94 -35
app.py CHANGED
@@ -1,57 +1,116 @@
 import gradio as gr
 import os, subprocess, torchaudio
-import torch
 from PIL import Image
 
 block = gr.Blocks()
 
-def pad_image(image):
-    w, h = image.size
-    if w == h:
-        return image
-    elif w > h:
-        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
-        new_image.paste(image, (0, (w - h) // 2))
-        return new_image
-    else:
-        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
-        new_image.paste(image, ((h - w) // 2, 0))
-        return new_image
-
-def calculate(image_in, audio_in):
-    waveform, sample_rate = torchaudio.load(audio_in)
-    waveform = torch.mean(waveform, dim=0, keepdim=True)
-    torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
-    image = Image.open(image_in)
-    image = pad_image(image)
-    image.save("image.png")
-
-    pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
-    jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
-    with open("test.json", "w") as f:
-        f.write(jq_run.stdout.decode('utf-8').strip())
-    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
-    return "/content/train/image_audio.mp4"
 
 def run():
     with block:
-
         with gr.Group():
             with gr.Box():
                 with gr.Row().style(equal_height=True):
                     image_in = gr.Image(show_label=False, type="filepath")
-                    audio_in = gr.Audio(show_label=False, type='filepath')
-                    video_out = gr.Video(show_label=False)
                 with gr.Row().style(equal_height=True):
                     btn = gr.Button("Generate")
 
-
-        btn.click(calculate, inputs=[image_in, audio_in], outputs=[video_out])
-    block.queue()
     block.launch(server_name="0.0.0.0", server_port=7860)
 
 if __name__ == "__main__":
     run()
 import gradio as gr
+# import os, subprocess, torchaudio
+# import torch
+from PIL import Image
+from gtts import gTTS
+import tempfile
+from pydub import AudioSegment
+from pydub.generators import Sine
+# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+import soundfile
+
+import dlib
+import cv2
+import imageio
+import os
+import gradio as gr
 import os, subprocess, torchaudio
 from PIL import Image
+import ffmpeg
 
 block = gr.Blocks()
 
+def one_shot(image, input_text, gender):
+    if gender == 'Female' or gender == 'female':
+        tts = gTTS(input_text)
+        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
+            tts.write_to_fp(f)
+            f.seek(0)
+            sound = AudioSegment.from_file(f.name, format="mp3")
+            sound.export("/content/audio.wav", format="wav")
+        waveform, sample_rate = torchaudio.load("/content/audio.wav")
+        torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
+        image = Image.open(image_in)
+        image = pad_image(image)
+        image.save("/content/image_pre.png")
+        pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
+        jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
+        with open("test.json", "w") as f:
+            f.write(jq_run.stdout.decode('utf-8').strip())
+        import json
+
+        with open('test.json') as user_file:
+            file_contents = user_file.read()
+
+        parsed_json = json.loads(file_contents)
+        return parsed_json
+        exit()
+        os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image_pre.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
+
+    elif gender == 'Male' or gender == 'male':
+        print(gender)
+        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
+            "Voicemod/fastspeech2-en-male1",
+            arg_overrides={"vocoder": "hifigan", "fp16": False}
+        )
+
+        model = models[0].cuda()
+        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
+        generator = task.build_generator([model], cfg)
+        # next(model.parameters()).device
+
+        sample = TTSHubInterface.get_model_input(task, input_text)
+        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
+        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
+        sample["speaker"] = sample["speaker"].cuda()
+
+        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
+        # soundfile.write("/content/audio_before.wav", wav, rate)
+        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
+        cmd = 'ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
+        os.system(cmd)
+        one_shot_talking(image, 'audio.wav')
+
+def generate_ocr(method, image, gender):
+    return "Hello"
 
 def run():
     with block:
+
         with gr.Group():
             with gr.Box():
                 with gr.Row().style(equal_height=True):
                     image_in = gr.Image(show_label=False, type="filepath")
+                    # audio_in = gr.Audio(show_label=False, type='filepath')
+                    input_text = gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
+                    gender = gr.Radio(["Female", "Male"], value="Female", label="Gender")
+                    video_out = gr.Textbox(label="output")
+                    # video_out = gr.Video(show_label=False)
                 with gr.Row().style(equal_height=True):
                     btn = gr.Button("Generate")
 
+        btn.click(one_shot, inputs=[image_in, input_text, gender], outputs=[video_out])
+        # block.queue()
     block.launch(server_name="0.0.0.0", server_port=7860)
 
 if __name__ == "__main__":
     run()
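
For reference, the female-voice path in one_shot above reduces to a gTTS-plus-pydub conversion: synthesize the text to MP3, then re-export it as the 16-bit PCM WAV that torchaudio and pocketsphinx expect. A minimal, self-contained sketch of that step, assuming gTTS and pydub are installed and ffmpeg is on PATH (the text_to_wav name and output path are illustrative, not part of this commit):

    import tempfile

    from gtts import gTTS
    from pydub import AudioSegment

    def text_to_wav(text, out_path="audio.wav"):
        # gTTS only emits MP3, so write the synthesized speech to a temp file first.
        tts = gTTS(text)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
            tts.write_to_fp(f)
            mp3_path = f.name
        # Decode the MP3 and re-export it as 16-bit PCM WAV.
        sound = AudioSegment.from_file(mp3_path, format="mp3")
        sound = sound.set_sample_width(2)  # 2 bytes per sample = 16-bit PCM
        sound.export(out_path, format="wav")
        return out_path

In the commit itself, the exported WAV is additionally round-tripped through torchaudio.save(..., encoding="PCM_S", bits_per_sample=16) before the pocketsphinx phoneme-alignment step.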