pragnakalp committed on
Commit
9462b41
·
1 Parent(s): 5175f9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -31
app.py CHANGED
@@ -24,6 +24,7 @@ import ffmpeg
24
  block = gr.Blocks()
25
 
26
  def calculate(image_in, audio_in):
 
27
  waveform, sample_rate = torchaudio.load(audio_in)
28
  waveform = torch.mean(waveform, dim=0, keepdim=True)
29
  torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
@@ -40,38 +41,37 @@ def calculate(image_in, audio_in):
40
  return "/content/train/image_audio.mp4"
41
 
42
 
43
def one_shot(image, input_text, gender):
    """Synthesize speech for *input_text* in a gender-specific voice, then
    drive `calculate` to render a talking-head video for *image*.

    Parameters
    ----------
    image : str
        Path to the input face image (forwarded to `calculate`).
    input_text : str
        Text to synthesize.
    gender : str
        Voice selector; "female"/"Female" uses gTTS, "male"/"Male" uses a
        fairseq FastSpeech2 model. Comparison is case-insensitive (a
        backward-compatible generalization of the original two-spelling check).

    Returns
    -------
    str or None
        Path to the generated video, or None for an unrecognized gender.
    """
    if gender.lower() == 'female':
        # Female voice: Google TTS -> temporary mp3 -> wav via pydub.
        tts = gTTS(input_text)
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            tts.write_to_fp(f)
            f.seek(0)
            sound = AudioSegment.from_file(f.name, format="mp3")
            sound.export("/content/audio.wav", format="wav")
        return calculate(image, "/content/audio.wav")
    elif gender.lower() == 'male':
        print(gender)
        # Male voice: FastSpeech2 + HiFi-GAN vocoder from the HF hub, on GPU.
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "Voicemod/fastspeech2-en-male1",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        model = models[0].cuda()
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        generator = task.build_generator([model], cfg)

        sample = TTSHubInterface.get_model_input(task, input_text)
        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
        sample["speaker"] = sample["speaker"].cuda()

        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
        # Slow speech to 70% tempo. `-y` overwrites a stale output file —
        # without it a second run fails/hangs on ffmpeg's overwrite prompt.
        cmd = 'ffmpeg -y -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
        os.system(cmd)
        # FIX: original passed relative 'audio.wav' (ffmpeg wrote to
        # /content/audio.wav) and dropped the return value, so the male
        # branch returned None. Use the real path and return the video.
        return calculate(image, '/content/audio.wav')
75
 
76
 
77
 
 
24
  block = gr.Blocks()
25
 
26
  def calculate(image_in, audio_in):
27
+ print("in calculate")
28
  waveform, sample_rate = torchaudio.load(audio_in)
29
  waveform = torch.mean(waveform, dim=0, keepdim=True)
30
  torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
 
41
  return "/content/train/image_audio.mp4"
42
 
43
 
44
def one_shot(image, input_text, gender):
    """Synthesize speech for *input_text* in a gender-specific voice, then
    drive `calculate` to render a talking-head video for *image*.

    Parameters
    ----------
    image : str
        Path to the input face image (forwarded to `calculate`).
    input_text : str
        Text to synthesize.
    gender : str
        Voice selector; "female"/"Female" uses gTTS, "male"/"Male" uses a
        fairseq FastSpeech2 model. Comparison is case-insensitive (a
        backward-compatible generalization of the original two-spelling check).

    Returns
    -------
    str or None
        Path to the generated video, or None for an unrecognized gender.
    """
    if gender.lower() == 'female':
        # Female voice: Google TTS -> temporary mp3 -> wav via pydub.
        tts = gTTS(input_text)
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            tts.write_to_fp(f)
            f.seek(0)
            sound = AudioSegment.from_file(f.name, format="mp3")
            sound.export("/content/audio.wav", format="wav")
        return calculate(image, "/content/audio.wav")
    elif gender.lower() == 'male':
        print(gender)
        # Male voice: FastSpeech2 + HiFi-GAN vocoder from the HF hub, on GPU.
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "Voicemod/fastspeech2-en-male1",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        model = models[0].cuda()
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        generator = task.build_generator([model], cfg)

        sample = TTSHubInterface.get_model_input(task, input_text)
        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
        sample["speaker"] = sample["speaker"].cuda()

        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
        # Slow speech to 70% tempo. `-y` overwrites a stale output file —
        # without it a second run fails/hangs on ffmpeg's overwrite prompt.
        cmd = 'ffmpeg -y -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
        os.system(cmd)
        # FIX: original passed relative 'audio.wav' (ffmpeg wrote to
        # /content/audio.wav) and dropped the return value, so the male
        # branch returned None. Use the real path and return the video.
        return calculate(image, '/content/audio.wav')
 
75
 
76
 
77