SWivid committed on
Commit
757cc7d
·
1 Parent(s): cabfcc4

Fix #1046 tempfile related bug

Browse files
src/f5_tts/infer/infer_gradio.py CHANGED
@@ -3,6 +3,7 @@
3
 
4
  import gc
5
  import json
 
6
  import re
7
  import tempfile
8
  from collections import OrderedDict
@@ -189,16 +190,20 @@ def infer(
189
 
190
  # Remove silence
191
  if remove_silence:
192
- with tempfile.NamedTemporaryFile(suffix=".wav") as f:
193
- sf.write(f.name, final_wave, final_sample_rate)
 
 
194
  remove_silence_for_generated_wav(f.name)
195
  final_wave, _ = torchaudio.load(f.name)
 
 
196
  final_wave = final_wave.squeeze().cpu().numpy()
197
 
198
  # Save the spectrogram
199
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
200
  spectrogram_path = tmp_spectrogram.name
201
- save_spectrogram(combined_spectrogram, spectrogram_path)
202
 
203
  return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
204
 
 
3
 
4
  import gc
5
  import json
6
+ import os
7
  import re
8
  import tempfile
9
  from collections import OrderedDict
 
190
 
191
  # Remove silence
192
  if remove_silence:
193
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
194
+ temp_path = f.name
195
+ try:
196
+ sf.write(temp_path, final_wave, final_sample_rate)
197
  remove_silence_for_generated_wav(f.name)
198
  final_wave, _ = torchaudio.load(f.name)
199
+ finally:
200
+ os.unlink(temp_path)
201
  final_wave = final_wave.squeeze().cpu().numpy()
202
 
203
  # Save the spectrogram
204
+ with tempfile.NamedTemporaryFile(suffix=".png", delete_on_close=False) as tmp_spectrogram:
205
  spectrogram_path = tmp_spectrogram.name
206
+ save_spectrogram(combined_spectrogram, spectrogram_path)
207
 
208
  return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
209
 
src/f5_tts/infer/utils_infer.py CHANGED
@@ -306,42 +306,44 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
306
  ref_audio = _ref_audio_cache[audio_hash]
307
 
308
  else: # first pass, do preprocess
309
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
310
- aseg = AudioSegment.from_file(ref_audio_orig)
311
 
312
- # 1. try to find long silence for clipping
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  non_silent_segs = silence.split_on_silence(
314
- aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
315
  )
316
  non_silent_wave = AudioSegment.silent(duration=0)
317
  for non_silent_seg in non_silent_segs:
318
  if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
319
- show_info("Audio is over 12s, clipping short. (1)")
320
  break
321
  non_silent_wave += non_silent_seg
322
 
323
- # 2. try to find short silence for clipping if 1. failed
324
- if len(non_silent_wave) > 12000:
325
- non_silent_segs = silence.split_on_silence(
326
- aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
327
- )
328
- non_silent_wave = AudioSegment.silent(duration=0)
329
- for non_silent_seg in non_silent_segs:
330
- if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
331
- show_info("Audio is over 12s, clipping short. (2)")
332
- break
333
- non_silent_wave += non_silent_seg
334
-
335
- aseg = non_silent_wave
336
-
337
- # 3. if no proper silence found for clipping
338
- if len(aseg) > 12000:
339
- aseg = aseg[:12000]
340
- show_info("Audio is over 12s, clipping short. (3)")
341
-
342
- aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
343
- aseg.export(f.name, format="wav")
344
- ref_audio = f.name
345
 
346
  # Cache the processed reference audio
347
  _ref_audio_cache[audio_hash] = ref_audio
 
306
  ref_audio = _ref_audio_cache[audio_hash]
307
 
308
  else: # first pass, do preprocess
309
+ with tempfile.NamedTemporaryFile(delete_on_close=False, suffix=".wav") as f:
310
+ temp_path = f.name
311
 
312
+ aseg = AudioSegment.from_file(ref_audio_orig)
313
+
314
+ # 1. try to find long silence for clipping
315
+ non_silent_segs = silence.split_on_silence(
316
+ aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
317
+ )
318
+ non_silent_wave = AudioSegment.silent(duration=0)
319
+ for non_silent_seg in non_silent_segs:
320
+ if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
321
+ show_info("Audio is over 12s, clipping short. (1)")
322
+ break
323
+ non_silent_wave += non_silent_seg
324
+
325
+ # 2. try to find short silence for clipping if 1. failed
326
+ if len(non_silent_wave) > 12000:
327
  non_silent_segs = silence.split_on_silence(
328
+ aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
329
  )
330
  non_silent_wave = AudioSegment.silent(duration=0)
331
  for non_silent_seg in non_silent_segs:
332
  if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
333
+ show_info("Audio is over 12s, clipping short. (2)")
334
  break
335
  non_silent_wave += non_silent_seg
336
 
337
+ aseg = non_silent_wave
338
+
339
+ # 3. if no proper silence found for clipping
340
+ if len(aseg) > 12000:
341
+ aseg = aseg[:12000]
342
+ show_info("Audio is over 12s, clipping short. (3)")
343
+
344
+ aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
345
+ aseg.export(temp_path, format="wav")
346
+ ref_audio = temp_path
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # Cache the processed reference audio
349
  _ref_audio_cache[audio_hash] = ref_audio