suprimedev committed on
Commit
303ab27
·
verified ·
1 Parent(s): 078ab64

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -27
app.py CHANGED
@@ -1,15 +1,16 @@
1
  import gradio as gr
2
  import librosa
3
- import librosa.display
4
  import numpy as np
5
  from pydub import AudioSegment
6
  import io
7
  import os
 
8
 
9
  # Function to convert any audio to WAV using pydub
10
  def convert_to_wav(audio_file_path):
11
  try:
12
  audio = AudioSegment.from_file(audio_file_path)
 
13
  wav_file_path = audio_file_path + ".wav"
14
  audio.export(wav_file_path, format="wav")
15
  return wav_file_path
@@ -22,10 +23,13 @@ def voice_changer(source_audio_path, target_audio_path):
22
  raise gr.Error("Please upload both source and target audio files.")
23
 
24
  # Ensure audio files are in WAV format
25
- source_wav_path = convert_to_wav(source_audio_path)
26
- target_wav_path = convert_to_wav(target_audio_path)
27
 
28
  try:
 
 
 
29
  # Load audio files
30
  y_source, sr_source = librosa.load(source_wav_path, sr=None)
31
  y_target, sr_target = librosa.load(target_wav_path, sr=None)
@@ -35,54 +39,80 @@ def voice_changer(source_audio_path, target_audio_path):
35
  y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
36
  print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")
37
 
38
-
39
  # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
40
  # This is a very basic approach and not a full timbre transfer.
41
  # It tries to align the dominant pitch of the target with the source.
42
 
43
  # 1. Pitch Estimation for Source
44
- f0_source, voiced_flag_source, voiced_probs_source = librosa.display.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz('C1')).T, None, None
45
  try:
46
- f0_source, _, _ = librosa.pyin(y_source, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_source, frame_length=2048)
 
 
 
 
47
  except Exception as e:
48
- print(f"Pyin failed for source, trying different params or fallback: {e}")
49
- f0_source, _, _ = librosa.pyin(y_source, fmin=60, fmax=500, sr=sr_source, frame_length=2048) # More robust range
50
-
 
51
 
52
  # 2. Estimate F0 for Target
53
- f0_target, voiced_flag_target, voiced_probs_target = librosa.display.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz('C1')).T, None, None
54
  try:
55
- f0_target, _, _ = librosa.pyin(y_target, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_target, frame_length=2048)
 
 
56
  except Exception as e:
57
- print(f"Pyin failed for target, trying different params or fallback: {e}")
58
- f0_target, _, _ = librosa.pyin(y_target, fmin=60, fmax=500, sr=sr_target, frame_length=2048) # More robust range
 
 
59
 
60
-
61
- # Handle NaN values in f0_source (unvoiced segments)
62
- f0_source_interpolated = np.nan_to_num(f0_source, nan=0.0)
63
- f0_target_interpolated = np.nan_to_num(f0_target, nan=0.0)
64
 
65
  # Calculate a simple pitch shift ratio based on mean F0
66
  # This is very simplistic and doesn't account for variations over time.
67
  # A more advanced approach would involve temporal alignment and mapping.
68
- mean_f0_source = np.mean(f0_source_interpolated[f0_source_interpolated > 0])
69
- mean_f0_target = np.mean(f0_target_interpolated[f0_target_interpolated > 0])
70
 
71
- if mean_f0_target > 0 and mean_f0_source > 0:
72
  pitch_shift_factor = mean_f0_source / mean_f0_target
73
  else:
74
- pitch_shift_factor = 1.0 # No pitch shift if no valid pitch detected
75
 
76
  # Apply a pitch shift to the target audio
77
  # Using a simple `librosa.effects.pitch_shift` which is based on phase vocoder.
78
  # This is not PSOLA and can introduce artifacts.
79
  # The `n_steps` argument is in semitones.
 
80
  n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
 
 
81
 
82
  # Adjust the duration of the target audio to roughly match the source
83
  # This is a crude time stretching/compressing
84
- duration_ratio = len(y_source) / len(y_target)
85
- y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=duration_ratio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Apply pitch shift to the tempo-adjusted target audio
88
  y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)
@@ -99,10 +129,10 @@ def voice_changer(source_audio_path, target_audio_path):
99
  except Exception as e:
100
  raise gr.Error(f"An error occurred during voice processing: {e}")
101
  finally:
102
- # Clean up temporary WAV files
103
- if os.path.exists(source_wav_path):
104
  os.remove(source_wav_path)
105
- if os.path.exists(target_wav_path):
106
  os.remove(target_wav_path)
107
 
108
  # Gradio Interface
@@ -132,5 +162,5 @@ with gr.Blocks() as demo:
132
  )
133
 
134
  if __name__ == "__main__":
135
- import soundfile as sf # Required for sf.write
136
  demo.launch()
 
 
1
  import gradio as gr
2
  import librosa
 
3
  import numpy as np
4
  from pydub import AudioSegment
5
  import io
6
  import os
7
+ import soundfile as sf # Required for sf.write
8
 
9
  # Function to convert any audio to WAV using pydub
10
  def convert_to_wav(audio_file_path):
11
  try:
12
  audio = AudioSegment.from_file(audio_file_path)
13
+ # Create a temporary file path for WAV
14
  wav_file_path = audio_file_path + ".wav"
15
  audio.export(wav_file_path, format="wav")
16
  return wav_file_path
 
23
  raise gr.Error("Please upload both source and target audio files.")
24
 
25
  # Ensure audio files are in WAV format
26
+ source_wav_path = None
27
+ target_wav_path = None
28
 
29
  try:
30
+ source_wav_path = convert_to_wav(source_audio_path)
31
+ target_wav_path = convert_to_wav(target_audio_path)
32
+
33
  # Load audio files
34
  y_source, sr_source = librosa.load(source_wav_path, sr=None)
35
  y_target, sr_target = librosa.load(target_wav_path, sr=None)
 
39
  y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
40
  print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")
41
 
 
42
  # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
43
  # This is a very basic approach and not a full timbre transfer.
44
  # It tries to align the dominant pitch of the target with the source.
45
 
46
  # 1. Pitch Estimation for Source
47
+ # librosa.pyin returns (f0, voiced_flag, voiced_probabilities)
48
  try:
49
+ f0_source, voiced_flag_source, voiced_probs_source = librosa.pyin(
50
+ y_source, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_source
51
+ # frame_length is omitted here, so pyin falls back to its default (2048 samples)
52
+ # hop_length is likewise left at its default (frame_length // 4)
53
+ )
54
  except Exception as e:
55
+ print(f"Pyin failed for source with general range, trying broader range: {e}")
56
+ f0_source, voiced_flag_source, voiced_probs_source = librosa.pyin(
57
+ y_source, fmin=60, fmax=500, sr=sr_source # More robust range for typical speech
58
+ )
59
 
60
  # 2. Estimate F0 for Target
 
61
  try:
62
+ f0_target, voiced_flag_target, voiced_probs_target = librosa.pyin(
63
+ y_target, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_target
64
+ )
65
  except Exception as e:
66
+ print(f"Pyin failed for target with general range, trying broader range: {e}")
67
+ f0_target, voiced_flag_target, voiced_probs_target = librosa.pyin(
68
+ y_target, fmin=60, fmax=500, sr=sr_target # More robust range for typical speech
69
+ )
70
 
71
+ # Handle NaN values in f0 (unvoiced segments)
72
+ # Drop NaN frames (unvoiced segments) so that only voiced frames contribute to the mean
73
+ f0_source_valid = f0_source[~np.isnan(f0_source)]
74
+ f0_target_valid = f0_target[~np.isnan(f0_target)]
75
 
76
  # Calculate a simple pitch shift ratio based on mean F0
77
  # This is very simplistic and doesn't account for variations over time.
78
  # A more advanced approach would involve temporal alignment and mapping.
79
+ mean_f0_source = np.mean(f0_source_valid) if len(f0_source_valid) > 0 else 0
80
+ mean_f0_target = np.mean(f0_target_valid) if len(f0_target_valid) > 0 else 0
81
 
82
+ if mean_f0_target > 0.1 and mean_f0_source > 0.1: # Check for very small positive values
83
  pitch_shift_factor = mean_f0_source / mean_f0_target
84
  else:
85
+ pitch_shift_factor = 1.0 # No pitch shift if no valid pitch detected or both are silent
86
 
87
  # Apply a pitch shift to the target audio
88
  # Using a simple `librosa.effects.pitch_shift` which is based on phase vocoder.
89
  # This is not PSOLA and can introduce artifacts.
90
  # The `n_steps` argument is in semitones.
91
+ # log2(pitch_shift_factor) * 12 gives us semitones
92
  n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
93
+ print(f"Calculated pitch shift: {n_steps:.2f} semitones.")
94
+
95
 
96
  # Adjust the duration of the target audio to roughly match the source
97
  # This is a crude time stretching/compressing
98
+ # Using librosa.get_duration to handle potential discrepancies in array lengths
99
+ duration_source = librosa.get_duration(y=y_source, sr=sr_source)
100
+ duration_target = librosa.get_duration(y=y_target, sr=sr_target)
101
+
102
+ # Avoid division by zero
103
+ if duration_target > 0:
104
+ duration_ratio = duration_source / duration_target
105
+ else:
106
+ duration_ratio = 1.0 # No time change if target has no duration
107
+
108
+ print(f"Duration Source: {duration_source:.2f}s, Target: {duration_target:.2f}s, Ratio: {duration_ratio:.2f}")
109
+
110
+ if duration_ratio != 1.0:
111
+ # time_stretch accepts a non-integer rate directly; no hop_length needs to be computed.
112
+ # NOTE(review): rate > 1 speeds the signal up (shorter output) — confirm source/target is the intended ratio
113
+ y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=duration_ratio)
114
+ else:
115
+ y_target_adjusted_tempo = y_target # No stretching needed
116
 
117
  # Apply pitch shift to the tempo-adjusted target audio
118
  y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)
 
129
  except Exception as e:
130
  raise gr.Error(f"An error occurred during voice processing: {e}")
131
  finally:
132
+ # Clean up temporary WAV files irrespective of success/failure
133
+ if source_wav_path and os.path.exists(source_wav_path):
134
  os.remove(source_wav_path)
135
+ if target_wav_path and os.path.exists(target_wav_path):
136
  os.remove(target_wav_path)
137
 
138
  # Gradio Interface
 
162
  )
163
 
164
  if __name__ == "__main__":
 
165
  demo.launch()
166
+