ahk-d committed on
Commit
a3f80ab
·
verified ·
1 Parent(s): 50052ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -43
app.py CHANGED
@@ -4,13 +4,42 @@ import gradio as gr
4
  import requests
5
  import tempfile
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # Separate content and style options based on URL paths
8
- # This categorizes presets by looking for 'content' or 'style' keywords in the file paths
9
  CONTENT_OPTIONS = [key for key in INPUT_URLS.keys() if any(word in INPUT_URLS[key] for word in ['content'])]
10
  STYLE_OPTIONS = [key for key in INPUT_URLS.keys() if any(word in INPUT_URLS[key] for word in ['style'])]
11
 
12
- # Add remaining items to both lists if they don't contain 'content' or 'style' in their paths
13
- # This ensures all presets are available in both dropdowns for flexibility
14
  for key in INPUT_URLS.keys():
15
  if key not in CONTENT_OPTIONS and key not in STYLE_OPTIONS:
16
  CONTENT_OPTIONS.append(key)
@@ -19,14 +48,12 @@ for key in INPUT_URLS.keys():
19
  def load_audio_from_url(url, sr=None):
20
  """Load audio from URL by downloading to temporary file"""
21
  response = requests.get(url)
22
- # Create temporary file to store downloaded audio (deleted after use)
23
  with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
24
  tmp_file.write(response.content)
25
  tmp_file_path = tmp_file.name
26
 
27
- # Load audio using librosa and clean up temp file
28
  audio, _ = librosa.load(tmp_file_path, sr=sr)
29
- os.unlink(tmp_file_path) # Delete temporary file
30
  return audio
31
 
32
  def preview_content_preset(preset_name):
@@ -34,7 +61,6 @@ def preview_content_preset(preset_name):
34
  if preset_name and preset_name in INPUT_URLS:
35
  try:
36
  audio = load_audio_from_url(INPUT_URLS[preset_name], sr=exp.sr)
37
- # Limit to 5 seconds for preview
38
  preview_duration = 5
39
  max_samples = int(preview_duration * exp.sr)
40
  if len(audio) > max_samples:
@@ -50,7 +76,6 @@ def preview_style_preset(preset_name):
50
  if preset_name and preset_name in INPUT_URLS:
51
  try:
52
  audio = load_audio_from_url(INPUT_URLS[preset_name], sr=exp.sr)
53
- # Limit to 5 seconds for preview
54
  preview_duration = 5
55
  max_samples = int(preview_duration * exp.sr)
56
  if len(audio) > max_samples:
@@ -62,13 +87,9 @@ def preview_style_preset(preset_name):
62
  return None
63
 
64
  def process_timbre_transfer(content_file, content_preset, style_file, style_preset, max_duration=8):
65
- """Process timbre transfer with uploaded files or presets
66
-
67
- Allows mixing: upload + preset or preset + upload
68
- """
69
  try:
70
- # Load content audio (musical notes/melody to preserve)
71
- # Priority: uploaded file > preset selection
72
  if content_file is not None:
73
  a_content, _ = librosa.load(content_file, sr=exp.sr)
74
  elif content_preset and content_preset in INPUT_URLS:
@@ -76,8 +97,7 @@ def process_timbre_transfer(content_file, content_preset, style_file, style_pres
76
  else:
77
  return None, "Please upload a content file or select a content preset"
78
 
79
- # Load style audio (timbre/texture to apply)
80
- # Priority: uploaded file > preset selection
81
  if style_file is not None:
82
  a_style, _ = librosa.load(style_file, sr=exp.sr)
83
  elif style_preset and style_preset in INPUT_URLS:
@@ -85,34 +105,32 @@ def process_timbre_transfer(content_file, content_preset, style_file, style_pres
85
  else:
86
  return None, "Please upload a style file or select a style preset"
87
 
88
- # Limit duration to prevent memory issues and long processing times
89
  max_samples = int(max_duration * exp.sr)
90
  if len(a_content) > max_samples:
91
- a_content = a_content[:max_samples] # Truncate to max duration
92
  if len(a_style) > max_samples:
93
  a_style = a_style[:max_samples]
94
 
95
- # Preprocess: Convert audio to model input format (spectrograms)
96
  s_content = torch.as_tensor(exp.preprocess(a_content), device=exp.device)[None, :]
97
  s_style = torch.as_tensor(exp.preprocess(a_style), device=exp.device)[None, :]
98
- # Create length tensors for variable-length input handling
99
  l_content, l_style = (torch.as_tensor([x.shape[2]], device=exp.device) for x in [s_content, s_style])
100
 
101
- # Run model: Extract content features, extract style features, then recombine
102
- with torch.no_grad(): # Disable gradient computation for inference
103
  s_output = exp.model(input_c=s_content, input_s=s_style,
104
  length_c=l_content, length_s=l_style)
105
 
106
- # Postprocess: Convert model output back to audio waveform
107
  a_output = exp.postprocess(s_output.cpu().numpy()[0])
108
 
109
- # Return audio in format expected by Gradio: (sample_rate, audio_array)
110
  return (exp.sr, a_output), "Transfer completed successfully!"
111
 
112
  except Exception as e:
113
  return None, f"Error: {str(e)}"
114
 
115
- # Create Gradio interface with modern theme
116
  with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
117
  gr.Markdown("""
118
  # 🎵 VQ-VAE Timbre Transfer Demo
@@ -131,17 +149,15 @@ with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
131
  - Try different combinations - unexpected results can be musically interesting!
132
  """)
133
 
134
- # Two-column layout for content and style inputs
135
  with gr.Row():
136
  with gr.Column():
137
  gr.Markdown("### 🎼 Content Audio")
138
  content_file = gr.Audio(label="Upload Content Audio", type="filepath")
139
  content_preset = gr.Dropdown(
140
- choices=[""] + CONTENT_OPTIONS, # Empty string allows no selection
141
  label="Or choose preset",
142
- value="" # No default selection
143
  )
144
- # Preview audio for content preset
145
  content_preview = gr.Audio(
146
  label="🔊 Content Preview (5s)",
147
  interactive=False,
@@ -154,26 +170,22 @@ with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
154
  style_preset = gr.Dropdown(
155
  choices=[""] + STYLE_OPTIONS,
156
  label="Or choose preset",
157
- value="Electric Guitar Close" # Default style selection
158
  )
159
- # Preview audio for style preset
160
  style_preview = gr.Audio(
161
  label="🔊 Style Preview (5s)",
162
  interactive=False,
163
- visible=True # Visible by default since we have a default selection
164
  )
165
 
166
- # Duration control to balance quality vs processing time
167
  max_duration = gr.Slider(1, 15, value=8, step=1, label="Max Duration (seconds)")
168
-
169
  process_btn = gr.Button("🚀 Transfer Timbre", variant="primary", size="lg")
170
 
171
- # Output section
172
  with gr.Row():
173
  output_audio = gr.Audio(label="🎵 Output Audio", interactive=False)
174
  status_msg = gr.Textbox(label="Status", interactive=False, max_lines=3)
175
 
176
- # Hide previews when user uploads their own files
177
  content_file.change(
178
  fn=lambda file: gr.update(visible=False) if file is not None else None,
179
  inputs=[content_file],
@@ -186,11 +198,11 @@ with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
186
  outputs=[style_preview]
187
  )
188
 
189
- # Connect preset selection to audio preview (only when no file uploaded)
190
  content_preset.change(
191
  fn=lambda preset, file: (
192
  preview_content_preset(preset) if preset and file is None else None,
193
- gr.update(visible=bool(preset and file is None)) # Show only if preset selected and no file uploaded
194
  ),
195
  inputs=[content_preset, content_file],
196
  outputs=[content_preview, content_preview]
@@ -199,24 +211,24 @@ with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
199
  style_preset.change(
200
  fn=lambda preset, file: (
201
  preview_style_preset(preset) if preset and file is None else None,
202
- gr.update(visible=bool(preset and file is None)) # Show only if preset selected and no file uploaded
203
  ),
204
  inputs=[style_preset, style_file],
205
  outputs=[style_preview, style_preview]
206
  )
207
 
208
- # Load default style preview on startup
209
  demo.load(
210
  fn=lambda: preview_style_preset("Electric Guitar Close"),
211
  outputs=[style_preview]
212
  )
213
 
214
- # Connect button click to processing function
215
  process_btn.click(
216
  fn=process_timbre_transfer,
217
  inputs=[content_file, content_preset, style_file, style_preset, max_duration],
218
  outputs=[output_audio, status_msg]
219
  )
220
 
221
- # Launch with public sharing enabled and debug mode for development
222
- demo.launch(share=True, debug=True, height=1400) # Increased height for preview components
 
4
  import requests
5
  import tempfile
6
 
7
+ # Preset audio URLs
8
+ INPUT_ROOT = 'https://adasp.telecom-paris.fr/rc-ext/demos_companion-pages/vqvae_examples/'
9
+ INPUT_URLS = {
10
+ 'Electric Guitar': INPUT_ROOT + 'real/content/UnicornRodeo_Maybe_UnicornRodeo_Maybe_Full_25_ElecGtr2CloseMic3.0148.mp3',
11
+ 'Electric Organ': INPUT_ROOT + 'real/style/AllenStone_Naturally_Allen%20Stone_Naturally_Keys-Organ-Active%20DI.0253.mp3',
12
+ 'Jazz Piano': INPUT_ROOT + 'real/style/MaurizioPagnuttiSextet_AllTheGinIsGone_MaurizioPagnuttiSextet_AllTheGinIsGone_Full_12_PianoMics1.08.mp3',
13
+ 'Synth': INPUT_ROOT + 'real/content/Skelpolu_TogetherAlone_Skelpolu_TogetherAlone_Full_13_Synth.0190.mp3',
14
+ 'Rhodes DI': INPUT_ROOT + 'real/content/Diesel13_ColourMeRed_Diesel13_ColourMeRed_Full_30_RhodesDI.0062.mp3',
15
+ 'Acoustic Guitar Lead': INPUT_ROOT + 'real/style/NikolaStajicFtVlasisKostas_Nalim_Nikola%20Stajic%20ft.%20Vlasis%20Kostas_Nalim_Acoustic%20Guitar-Lead-Ela%20M%20251.0170.mp3',
16
+ 'Bass Amp': INPUT_ROOT + 'real/content/HurrayForTheRiffRaff_LivingInTheCity_Hurray%20for%20the%20Riff%20Raff_Livin%20in%20the%20City_Bass-Amp-M82.0018.mp3',
17
+ 'Bass Bip': INPUT_ROOT + 'real/style/RememberDecember_CUNextTime_RememberDecember_CUNextTime_Full_11_Bass_bip.041.mp3',
18
+ 'SynthFX': INPUT_ROOT + 'real/content/MR0902_JamesElder_MR0902_JamesElder_Full_13_SynthFX1.163.mp3',
19
+ 'Electric Guitar Close': INPUT_ROOT + 'real/style/Fergessen_TheWind_Fergessen_TheWind_Full_17_SlecGtr3a_Close.146.mp3',
20
+ 'Rhodes NBATG': INPUT_ROOT + 'real/content/NickiBluhmAndTheGramblers_GoGoGo_NBATG%20-%20Rhodes%20-%20DI.098.mp3',
21
+ 'Keys DI Grace': INPUT_ROOT + 'real/style/JessicaChildress_SlowDown_SD%20KEYS-DI-GRACE.147.mp3',
22
+ 'Dulcimer': INPUT_ROOT + 'real/content/ButterflyEffect_PreachRightHere_ButterflyEffect_PreachRightHere_Full_16_Dulcimer2.076.mp3',
23
+ 'Strings Section': INPUT_ROOT + 'real/style/AngeloBoltini_ThisTown_AngeloBoltini_ThisTown_Full_47_Strings_SectionMic_Vln2.0181.mp3',
24
+ 'Mellotron': INPUT_ROOT + 'real/content/Triviul_Dorothy_Triviul_Dorothy_Full_07_Mellotron.120.mp3',
25
+ 'Acoustic Guitar CU': INPUT_ROOT + 'real/style/UncleDad_WhoIAm_legend-strings_AC%20GUITAR-3-CU29-SHADOWHILL.R.0106.mp3',
26
+ 'Fiddle': INPUT_ROOT + 'real/content/EndaReilly_CurAnLongAgSeol_EndaReilly_CurAnLongAgSeol_Full_10_Fiddle2.0163.mp3',
27
+ 'Violins': INPUT_ROOT + 'real/style/ScottElliott_AeternumVale_ScottElliott_AeternumVale_Full_41_Violins.0138.mp3',
28
+ 'Upright Bass': INPUT_ROOT + 'real/content/AbletonesBigBand_SongOfIndia_UPRIGHT%20BASS%20-%20ELA%20M%20260%20-%20Neve%2033102.136.mp3',
29
+ 'Taiko': INPUT_ROOT + 'real/style/CarlosGonzalez_APlaceForUs_CarlosGonzalez_APlaceForUs_Full_21_Taiko.0115.mp3',
30
+ 'Guitar 2': INPUT_ROOT + 'real/content/AllHandsLost_Ambitions_AllHandsLost_Ambitions_Full_Guitar%202.0292.mp3',
31
+ 'Alto Sax': INPUT_ROOT + 'real/style/SunshineGarciaBand_ForIAmTheMoon_zip5-outro-uke-shaker_OUTRO%20ALTO-251E-SSL6000E.0290.mp3',
32
+ 'Bass Close Mic': INPUT_ROOT + 'real/content/DonCamilloChoir_MarshMarigoldsSong_DonCamilloChoir_MarshMarigoldsSong_Full_08_BassCloseMic2.000.mp3',
33
+ 'Electric Guitar Distorted': INPUT_ROOT + 'real/style/EnterTheHaggis_TwoBareHands_25.%20Jubilee%20Riots%20-%202%20Bar%20Hands_ELE%20Guitars-Ignater-M81.160.mp3',
34
+ 'Bells': INPUT_ROOT + 'real/content/cryonicPAX_Melancholy_cryonicPAX_Melancholy_Full_10_Bells.0034.mp3',
35
+ 'Bass Mic 647': INPUT_ROOT + 'real/style/KungFu_JoyRide_40.%20Kung%20Fu%20-%20Joy%20ride_Bass-Mic-647.0090.mp3',
36
+ }
37
+
38
  # Separate content and style options based on URL paths
 
39
  CONTENT_OPTIONS = [key for key in INPUT_URLS.keys() if any(word in INPUT_URLS[key] for word in ['content'])]
40
  STYLE_OPTIONS = [key for key in INPUT_URLS.keys() if any(word in INPUT_URLS[key] for word in ['style'])]
41
 
42
+ # Add remaining items to both lists if they don't contain 'content' or 'style'
 
43
  for key in INPUT_URLS.keys():
44
  if key not in CONTENT_OPTIONS and key not in STYLE_OPTIONS:
45
  CONTENT_OPTIONS.append(key)
 
48
  def load_audio_from_url(url, sr=None):
49
  """Load audio from URL by downloading to temporary file"""
50
  response = requests.get(url)
 
51
  with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
52
  tmp_file.write(response.content)
53
  tmp_file_path = tmp_file.name
54
 
 
55
  audio, _ = librosa.load(tmp_file_path, sr=sr)
56
+ os.unlink(tmp_file_path)
57
  return audio
58
 
59
  def preview_content_preset(preset_name):
 
61
  if preset_name and preset_name in INPUT_URLS:
62
  try:
63
  audio = load_audio_from_url(INPUT_URLS[preset_name], sr=exp.sr)
 
64
  preview_duration = 5
65
  max_samples = int(preview_duration * exp.sr)
66
  if len(audio) > max_samples:
 
76
  if preset_name and preset_name in INPUT_URLS:
77
  try:
78
  audio = load_audio_from_url(INPUT_URLS[preset_name], sr=exp.sr)
 
79
  preview_duration = 5
80
  max_samples = int(preview_duration * exp.sr)
81
  if len(audio) > max_samples:
 
87
  return None
88
 
89
  def process_timbre_transfer(content_file, content_preset, style_file, style_preset, max_duration=8):
90
+ """Process timbre transfer with uploaded files or presets"""
 
 
 
91
  try:
92
+ # Load content audio
 
93
  if content_file is not None:
94
  a_content, _ = librosa.load(content_file, sr=exp.sr)
95
  elif content_preset and content_preset in INPUT_URLS:
 
97
  else:
98
  return None, "Please upload a content file or select a content preset"
99
 
100
+ # Load style audio
 
101
  if style_file is not None:
102
  a_style, _ = librosa.load(style_file, sr=exp.sr)
103
  elif style_preset and style_preset in INPUT_URLS:
 
105
  else:
106
  return None, "Please upload a style file or select a style preset"
107
 
108
+ # Limit duration
109
  max_samples = int(max_duration * exp.sr)
110
  if len(a_content) > max_samples:
111
+ a_content = a_content[:max_samples]
112
  if len(a_style) > max_samples:
113
  a_style = a_style[:max_samples]
114
 
115
+ # Preprocess
116
  s_content = torch.as_tensor(exp.preprocess(a_content), device=exp.device)[None, :]
117
  s_style = torch.as_tensor(exp.preprocess(a_style), device=exp.device)[None, :]
 
118
  l_content, l_style = (torch.as_tensor([x.shape[2]], device=exp.device) for x in [s_content, s_style])
119
 
120
+ # Run model
121
+ with torch.no_grad():
122
  s_output = exp.model(input_c=s_content, input_s=s_style,
123
  length_c=l_content, length_s=l_style)
124
 
125
+ # Postprocess
126
  a_output = exp.postprocess(s_output.cpu().numpy()[0])
127
 
 
128
  return (exp.sr, a_output), "Transfer completed successfully!"
129
 
130
  except Exception as e:
131
  return None, f"Error: {str(e)}"
132
 
133
+ # Create Gradio interface
134
  with gr.Blocks(title="VQ-VAE Timbre Transfer", theme=gr.themes.Soft()) as demo:
135
  gr.Markdown("""
136
  # 🎵 VQ-VAE Timbre Transfer Demo
 
149
  - Try different combinations - unexpected results can be musically interesting!
150
  """)
151
 
 
152
  with gr.Row():
153
  with gr.Column():
154
  gr.Markdown("### 🎼 Content Audio")
155
  content_file = gr.Audio(label="Upload Content Audio", type="filepath")
156
  content_preset = gr.Dropdown(
157
+ choices=[""] + CONTENT_OPTIONS,
158
  label="Or choose preset",
159
+ value=""
160
  )
 
161
  content_preview = gr.Audio(
162
  label="🔊 Content Preview (5s)",
163
  interactive=False,
 
170
  style_preset = gr.Dropdown(
171
  choices=[""] + STYLE_OPTIONS,
172
  label="Or choose preset",
173
+ value="Electric Guitar Close"
174
  )
 
175
  style_preview = gr.Audio(
176
  label="🔊 Style Preview (5s)",
177
  interactive=False,
178
+ visible=True
179
  )
180
 
 
181
  max_duration = gr.Slider(1, 15, value=8, step=1, label="Max Duration (seconds)")
 
182
  process_btn = gr.Button("🚀 Transfer Timbre", variant="primary", size="lg")
183
 
 
184
  with gr.Row():
185
  output_audio = gr.Audio(label="🎵 Output Audio", interactive=False)
186
  status_msg = gr.Textbox(label="Status", interactive=False, max_lines=3)
187
 
188
+ # Hide previews when user uploads files
189
  content_file.change(
190
  fn=lambda file: gr.update(visible=False) if file is not None else None,
191
  inputs=[content_file],
 
198
  outputs=[style_preview]
199
  )
200
 
201
+ # Show previews when presets are selected
202
  content_preset.change(
203
  fn=lambda preset, file: (
204
  preview_content_preset(preset) if preset and file is None else None,
205
+ gr.update(visible=bool(preset and file is None))
206
  ),
207
  inputs=[content_preset, content_file],
208
  outputs=[content_preview, content_preview]
 
211
  style_preset.change(
212
  fn=lambda preset, file: (
213
  preview_style_preset(preset) if preset and file is None else None,
214
+ gr.update(visible=bool(preset and file is None))
215
  ),
216
  inputs=[style_preset, style_file],
217
  outputs=[style_preview, style_preview]
218
  )
219
 
220
+ # Load default style preview
221
  demo.load(
222
  fn=lambda: preview_style_preset("Electric Guitar Close"),
223
  outputs=[style_preview]
224
  )
225
 
226
+ # Process button
227
  process_btn.click(
228
  fn=process_timbre_transfer,
229
  inputs=[content_file, content_preset, style_file, style_preset, max_duration],
230
  outputs=[output_audio, status_msg]
231
  )
232
 
233
+ # Launch
234
+ demo.launch(share=True, debug=True, height=1400)