quazim committed on
Commit
a47122b
·
1 Parent(s): e38c357
Files changed (1) hide show
  1. app.py +101 -103
app.py CHANGED
@@ -13,7 +13,14 @@ from transformers import AutoProcessor, pipeline
13
  from elastic_models.transformers import MusicgenForConditionalGeneration
14
 
15
  MODEL_CONFIG = {
16
- 'cost_per_hour': 1.8, # $1.8 per hour
 
 
 
 
 
 
 
17
  }
18
 
19
  original_time_cache = {"original_time": 22.57}
@@ -256,6 +263,13 @@ def calculate_cost_savings(compressed_time, original_time):
256
  }
257
 
258
 
 
 
 
 
 
 
 
259
  def get_cache_key(prompt, duration, guidance_scale):
260
  return f"{hash(prompt)}_{duration}_{guidance_scale}"
261
 
@@ -266,10 +280,11 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
266
  generator, processor = load_model()
267
  model_name = "Compressed (S)"
268
 
269
- print(f"[GENERATION] Starting batch generation using {model_name} model...")
270
  print(f"[GENERATION] Prompt: '{text_prompt}'")
271
  print(f"[GENERATION] Duration: {duration}s")
272
  print(f"[GENERATION] Guidance scale: {guidance_scale}")
 
273
 
274
  cleanup_gpu()
275
  set_seed(42)
@@ -285,31 +300,33 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
285
  'cache_implementation': 'paged',
286
  }
287
 
288
- prompts = [text_prompt] * 4
 
 
289
  start_time = time.time()
290
  outputs = generator(
291
  prompts,
292
- batch_size=4,
293
  generate_kwargs=generation_params
294
  )
295
  generation_time = time.time() - start_time
296
 
297
- print(f"[GENERATION] Batch generation completed in {generation_time:.2f}s")
298
 
299
  audio_variants = []
300
  sample_rate = outputs[0]['sampling_rate']
301
-
302
  for i, output in enumerate(outputs):
303
  audio_data = output['audio']
304
-
305
  print(f"[GENERATION] Processing variant {i + 1} audio shape: {audio_data.shape}")
306
-
307
  if hasattr(audio_data, 'cpu'):
308
  audio_data = audio_data.cpu().numpy()
309
 
310
  if len(audio_data.shape) == 3:
311
  audio_data = audio_data[0]
312
-
313
  if len(audio_data.shape) == 2:
314
  if audio_data.shape[0] < audio_data.shape[1]:
315
  audio_data = audio_data.T
@@ -317,58 +334,26 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
317
  audio_data = audio_data[:, 0]
318
  else:
319
  audio_data = audio_data.flatten()
320
-
321
  audio_data = audio_data.flatten()
322
-
323
  max_val = np.max(np.abs(audio_data))
324
  if max_val > 0:
325
  audio_data = audio_data / max_val * 0.95
326
-
327
  audio_data = (audio_data * 32767).astype(np.int16)
328
  audio_variants.append((sample_rate, audio_data))
329
-
330
  print(f"[GENERATION] Variant {i + 1} final shape: {audio_data.shape}")
331
 
332
- comparison_message = ""
333
-
334
- if "original_time" in original_time_cache:
335
- original_time = original_time_cache["original_time"]
336
- cost_info = calculate_cost_savings(generation_time, original_time)
337
-
338
- comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
339
- print(f"[COST] Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%)")
340
- else:
341
- try:
342
- print(f"[TIMING] Measuring original model speed for comparison...")
343
- original_generator, original_processor = load_original_model()
344
-
345
- original_start = time.time()
346
- original_outputs = original_generator(
347
- prompts,
348
- batch_size=4,
349
- generate_kwargs=generation_params
350
- )
351
- original_time = time.time() - original_start
352
-
353
- original_time_cache[cache_key] = original_time
354
-
355
- cost_info = calculate_cost_savings(generation_time, original_time)
356
- comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
357
- print(
358
- f"[COST] First comparison - Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%)")
359
- print(f"[TIMING] Original: {original_time:.2f}s, Compressed: {generation_time:.2f}s")
360
-
361
- del original_generator, original_processor
362
- cleanup_gpu()
363
- print(f"[CLEANUP] Original model cleaned up after timing measurement")
364
-
365
- except Exception as e:
366
- print(f"[WARNING] Could not measure original timing: {e}")
367
- compressed_cost = calculate_generation_cost(generation_time, 'S')
368
- comparison_message = f"💸 Compressed Cost: ${compressed_cost:.4f} (could not compare with original)"
369
-
370
- generation_info = f"✅ Generated 4 variants in {generation_time:.2f}s\n{comparison_message}"
371
 
 
 
 
 
 
372
  return audio_variants[0], audio_variants[1], audio_variants[2], audio_variants[3], generation_info
373
 
374
  except Exception as e:
@@ -378,63 +363,71 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
378
  return None, None, None, None, error_msg
379
 
380
 
381
- with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
382
  gr.Markdown("# 🎵 MusicGen Large Music Generator")
 
383
  gr.Markdown(
384
- "Generate music from text descriptions using Facebook's MusicGen Large model accelerated by TheStage for 2.3x faster performance")
385
-
386
- with gr.Row():
387
- with gr.Column():
388
- text_input = gr.Textbox(
389
- label="Music Description",
390
- placeholder="Enter a description of the music you want to generate",
391
- lines=3,
392
- value="A groovy funk bassline with a tight drum beat"
393
- )
394
-
395
- with gr.Row():
396
- duration = gr.Slider(
397
- minimum=5,
398
- maximum=30,
399
- value=10,
400
- step=1,
401
- label="Duration (seconds)"
402
- )
403
- guidance_scale = gr.Slider(
404
- minimum=1.0,
405
- maximum=10.0,
406
- value=3.0,
407
- step=0.5,
408
- label="Guidance Scale",
409
- info="Higher values follow prompt more closely"
410
- )
411
-
412
- generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg")
413
-
414
- with gr.Column():
415
- generation_info = gr.Markdown("Ready to generate music variants with cost comparison vs original model")
416
-
417
- with gr.Row():
418
- audio_output1 = gr.Audio(label="Variant 1", type="numpy")
419
- audio_output2 = gr.Audio(label="Variant 2", type="numpy")
420
-
421
- with gr.Row():
422
- audio_output3 = gr.Audio(label="Variant 3", type="numpy")
423
- audio_output4 = gr.Audio(label="Variant 4", type="numpy")
424
-
425
- with gr.Accordion("Tips", open=False):
426
- gr.Markdown("""
427
- - Be specific in your descriptions (e.g., "slow blues guitar with harmonica")
428
- - Higher guidance scale = follows prompt more closely
429
- - Lower guidance scale = more creative/varied results
430
- - Duration is limited to 30 seconds for faster generation
431
- """)
432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
  def generate_simple(text_prompt, duration, guidance_scale):
435
  return generate_music_batch(text_prompt, duration, guidance_scale, "compressed")
436
 
437
-
438
  generate_btn.click(
439
  fn=generate_simple,
440
  inputs=[text_input, duration, guidance_scale],
@@ -460,7 +453,12 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
460
  gr.Markdown("---")
461
  gr.Markdown("""
462
  <div style="text-align: center; color: #666; font-size: 12px; margin-top: 2rem;">
463
- <strong>Limitations:</strong><br>
 
 
 
 
 
464
  • The model is not able to generate realistic vocals.<br>
465
  • The model has been trained with English descriptions and will not perform as well in other languages.<br>
466
  • The model does not perform equally well for all music styles and cultures.<br>
 
13
  from elastic_models.transformers import MusicgenForConditionalGeneration
14
 
15
  MODEL_CONFIG = {
16
+ 'cost_per_hour': 1.8, # $1.8 per hour on L40S
17
+ 'cost_savings_1000h': {
18
+ 'savings_dollars': 8.4, # $8.4 saved per 1000 hours
19
+ 'savings_percent': 74.9, # 74.9% savings
20
+ 'compressed_cost': 2.8, # $2.8 for compressed
21
+ 'original_cost': 11.3, # $11.3 for original
22
+ },
23
+ 'batch_mode': False
24
  }
25
 
26
  original_time_cache = {"original_time": 22.57}
 
263
  }
264
 
265
 
266
def get_fixed_savings_message(config=None):
    """Return the fixed cost-savings banner as a one-line Markdown string.

    Args:
        config: Optional mapping providing the keys ``'savings_dollars'``,
            ``'savings_percent'``, ``'compressed_cost'`` and
            ``'original_cost'``. When omitted, the ``'cost_savings_1000h'``
            entry of the module-level ``MODEL_CONFIG`` is used, so existing
            zero-argument callers see the same output as before.

    Returns:
        str: The banner text, with every figure formatted to one decimal
        place.

    Raises:
        KeyError: If a required key is missing from the mapping.
    """
    if config is None:
        config = MODEL_CONFIG['cost_savings_1000h']

    # Adjacent f-strings inside parentheses are concatenated at compile time;
    # this replaces the fragile backslash line continuations.
    return (
        f"💰 **Cost Savings on L40S (1000h)**: ${config['savings_dollars']:.1f}"
        f" ({config['savings_percent']:.1f}%) - Compressed: ${config['compressed_cost']:.1f} "
        f"vs Original: ${config['original_cost']:.1f}"
    )
271
+
272
+
273
  def get_cache_key(prompt, duration, guidance_scale):
274
  return f"{hash(prompt)}_{duration}_{guidance_scale}"
275
 
 
280
  generator, processor = load_model()
281
  model_name = "Compressed (S)"
282
 
283
+ print(f"[GENERATION] Starting generation using {model_name} model...")
284
  print(f"[GENERATION] Prompt: '{text_prompt}'")
285
  print(f"[GENERATION] Duration: {duration}s")
286
  print(f"[GENERATION] Guidance scale: {guidance_scale}")
287
+ print(f"[GENERATION] Batch mode: {MODEL_CONFIG['batch_mode']}")
288
 
289
  cleanup_gpu()
290
  set_seed(42)
 
300
  'cache_implementation': 'paged',
301
  }
302
 
303
+ batch_size = 4 if MODEL_CONFIG['batch_mode'] else 1
304
+ prompts = [text_prompt] * batch_size
305
+
306
  start_time = time.time()
307
  outputs = generator(
308
  prompts,
309
+ batch_size=batch_size,
310
  generate_kwargs=generation_params
311
  )
312
  generation_time = time.time() - start_time
313
 
314
+ print(f"[GENERATION] Generation completed in {generation_time:.2f}s")
315
 
316
  audio_variants = []
317
  sample_rate = outputs[0]['sampling_rate']
318
+
319
  for i, output in enumerate(outputs):
320
  audio_data = output['audio']
321
+
322
  print(f"[GENERATION] Processing variant {i + 1} audio shape: {audio_data.shape}")
323
+
324
  if hasattr(audio_data, 'cpu'):
325
  audio_data = audio_data.cpu().numpy()
326
 
327
  if len(audio_data.shape) == 3:
328
  audio_data = audio_data[0]
329
+
330
  if len(audio_data.shape) == 2:
331
  if audio_data.shape[0] < audio_data.shape[1]:
332
  audio_data = audio_data.T
 
334
  audio_data = audio_data[:, 0]
335
  else:
336
  audio_data = audio_data.flatten()
337
+
338
  audio_data = audio_data.flatten()
339
+
340
  max_val = np.max(np.abs(audio_data))
341
  if max_val > 0:
342
  audio_data = audio_data / max_val * 0.95
343
+
344
  audio_data = (audio_data * 32767).astype(np.int16)
345
  audio_variants.append((sample_rate, audio_data))
346
+
347
  print(f"[GENERATION] Variant {i + 1} final shape: {audio_data.shape}")
348
 
349
+ while len(audio_variants) < 4:
350
+ audio_variants.append(None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
+ savings_message = get_fixed_savings_message()
353
+
354
+ variants_text = "4 variants" if MODEL_CONFIG['batch_mode'] else "1 variant"
355
+ generation_info = f"✅ Generated {variants_text} in {generation_time:.2f}s\n{savings_message}"
356
+
357
  return audio_variants[0], audio_variants[1], audio_variants[2], audio_variants[3], generation_info
358
 
359
  except Exception as e:
 
363
  return None, None, None, None, error_msg
364
 
365
 
366
+ with gr.Blocks(title="MusicGen Large - Music Generation", theme=gr.themes.Soft()) as demo:
367
  gr.Markdown("# 🎵 MusicGen Large Music Generator")
368
+
369
  gr.Markdown(
370
+ f"Generate music from text descriptions using Facebook's MusicGen "
371
+ f"Large model accelerated by TheStage for 2.3x faster performance.")
372
+
373
+ with gr.Column():
374
+ text_input = gr.Textbox(
375
+ label="Music Description",
376
+ placeholder="Enter a description of the music you want to generate",
377
+ lines=3,
378
+ value="A groovy funk bassline with a tight drum beat"
379
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
 
381
+ with gr.Row():
382
+ duration = gr.Slider(
383
+ minimum=5,
384
+ maximum=30,
385
+ value=10,
386
+ step=1,
387
+ label="Duration (seconds)"
388
+ )
389
+ guidance_scale = gr.Slider(
390
+ minimum=1.0,
391
+ maximum=10.0,
392
+ value=3.0,
393
+ step=0.5,
394
+ label="Guidance Scale",
395
+ info="Higher values follow prompt more closely"
396
+ )
397
+
398
+ generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg")
399
+
400
+ generation_info = gr.Markdown("Ready to generate music with elastic acceleration")
401
+
402
+ audio_section_title = "### Generated Music" + (f" ({4 if MODEL_CONFIG['batch_mode'] else 1} variant{'s' if MODEL_CONFIG['batch_mode'] else ''})")
403
+ gr.Markdown(audio_section_title)
404
+
405
+ with gr.Row():
406
+ audio_output1 = gr.Audio(label="Variant 1", type="numpy")
407
+ audio_output2 = gr.Audio(label="Variant 2", type="numpy", visible=MODEL_CONFIG['batch_mode'])
408
+
409
+ with gr.Row():
410
+ audio_output3 = gr.Audio(label="Variant 3", type="numpy", visible=MODEL_CONFIG['batch_mode'])
411
+ audio_output4 = gr.Audio(label="Variant 4", type="numpy", visible=MODEL_CONFIG['batch_mode'])
412
+
413
+ savings_banner = gr.Markdown(get_fixed_savings_message())
414
+
415
+ with gr.Accordion("💡 Tips & Information", open=False):
416
+ gr.Markdown(f"""
417
+ **Generation Tips:**
418
+ - Be specific in your descriptions (e.g., "slow blues guitar with harmonica")
419
+ - Higher guidance scale = follows prompt more closely
420
+ - Lower guidance scale = more creative/varied results
421
+ - Duration is limited to 30 seconds for faster generation
422
+
423
+ **Performance:**
424
+ - Accelerated by TheStage elastic compression
425
+ - L40S GPU pricing: $1.8/hour
426
+ """)
427
 
428
  def generate_simple(text_prompt, duration, guidance_scale):
429
  return generate_music_batch(text_prompt, duration, guidance_scale, "compressed")
430
 
 
431
  generate_btn.click(
432
  fn=generate_simple,
433
  inputs=[text_input, duration, guidance_scale],
 
453
  gr.Markdown("---")
454
  gr.Markdown("""
455
  <div style="text-align: center; color: #666; font-size: 12px; margin-top: 2rem;">
456
+ <strong>TheStage Elastic Acceleration:</strong><br>
457
+ • 2.3x faster generation vs original MusicGen model<br>
458
+ • Benchmarked on L40S GPU @ $1.8/hour pricing<br>
459
+ • Elastic compression maintains audio quality while reducing compute time<br>
460
+
461
+ <strong>Model Limitations:</strong><br>
462
  • The model is not able to generate realistic vocals.<br>
463
  • The model has been trained with English descriptions and will not perform as well in other languages.<br>
464
  • The model does not perform equally well for all music styles and cultures.<br>