added latent diffusion to notebook
notebooks/test_model.ipynb  CHANGED  (+118 -4)
@@ -177,13 +177,13 @@
| 177 |   },
| 178 |   "outputs": [],
| 179 |   "source": [
| 180 | - "
| 180 | + "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
| 181 |   "track = AudioDiffusion.loop_it(audio, sample_rate, loops=1)\n",
| 182 |   "for variation in range(12):\n",
| 183 |   " image2, (\n",
| 184 |   " sample_rate,\n",
| 185 |   " audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
| 186 | - " raw_audio=audio, start_step=
| 186 | + " raw_audio=audio, start_step=start_step)\n",
| 187 |   " display(image2)\n",
| 188 |   " display(Audio(audio2, rate=sample_rate))\n",
| 189 |   " track = np.concatenate(\n",
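Note on the hunk above: start_step controls where in the de-noising schedule generation begins when conditioning on existing audio, trading off how much of the original is preserved against how much is re-generated. A rough sketch of the idea (not this repository's implementation; the scheduler and the mapping from the slider value to a timestep are assumptions):

import torch
from diffusers import DDPMScheduler

# Partially noise an input, then de-noise only over the remaining steps.
scheduler = DDPMScheduler(num_train_timesteps=1000)
x0 = torch.randn(1, 1, 256, 256)          # stand-in for the input mel spectrogram tensor
t = torch.tensor([500])                   # assumed to correspond to the start_step slider
noisy = scheduler.add_noise(x0, torch.randn_like(x0), t)
# the reverse (de-noising) loop then runs from t down to 0 rather than over the full schedule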
@@ -490,10 +490,124 @@
| 490 |   "display(Audio(audio, rate=sample_rate))"
| 491 |   ]
| 492 |   },
| 493 | + {
| 494 | + "cell_type": "markdown",
| 495 | + "id": "5b7081f7",
| 496 | + "metadata": {},
| 497 | + "source": [
| 498 | + "## Latent Audio Diffusion\n",
| 499 | + "Instead of de-noising images directly in the pixel space, we can work in the latent space of a pre-trained VAE (Variational AutoEncoder). This is much faster to train and run inference on, although the quality suffers as there are now three stages involved in encoding / decoding: mel spectrogram, VAE and de-noising."
| 500 | + ]
| 501 | + },
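The markdown cell above describes a three-stage round trip: audio to mel spectrogram image, image to VAE latents, and de-noising in that latent space, then the same path in reverse. A minimal sketch of the VAE stage, assuming a diffusers AutoencoderKL; the toy model and tensor here are placeholders, not the notebook's pre-trained weights:

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL(in_channels=1, out_channels=1)     # toy VAE; the real one is pre-trained
mel_image = torch.randn(1, 1, 256, 256)                # stand-in for a mel spectrogram image
latents = vae.encode(mel_image).latent_dist.sample()   # image -> compact latent representation
# ... the diffusion model de-noises `latents` instead of full-resolution pixels ...
decoded = vae.decode(latents).sample                   # latents -> image; mel <-> audio is handled
                                                       # by the notebook's mel object as elsewhere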
| 502 | + {
| 503 | + "cell_type": "code",
| 504 | + "execution_count": null,
| 505 | + "id": "17610772",
| 506 | + "metadata": {},
| 507 | + "outputs": [],
| 508 | + "source": [
| 509 | + "model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
| 510 | + ]
| 511 | + },
| 512 | + {
| 513 | + "cell_type": "code",
| 514 | + "execution_count": null,
| 515 | + "id": "9e6c73e6",
| 516 | + "metadata": {},
| 517 | + "outputs": [],
| 518 | + "source": [
| 519 | + "audio_diffusion = AudioDiffusion(model_id=model_id)"
| 520 | + ]
| 521 | + },
| 522 | + {
| 523 | + "cell_type": "code",
| 524 | + "execution_count": null,
| 525 | + "id": "d37a03a9",
| 526 | + "metadata": {},
| 527 | + "outputs": [],
| 528 | + "source": [
| 529 | + "seed = 6015487092443227811 #@param {type:\"integer\"}\n",
| 530 | + "generator.manual_seed(seed)\n",
| 531 | + "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
| 532 | + " generator=generator)\n",
| 533 | + "display(image)\n",
| 534 | + "display(Audio(audio, rate=sample_rate))"
| 535 | + ]
| 536 | + },
| 537 | + {
| 538 | + "cell_type": "code",
| 539 | + "execution_count": null,
| 540 | + "id": "c0328a56",
| 541 | + "metadata": {},
| 542 | + "outputs": [],
| 543 | + "source": [
| 544 | + "seed2 = 5623685468252603494 #@param {type:\"integer\"}\n",
| 545 | + "generator.manual_seed(seed2)\n",
| 546 | + "image2, (sample_rate, audio2) = audio_diffusion.generate_spectrogram_and_audio(\n",
| 547 | + " generator=generator)\n",
| 548 | + "display(image2)\n",
| 549 | + "display(Audio(audio2, rate=sample_rate))"
| 550 | + ]
| 551 | + },
| 552 | + {
| 553 | + "cell_type": "markdown",
| 554 | + "id": "bd1f2b58",
| 555 | + "metadata": {},
| 556 | + "source": [
| 557 | + "### Interpolation in latent space\n",
| 558 | + "As the VAE forces a more compact, lower dimensional representation for the spectrograms, interpolation in latent space can lead to meaningful combinations of audios. In combination with the (deterministic) DDIM from the previous section, the model can be used as an encoder / decoder to a lower dimensional space."
| 559 | + ]
| 560 | + },
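The interpolation cells below rely on pipe.slerp, i.e. spherical linear interpolation between two noise tensors. A sketch of what slerp computes (the pipeline's own implementation may differ in details such as flattening or numerical guards):

import torch

def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor:
    # Interpolate along the arc between x0 and x1 rather than along the straight line.
    theta = torch.acos(torch.sum(x0 * x1) / (torch.norm(x0) * torch.norm(x1)))
    return (torch.sin((1 - alpha) * theta) * x0 + torch.sin(alpha * theta) * x1) / torch.sin(theta)

Slerp is preferred over a plain linear mix because high-dimensional Gaussian noise concentrates around a sphere of fixed norm; interpolating along that sphere keeps the intermediate noise statistically similar to what the de-noising model was trained on.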
| 561 | + {
| 562 | + "cell_type": "code",
| 563 | + "execution_count": null,
| 564 | + "id": "23ff0ee7",
| 565 | + "metadata": {},
| 566 | + "outputs": [],
| 567 | + "source": [
| 568 | + "generator.manual_seed(seed)\n",
| 569 | + "noise = torch.randn((1, audio_diffusion.pipe.unet.in_channels,\n",
| 570 | + " audio_diffusion.pipe.unet.sample_size[0],\n",
| 571 | + " audio_diffusion.pipe.unet.sample_size[1]),\n",
| 572 | + " generator=generator)\n",
| 573 | + "noise.shape"
| 574 | + ]
| 575 | + },
| 576 | + {
| 577 | + "cell_type": "code",
| 578 | + "execution_count": null,
| 579 | + "id": "ff13a2cb",
| 580 | + "metadata": {},
| 581 | + "outputs": [],
| 582 | + "source": [
| 583 | + "generator.manual_seed(seed2)\n",
| 584 | + "noise2 = torch.randn((1, audio_diffusion.pipe.unet.in_channels,\n",
| 585 | + " audio_diffusion.pipe.unet.sample_size[0],\n",
| 586 | + " audio_diffusion.pipe.unet.sample_size[1]),\n",
| 587 | + " generator=generator)\n",
| 588 | + "noise2.shape"
| 589 | + ]
| 590 | + },
| 591 | + {
| 592 | + "cell_type": "code",
| 593 | + "execution_count": null,
| 594 | + "id": "bea26a5e",
| 595 | + "metadata": {},
| 596 | + "outputs": [],
| 597 | + "source": [
| 598 | + "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
| 599 | + "_, (sample_rate, audio3) = audio_diffusion.generate_spectrogram_and_audio(\n",
| 600 | + " noise=audio_diffusion.pipe.slerp(noise, noise2, alpha),\n",
| 601 | + " generator=generator)\n",
| 602 | + "display(Audio(audio, rate=mel.get_sample_rate()))\n",
| 603 | + "display(Audio(audio2, rate=mel.get_sample_rate()))\n",
| 604 | + "display(Audio(audio3, rate=sample_rate))"
| 605 | + ]
| 606 | + },
| 607 |   {
| 608 |   "cell_type": "code",
| 609 |   "execution_count": null,
| 496 | - "id": "
| 610 | + "id": "60080eed",
| 611 |   "metadata": {},
| 612 |   "outputs": [],
| 613 |   "source": []
@@ -520,7 +634,7 @@
| 634 |   "name": "python",
| 635 |   "nbconvert_exporter": "python",
| 636 |   "pygments_lexer": "ipython3",
| 523 | - "version": "3.
| 637 | + "version": "3.10.6"
| 638 |   },
| 639 |   "toc": {
| 640 |   "base_numbering": 1,