Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,902 Bytes
860e11a 456ed62 860e11a 456ed62 8bd34c2 9c6d7bb f84fac1 9c6d7bb faca83a c1ef20f b725215 456ed62 b725215 456ed62 b725215 6a270df c1ef20f 456ed62 b725215 456ed62 6a270df b725215 faca83a b725215 a17cbc4 456ed62 6a270df 456ed62 cc99eb6 6a270df c1ef20f 456ed62 8436e2d faca83a 8436e2d cc99eb6 8436e2d cc99eb6 8436e2d cc99eb6 8436e2d cc99eb6 8436e2d cc99eb6 8436e2d cfd828f 456ed62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import spaces
import os
import uuid
import importlib.util

# Process-wide settings for PyTorch, safetensors and huggingface_hub.
# They are written through os.environ instead of os.putenv(): putenv() does
# not update os.environ, so any library that looks a value up from Python
# (os.environ / os.getenv) would never see it.  Assigning to os.environ also
# calls putenv() under the hood, so C-level getenv() readers still work.
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'

# CUDA caching-allocator options, joined into the comma-separated form used
# by PYTORCH_CUDA_ALLOC_CONF.
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True',  # Specific to pinned memory.
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
os.environ["SAFETENSORS_FAST_GPU"] = "1"

# huggingface_hub documents that this flag requires the optional hf_transfer
# package.  The flag was previously invisible to Python-level readers, so only
# turn it on when the package is importable; otherwise downloads keep using
# the default path exactly as before.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
import torch
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.backends.cuda.preferred_blas_library="cublas"
torch.backends.cuda.preferred_linalg_library="cusolver"
torch.set_float32_matmul_precision("highest")
import torchaudio
from einops import rearrange
import gradio as gr
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
# Fetch the pretrained checkpoint together with its configuration dict
# (generate_audio reads "sample_rate" and "sample_size" from the latter).
model, model_config = get_pretrained_model("ford442/stable-audio-open-1.0")

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Place the model on the chosen device in float32.
model.to(device, torch.float32)
def _to_int16_pcm(audio):
    """Peak-normalize *audio*, clip it to [-1, 1] and quantize to int16 on CPU."""
    audio = audio.to(torch.float32)
    # Floor the peak at the smallest positive normal float32: an all-zero
    # (silent) clip would otherwise be divided by zero and turn into NaNs
    # before the int16 cast.  Any audible signal is unaffected.
    peak = audio.abs().max().clamp_min(torch.finfo(torch.float32).tiny)
    return audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=60)
def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7, use_bfloat=False, use_eval=False):
    """Generate audio for a text prompt and save it as an MP3 file.

    Args:
        prompt: Text prompt placed in the conditioning dict.
        seconds_total: Value passed as the "seconds_total" conditioning
            ("seconds_start" is always 0).
        steps: Number of diffusion steps passed to the sampler.
        cfg_scale: Classifier-free guidance scale passed to the sampler.
        use_bfloat: When truthy, cast the module-level model to bfloat16
            before generating.
        use_eval: When truthy, call ``model.eval()`` before generating.

    Returns:
        The relative filename ("output_<hex>.mp3") of the file written to the
        current working directory.
    """
    print(f"Prompt received: {prompt}")
    print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")

    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]
    print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
    print("Model moved to device.")

    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total,
    }]
    print(f"Conditioning: {conditioning}")
    print("Generating audio...")

    # NOTE(review): both switches mutate the module-level model and nothing
    # here undoes them; whether that leaks into later requests depends on how
    # the Spaces GPU worker runs this function -- confirm.
    if use_bfloat:
        model.to(torch.bfloat16)
    if use_eval:
        model.eval()

    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
    )
    print("Audio generated.")

    # Fold the batch dimension into the time axis: (b, d, n) -> (d, b * n).
    output = rearrange(output, "b d n -> d (b n)")
    # Peak normalize, clip, convert to int16
    output = _to_int16_pcm(output)

    unique_filename = f"output_{uuid.uuid4().hex}.mp3"
    print(f"Saving audio to file: {unique_filename}")
    # NOTE(review): 320 reads like an MP3 bitrate in kbps, whereas
    # bits_per_sample is normally a PCM bit depth -- confirm that this
    # argument (and encoding="MP3") has the intended effect.
    torchaudio.save(
        unique_filename,
        output,
        sample_rate,
        format="mp3",
        encoding="MP3",
        bits_per_sample=320,
    )
    print(f"Audio saved: {unique_filename}")
    return unique_filename
# Input widgets, listed in the positional order of generate_audio's parameters.
input_components = [
    gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
    gr.Slider(0, 420, value=30, label="Duration in Seconds"),
    gr.Slider(10, 420, value=100, step=10, label="Number of Diffusion Steps"),
    gr.Slider(1.0, 32.0, value=7.0, step=0.1, label="CFG Scale"),
    gr.Checkbox(value=False, label="Use Brainfloat"),
    gr.Checkbox(value=False, label="Use eval()"),
]

# Each example row is: text prompt, duration in seconds, number of diffusion
# steps, CFG scale.  No example values are given for the two checkboxes.
example_rows = [
    ["Create a serene soundscape of a quiet beach at sunset.", 45, 100, 10.0],
    ["Generate an energetic and bustling city street scene with distant traffic and close conversations.", 30, 120, 5.0],
    ["Simulate a forest ambiance with birds chirping and wind rustling through the leaves.", 60, 140, 7.5],
    ["Recreate a gentle rainfall with distant thunder.", 35, 110, 8.0],
    ["Imagine a jazz cafe environment with soft music and ambient chatter.", 25, 90, 6.0],
    ["Rock beat played in a treated studio, session drumming on an acoustic kit.", 30, 100, 7.0],
]

# Wire the generator into a Gradio UI whose output is an audio player fed by
# the file path that generate_audio returns.
interface = gr.Interface(
    fn=generate_audio,
    inputs=input_components,
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Stable Audio Generator",
    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
    examples=example_rows,
)

# Start the web server for the app.
interface.launch()
|