Spaces:

thecollabagepatch
/

magenta

Running

App Files Files Community

thecollabagepatch committed 2 days ago

Commit

956f1a2

1 Parent(s): c4aed03

keep jamming button

Browse files

Files changed (3) hide show

app.py +173 -161
jam_worker.py +0 -0
utils.py +168 -0

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from magenta_rt import system, audio as au
 import numpy as np
-from fastapi import FastAPI, UploadFile, File, Form
 import tempfile, io, base64, math, threading
 from fastapi.middleware.cors import CORSMiddleware
 from contextlib import contextmanager
@@ -8,6 +8,16 @@ import soundfile as sf
 import numpy as np
 from math import gcd
 from scipy.signal import resample_poly
 @contextmanager
 def mrt_overrides(mrt, **kwargs):
@@ -30,133 +40,6 @@ try:
 except Exception:
     _HAS_LOUDNORM = False
-def _measure_lufs(wav: au.Waveform) -> float:
-    # pyloudnorm expects float32/float64, shape (n,) or (n, ch)
-    meter = pyln.Meter(wav.sample_rate)  # defaults to BS.1770-4
-    return float(meter.integrated_loudness(wav.samples))
-def _rms(x: np.ndarray) -> float:
-    if x.size == 0: return 0.0
-    return float(np.sqrt(np.mean(x**2)))
-def match_loudness_to_reference(
-    ref: au.Waveform,
-    target: au.Waveform,
-    method: str = "auto",   # "auto"|"lufs"|"rms"|"none"
-    headroom_db: float = 1.0
-) -> tuple[au.Waveform, dict]:
-    """
-    Scales `target` to match `ref` loudness. Returns (adjusted_wave, stats).
-    """
-    stats = {"method": method, "applied_gain_db": 0.0}
-    if method == "none":
-        return target, stats
-    if method == "auto":
-        method = "lufs" if _HAS_LOUDNORM else "rms"
-    if method == "lufs" and _HAS_LOUDNORM:
-        L_ref = _measure_lufs(ref)
-        L_tgt = _measure_lufs(target)
-        delta_db = L_ref - L_tgt
-        gain = 10.0 ** (delta_db / 20.0)
-        y = target.samples.astype(np.float32) * gain
-        stats.update({"ref_lufs": L_ref, "tgt_lufs_before": L_tgt, "applied_gain_db": delta_db})
-    else:
-        # RMS fallback
-        ra = _rms(ref.samples)
-        rb = _rms(target.samples)
-        if rb <= 1e-12:
-            return target, stats
-        gain = ra / rb
-        y = target.samples.astype(np.float32) * gain
-        stats.update({"ref_rms": ra, "tgt_rms_before": rb, "applied_gain_db": 20*np.log10(max(gain,1e-12))})
-    # simple peak “limiter” to keep headroom
-    limit = 10 ** (-headroom_db / 20.0)   # e.g., -1 dBFS
-    peak = float(np.max(np.abs(y))) if y.size else 0.0
-    if peak > limit:
-        y *= (limit / peak)
-        stats["post_peak_limited"] = True
-    else:
-        stats["post_peak_limited"] = False
-    target.samples = y.astype(np.float32)
-    return target, stats
-# ----------------------------
-# Crossfade stitch (your good path)
-# ----------------------------
-def stitch_generated(chunks, sr, xfade_s):
-    if not chunks:
-        raise ValueError("no chunks")
-    xfade_n = int(round(xfade_s * sr))
-    if xfade_n <= 0:
-        return au.Waveform(np.concatenate([c.samples for c in chunks], axis=0), sr)
-    t = np.linspace(0, np.pi/2, xfade_n, endpoint=False, dtype=np.float32)
-    eq_in, eq_out = np.sin(t)[:, None], np.cos(t)[:, None]
-    first = chunks[0].samples
-    if first.shape[0] < xfade_n:
-        raise ValueError("chunk shorter than crossfade prefix")
-    out = first[xfade_n:].copy()  # drop model pre-roll
-    for i in range(1, len(chunks)):
-        cur = chunks[i].samples
-        if cur.shape[0] < xfade_n:
-            continue
-        head, tail = cur[:xfade_n], cur[xfade_n:]
-        mixed = out[-xfade_n:] * eq_out + head * eq_in
-        out = np.concatenate([out[:-xfade_n], mixed, tail], axis=0)
-    return au.Waveform(out, sr)
-# ----------------------------
-# Bar-aligned token context
-# ----------------------------
-def make_bar_aligned_context(tokens, bpm, fps=25, ctx_frames=250, beats_per_bar=4):
-    frames_per_bar_f = (beats_per_bar * 60.0 / bpm) * fps
-    frames_per_bar = int(round(frames_per_bar_f))
-    if abs(frames_per_bar - frames_per_bar_f) > 1e-3:
-        reps = int(np.ceil(ctx_frames / len(tokens)))
-        return np.tile(tokens, (reps, 1))[-ctx_frames:]
-    reps = int(np.ceil(ctx_frames / len(tokens)))
-    tiled = np.tile(tokens, (reps, 1))
-    end = (len(tiled) // frames_per_bar) * frames_per_bar
-    if end < ctx_frames:
-        return tiled[-ctx_frames:]
-    start = end - ctx_frames
-    return tiled[start:end]
-def hard_trim_seconds(wav: au.Waveform, seconds: float) -> au.Waveform:
-    n = int(round(seconds * wav.sample_rate))
-    return au.Waveform(wav.samples[:n], wav.sample_rate)
-def apply_micro_fades(wav: au.Waveform, ms: int = 5) -> None:
-    n = int(wav.sample_rate * ms / 1000.0)
-    if n > 0 and wav.samples.shape[0] > 2*n:
-        env = np.linspace(0.0, 1.0, n, dtype=np.float32)[:, None]
-        wav.samples[:n]  *= env
-        wav.samples[-n:] *= env[::-1]
-def take_bar_aligned_tail(wav, bpm, beats_per_bar, ctx_seconds, max_bars=None):
-    """
-    Return the LAST N bars whose duration is as close as possible to ctx_seconds,
-    anchored to the end of `wav`, and bar-aligned.
-    """
-    spb = (60.0 / bpm) * beats_per_bar
-    bars_needed = max(1, int(round(ctx_seconds / spb)))
-    if max_bars is not None:
-        bars_needed = min(bars_needed, max_bars)
-    tail_seconds = bars_needed * spb
-    n = int(round(tail_seconds * wav.sample_rate))
-    if n >= wav.samples.shape[0]:
-        return wav
-    return au.Waveform(wav.samples[-n:], wav.sample_rate)
 # ----------------------------
 # Main generation (single combined style vector)
 # ----------------------------
@@ -326,42 +209,18 @@ def generate(
     input_sr = int(inp_info.samplerate)
     target_sr = int(target_sample_rate or input_sr)
-    # 2) Convert magenta output to target_sr if needed
-    # wav.samples: shape [num_samples, num_channels], float32/-1..1 (per your code)
     cur_sr = int(mrt.sample_rate)
-    x = wav.samples  # np.ndarray (S, C)
-    if cur_sr != target_sr:
-        g = gcd(cur_sr, target_sr)
-        up, down = target_sr // g, cur_sr // g
-        # ensure 2D shape (S, C)
-        x = wav.samples
-        if x.ndim == 1:
-            x = x[:, None]
-        y = np.column_stack([resample_poly(x[:, ch], up, down) for ch in range(x.shape[1])])
-    else:
-        y = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
-    # 3) Snap to exact frame count for loop-perfect length
     seconds_per_bar = (60.0 / float(bpm)) * int(beats_per_bar)
-    expected_len = int(round(float(bars) * seconds_per_bar * target_sr))
-    if y.shape[0] < expected_len:
-        pad = np.zeros((expected_len - y.shape[0], y.shape[1]), dtype=y.dtype)
-        y = np.vstack([y, pad])
-    elif y.shape[0] > expected_len:
-        y = y[:expected_len, :]
-    total_samples = int(y.shape[0])
     loop_duration_seconds = total_samples / float(target_sr)
-    # 4) Write y into buf as WAV @ target_sr
-    buf = io.BytesIO()
-    sf.write(buf, y, target_sr, subtype="FLOAT", format="WAV")
-    buf.seek(0)
-    audio_b64 = base64.b64encode(buf.read()).decode("utf-8")
-    # 5) Update metadata to be authoritative
     metadata = {
         "bpm": int(round(bpm)),
         "bars": int(bars),
@@ -371,9 +230,9 @@ def generate(
         "loop_weight": loop_weight,
         "loudness": loud_stats,
         "sample_rate": int(target_sr),
-        "channels": int(y.shape[1]),
         "crossfade_seconds": mrt.config.crossfade_length,
-        "total_samples": total_samples,
         "seconds_per_bar": seconds_per_bar,
         "loop_duration_seconds": loop_duration_seconds,
         "guidance_weight": guidance_weight,
@@ -382,6 +241,159 @@ def generate(
     }
     return {"audio_base64": audio_b64, "metadata": metadata}
 @app.get("/health")
 def health():
     return {"ok": True}

 from magenta_rt import system, audio as au
 import numpy as np
+from fastapi import FastAPI, UploadFile, File, Form, Body, HTTPException, Response
 import tempfile, io, base64, math, threading
 from fastapi.middleware.cors import CORSMiddleware
 from contextlib import contextmanager
 import numpy as np
 from math import gcd
 from scipy.signal import resample_poly
+from utils import (
+    match_loudness_to_reference, stitch_generated, hard_trim_seconds,
+    apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
+    resample_and_snap, wav_bytes_base64
+)
+from jam_worker import JamWorker, JamParams, JamChunk
+import uuid, threading
+jam_registry: dict[str, JamWorker] = {}  # session id -> worker; the handlers in this file only touch it while holding jam_lock
+jam_lock = threading.Lock()  # guards every read and write of jam_registry
 @contextmanager
 def mrt_overrides(mrt, **kwargs):
 except Exception:
     _HAS_LOUDNORM = False
 # ----------------------------
 # Main generation (single combined style vector)
 # ----------------------------
     input_sr = int(inp_info.samplerate)
     target_sr = int(target_sample_rate or input_sr)
+    # 2) Convert to target SR + snap to exact bars
     cur_sr = int(mrt.sample_rate)
+    x = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
     seconds_per_bar = (60.0 / float(bpm)) * int(beats_per_bar)
+    expected_secs = float(bars) * seconds_per_bar
+    x = resample_and_snap(x, cur_sr=cur_sr, target_sr=target_sr, seconds=expected_secs)
+    # 3) Encode WAV once (no extra write)
+    audio_b64, total_samples, channels = wav_bytes_base64(x, target_sr)
     loop_duration_seconds = total_samples / float(target_sr)
+    # 4) Metadata
     metadata = {
         "bpm": int(round(bpm)),
         "bars": int(bars),
         "loop_weight": loop_weight,
         "loudness": loud_stats,
         "sample_rate": int(target_sr),
+        "channels": int(channels),
         "crossfade_seconds": mrt.config.crossfade_length,
+        "total_samples": int(total_samples),
         "seconds_per_bar": seconds_per_bar,
         "loop_duration_seconds": loop_duration_seconds,
         "guidance_weight": guidance_weight,
     }
     return {"audio_base64": audio_b64, "metadata": metadata}
+# ----------------------------
+# the 'keep jamming' button
+# ----------------------------
+@app.post("/jam/start")
+def jam_start(
+    loop_audio: UploadFile = File(...),
+    bpm: float = Form(...),
+    bars_per_chunk: int = Form(4),
+    beats_per_bar: int = Form(4),
+    styles: str = Form(""),
+    style_weights: str = Form(""),
+    loop_weight: float = Form(1.0),
+    loudness_mode: str = Form("auto"),
+    loudness_headroom_db: float = Form(1.0),
+    guidance_weight: float = Form(1.1),
+    temperature: float = Form(1.1),
+    topk: int = Form(40),
+    target_sample_rate: int | None = Form(None),
+):
+    """Start a background jam worker seeded by the uploaded loop.
+    The loop is resampled to the model rate, its bar-aligned tail is embedded,
+    and that embedding is mixed with the optional comma-separated text `styles`
+    (weighted by `style_weights`, default 1.0 each) into one normalized style vector.
+    Returns: {"session_id": <uuid4 string>} for the newly registered worker.
+    Raises: HTTPException 429 if a registered worker is still alive,
+    HTTPException 400 for an empty upload, non-numeric `style_weights`,
+    or a non-positive `bpm` / `beats_per_bar`.
+    """
+    import os  # local import: only needed to remove the temporary upload copy
+    def _jam_running() -> bool:
+        # caller must hold jam_lock
+        return any(w.is_alive() for w in jam_registry.values())
+    # both values are used as divisors when the bar length is computed
+    if bpm <= 0 or beats_per_bar <= 0:
+        raise HTTPException(status_code=400, detail="bpm and beats_per_bar must be positive")
+    # cheap early rejection so a busy server does not load and embed the upload
+    with jam_lock:
+        if _jam_running():
+            raise HTTPException(status_code=429, detail="A jam is already running. Try again later.")
+    data = loop_audio.file.read()
+    if not data:
+        raise HTTPException(status_code=400, detail="Empty file")
+    extra = [s.strip() for s in styles.split(",") if s.strip()]
+    try:
+        sw = [float(x) for x in style_weights.split(",")] if style_weights else []
+    except ValueError:
+        # a malformed form field is a client error, not a server crash
+        raise HTTPException(status_code=400, detail="style_weights must be comma-separated numbers") from None
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        tmp.write(data)
+        tmp_path = tmp.name
+    try:
+        mrt = get_mrt()
+        loop = au.Waveform.from_file(tmp_path).resample(mrt.sample_rate).as_stereo()
+        # the input sample rate is the default output rate
+        input_sr = int(sf.info(tmp_path).samplerate)
+    finally:
+        # delete=False means nothing else removes the file; cleanup is best-effort
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+    # build tail context + style vec (tail-biased)
+    codec_fps = float(mrt.codec.frame_rate)
+    ctx_seconds = float(mrt.config.context_length_frames) / codec_fps
+    loop_tail = take_bar_aligned_tail(loop, bpm, beats_per_bar, ctx_seconds)
+    # style vec = normalized mix of loop_tail + extra styles
+    embeds, weights = [mrt.embed_style(loop_tail)], [float(loop_weight)]
+    for i, s in enumerate(extra):
+        embeds.append(mrt.embed_style(s))
+        weights.append(sw[i] if i < len(sw) else 1.0)
+    wsum = sum(weights) or 1.0  # avoid dividing by zero when all weights cancel out
+    weights = [w / wsum for w in weights]
+    style_vec = np.sum([w * e for w, e in zip(weights, embeds)], axis=0).astype(embeds[0].dtype)
+    target_sr = int(target_sample_rate or input_sr)
+    params = JamParams(
+        bpm=bpm, beats_per_bar=beats_per_bar, bars_per_chunk=bars_per_chunk,
+        target_sr=target_sr, loudness_mode=loudness_mode, headroom_db=loudness_headroom_db,
+        style_vec=style_vec, ref_loop=loop_tail,
+        guidance_weight=guidance_weight, temperature=temperature, topk=topk
+    )
+    worker = JamWorker(mrt, params)
+    sid = str(uuid.uuid4())
+    # re-check, register and start under one lock so two concurrent requests cannot both win
+    with jam_lock:
+        if _jam_running():
+            raise HTTPException(status_code=429, detail="A jam is already running. Try again later.")
+        jam_registry[sid] = worker
+        worker.start()
+    return {"session_id": sid}
+@app.get("/jam/next")
+def jam_next(session_id: str, since: int = 0):
+    """Return the chunks of a live session whose index is greater than `since`.
+    Responds with {"chunks": [...]} when there is something new and with an
+    empty 204 response otherwise. Raises HTTPException 404 when the session is
+    unknown or its worker is no longer alive.
+    """
+    with jam_lock:
+        session = jam_registry.get(session_id)
+    if not (session is not None and session.is_alive()):
+        raise HTTPException(status_code=404, detail="Session not found")
+    with session._lock:
+        # collect (without removing) every chunk newer than the caller's high-water mark
+        fresh = [
+            {"index": c.index, "audio_base64": c.audio_base64, "metadata": c.metadata}
+            for c in session.outbox
+            if c.index > since
+        ]
+        # bound memory: once more than 32 chunks pile up, keep only the newest 16
+        if len(session.outbox) > 32:
+            session.outbox = session.outbox[-16:]
+    if fresh:
+        return {"chunks": fresh}
+    return Response(status_code=204)  # nothing yet
+@app.post("/jam/stop")
+def jam_stop(session_id: str = Body(..., embed=True)):
+    """Stop a jam session and forget it.
+    Raises HTTPException 404 when the session id is not registered;
+    otherwise returns {"stopped": True}.
+    """
+    with jam_lock:
+        session = jam_registry.get(session_id)
+    if session is None:
+        raise HTTPException(status_code=404, detail="Session not found")
+    # signal the worker, then wait at most two seconds for it to finish
+    session.stop()
+    session.join(timeout=2.0)
+    # NOTE(review): the entry is removed even if join() timed out - confirm the worker cannot outlive this call
+    with jam_lock:
+        jam_registry.pop(session_id, None)
+    return {"stopped": True}
+@app.post("/jam/update")
+def jam_update(session_id: str = Form(...),
+               guidance_weight: float | None = Form(None),
+               temperature: float | None = Form(None),
+               topk: int | None = Form(None)):
+    """Forward new sampling knobs to a live jam session.
+    All three knobs are passed to the worker's `update_knobs`, including those
+    left as None. Raises HTTPException 404 when the session is unknown or its
+    worker is no longer alive; otherwise returns {"ok": True}.
+    """
+    with jam_lock:
+        session = jam_registry.get(session_id)
+    alive = session is not None and session.is_alive()
+    if not alive:
+        raise HTTPException(status_code=404, detail="Session not found")
+    knobs = {"guidance_weight": guidance_weight, "temperature": temperature, "topk": topk}
+    session.update_knobs(**knobs)
+    return {"ok": True}
+@app.get("/jam/status")
+def jam_status(session_id: str):
+    """Report progress and timing parameters of a registered jam session.
+    Works for sessions whose worker has already finished (`running` is then
+    False). Raises HTTPException 404 when the session id is not registered.
+    """
+    with jam_lock:
+        session = jam_registry.get(session_id)
+    if session is None:
+        raise HTTPException(status_code=404, detail="Session not found")
+    status = {"running": session.is_alive()}
+    # read the worker's mutable state in one critical section
+    with session._lock:
+        cfg = session.params
+        seconds_per_bar = cfg.beats_per_bar * (60.0 / cfg.bpm)
+        status.update({
+            "last_index": int(session.idx),  # worker's current chunk counter
+            "queued_chunks": len(session.outbox),  # chunks currently held in the outbox
+            "bpm": cfg.bpm,
+            "beats_per_bar": cfg.beats_per_bar,
+            "bars_per_chunk": cfg.bars_per_chunk,
+            "seconds_per_bar": seconds_per_bar,
+            "chunk_duration_seconds": cfg.bars_per_chunk * seconds_per_bar,
+            "target_sample_rate": cfg.target_sr,
+        })
+    # timestamps are read after the lock is released, as before
+    status["last_chunk_started_at"] = session.last_chunk_started_at
+    status["last_chunk_completed_at"] = session.last_chunk_completed_at
+    return status
 @app.get("/health")
 def health():  # health-check endpoint: unconditionally reports ok
     return {"ok": True}

jam_worker.py ADDED Viewed

File without changes

utils.py ADDED Viewed

	@@ -0,0 +1,168 @@

+# utils.py
+from __future__ import annotations
+import io, base64, math
+from math import gcd
+import numpy as np
+import soundfile as sf
+from scipy.signal import resample_poly
+# Magenta RT audio types
+from magenta_rt import audio as au
+# Optional loudness
+try:
+    import pyloudnorm as pyln  # optional dependency, used only for LUFS measurement
+    _HAS_LOUDNORM = True
+except Exception:  # any import failure just disables the LUFS path; RMS matching is used instead
+    _HAS_LOUDNORM = False
+# ---------- Loudness ----------
+def _measure_lufs(wav: au.Waveform) -> float:
+    """Return the integrated loudness of `wav.samples`, measured by a pyloudnorm meter built for `wav.sample_rate`."""
+    # the meter is created with its defaults (noted as BS.1770-4 by the original author)
+    loudness = pyln.Meter(wav.sample_rate).integrated_loudness(wav.samples)
+    return float(loudness)
+def _rms(x: np.ndarray) -> float:
+    """Return the root-mean-square of all elements of `x` (0.0 for an empty array)."""
+    if x.size == 0:
+        return 0.0
+    # square in float64: integer PCM (e.g. int16) would wrap around if squared in its own dtype
+    xf = np.asarray(x, dtype=np.float64)
+    return float(np.sqrt(np.mean(np.square(xf))))
+def match_loudness_to_reference(
+    ref: au.Waveform,
+    target: au.Waveform,
+    method: str = "auto",   # "auto"|"lufs"|"rms"|"none"
+    headroom_db: float = 1.0
+) -> tuple[au.Waveform, dict]:
+    """Scale `target` so its loudness matches `ref`, then peak-limit it.
+    `method` picks the measurement: "lufs" (needs pyloudnorm), "rms", "auto"
+    (LUFS when available, else RMS) or "none" (no change). RMS is also used
+    when LUFS is unavailable or yields a non-finite difference.
+    `target.samples` is replaced in place with the float32 result.
+    Returns: (target, stats), where stats records the requested method, the
+    measured levels, the applied gain in dB and whether the limiter engaged.
+    """
+    stats = {"method": method, "applied_gain_db": 0.0}
+    if method == "none":
+        return target, stats
+    if method == "auto":
+        method = "lufs" if _HAS_LOUDNORM else "rms"
+    gain = None
+    if method == "lufs" and _HAS_LOUDNORM:
+        L_ref = _measure_lufs(ref)
+        L_tgt = _measure_lufs(target)
+        delta_db = L_ref - L_tgt
+        # a silent signal can measure as -inf LUFS, making the difference inf or NaN;
+        # scaling by that would fill the audio with inf/NaN, so defer to the RMS path
+        if np.isfinite(delta_db):
+            gain = 10.0 ** (delta_db / 20.0)
+            stats.update({"ref_lufs": L_ref, "tgt_lufs_before": L_tgt, "applied_gain_db": delta_db})
+    if gain is None:
+        ra = _rms(ref.samples)
+        rb = _rms(target.samples)
+        if rb <= 1e-12:
+            # target is effectively silent: no finite gain can match it
+            return target, stats
+        gain = ra / rb
+        stats.update({"ref_rms": ra, "tgt_rms_before": rb, "applied_gain_db": 20*np.log10(max(gain,1e-12))})
+    y = target.samples.astype(np.float32) * gain
+    # simple peak "limiter" to keep headroom
+    limit = 10 ** (-headroom_db / 20.0)   # e.g., -1 dBFS
+    peak = float(np.max(np.abs(y))) if y.size else 0.0
+    limited = peak > limit
+    if limited:
+        y *= (limit / peak)
+    stats["post_peak_limited"] = limited
+    target.samples = y.astype(np.float32)
+    return target, stats
+# ---------- Stitch / fades / trims ----------
+def stitch_generated(chunks, sr: int, xfade_s: float) -> au.Waveform:
+    """Join generated chunks with an equal-power crossfade of `xfade_s` seconds.
+    The first `xfade_s` seconds of the first chunk are dropped; each later
+    chunk's head is blended into the current tail and its remainder appended.
+    Chunks shorter than the crossfade are skipped. With a non-positive
+    crossfade the chunks are simply concatenated. Samples may be 1-D (mono)
+    or (samples, channels).
+    Raises: ValueError if `chunks` is empty, the first chunk is shorter than
+    the crossfade, or the audio stitched so far is shorter than the crossfade.
+    """
+    if not chunks:
+        raise ValueError("no chunks")
+    xfade_n = int(round(xfade_s * sr))
+    if xfade_n <= 0:
+        return au.Waveform(np.concatenate([c.samples for c in chunks], axis=0), sr)
+    first = chunks[0].samples
+    if first.shape[0] < xfade_n:
+        raise ValueError("chunk shorter than crossfade prefix")
+    t = np.linspace(0, np.pi/2, xfade_n, endpoint=False, dtype=np.float32)
+    # shape the envelopes to the sample rank so mono 1-D audio is not broadcast into a matrix
+    env_shape = (xfade_n,) + (1,) * (first.ndim - 1)
+    eq_in, eq_out = np.sin(t).reshape(env_shape), np.cos(t).reshape(env_shape)
+    out = first[xfade_n:].copy()  # drop model pre-roll
+    for chunk in chunks[1:]:
+        cur = chunk.samples
+        if cur.shape[0] < xfade_n:
+            continue
+        if out.shape[0] < xfade_n:
+            # a shorter tail would mis-broadcast against the envelope instead of crossfading
+            raise ValueError("stitched audio shorter than crossfade length")
+        head, tail = cur[:xfade_n], cur[xfade_n:]
+        mixed = out[-xfade_n:] * eq_out + head * eq_in
+        out = np.concatenate([out[:-xfade_n], mixed, tail], axis=0)
+    return au.Waveform(out, sr)
+def hard_trim_seconds(wav: au.Waveform, seconds: float) -> au.Waveform:
+    """Return a new waveform holding at most the first `seconds` seconds of `wav`."""
+    # clamp at zero: a negative count would slice from the end instead of giving empty audio
+    n = max(0, int(round(seconds * wav.sample_rate)))
+    return au.Waveform(wav.samples[:n], wav.sample_rate)
+def apply_micro_fades(wav: au.Waveform, ms: int = 5) -> None:
+    """Apply a linear fade-in and fade-out of `ms` milliseconds to `wav.samples`, in place.
+    Nothing happens when the fade is zero samples long or the audio is not
+    longer than two fades. Samples may be 1-D (mono) or (samples, channels).
+    """
+    n = int(wav.sample_rate * ms / 1000.0)
+    if n > 0 and wav.samples.shape[0] > 2*n:
+        env = np.linspace(0.0, 1.0, n, dtype=np.float32)
+        # give the ramp one trailing axis per extra sample axis so it broadcasts for mono and multi-channel alike
+        env = env.reshape((n,) + (1,) * (wav.samples.ndim - 1))
+        wav.samples[:n]  *= env
+        wav.samples[-n:] *= env[::-1]
+# ---------- Token context helpers ----------
+def make_bar_aligned_context(tokens, bpm, fps=25, ctx_frames=250, beats_per_bar=4):
+    """Tile `tokens` along axis 0 and cut a `ctx_frames`-long window from the result.
+    When one bar spans a whole number of frames at `fps` (within 1e-3), the
+    window ends on the last complete bar of the tiled sequence; otherwise,
+    or when that would leave fewer than `ctx_frames` frames, the plain tail is used.
+    Raises: ValueError if `tokens` is empty or `bpm` is not positive.
+    """
+    n_tokens = len(tokens)
+    if n_tokens == 0:
+        raise ValueError("tokens must not be empty")
+    if bpm <= 0:
+        raise ValueError("bpm must be positive")
+    reps = int(np.ceil(ctx_frames / n_tokens))
+    tiled = np.tile(tokens, (reps, 1))
+    frames_per_bar_f = (beats_per_bar * 60.0 / bpm) * fps
+    frames_per_bar = int(round(frames_per_bar_f))
+    # no usable bar grid: bars are not a whole number of frames, or a bar rounds to zero frames
+    if frames_per_bar < 1 or abs(frames_per_bar - frames_per_bar_f) > 1e-3:
+        return tiled[-ctx_frames:]
+    end = (len(tiled) // frames_per_bar) * frames_per_bar
+    if end < ctx_frames:
+        return tiled[-ctx_frames:]
+    return tiled[end - ctx_frames:end]
+def take_bar_aligned_tail(wav: au.Waveform, bpm: float, beats_per_bar: int, ctx_seconds: float, max_bars=None) -> au.Waveform:
+    """Return the end of `wav` spanning a whole number of bars closest to `ctx_seconds`.
+    At least one bar is requested, optionally capped by `max_bars`. If that
+    span is at least as long as `wav`, `wav` itself is returned unchanged.
+    """
+    bar_seconds = (60.0 / bpm) * beats_per_bar
+    n_bars = max(1, int(round(ctx_seconds / bar_seconds)))
+    if max_bars is not None and max_bars < n_bars:
+        n_bars = max_bars
+    tail_len = int(round(n_bars * bar_seconds * wav.sample_rate))
+    if tail_len < wav.samples.shape[0]:
+        return au.Waveform(wav.samples[-tail_len:], wav.sample_rate)
+    return wav
+# ---------- SR normalize + snap ----------
+def resample_and_snap(x: np.ndarray, cur_sr: int, target_sr: int, seconds: float) -> np.ndarray:
+    """Resample `x` from `cur_sr` to `target_sr` and force an exact length.
+    1-D input is treated as a single channel. The result has
+    round(seconds * target_sr) rows: longer audio is truncated, shorter audio
+    is zero-padded at the end. Returned as float32.
+    """
+    frames = x[:, None] if x.ndim == 1 else x
+    if cur_sr != target_sr:
+        # reduce the rate ratio so the polyphase resampler gets the smallest up/down factors
+        common = gcd(cur_sr, target_sr)
+        frames = resample_poly(frames, target_sr // common, cur_sr // common, axis=0)
+    want = int(round(seconds * target_sr))
+    have = frames.shape[0]
+    if have > want:
+        frames = frames[:want, :]
+    elif have < want:
+        silence = np.zeros((want - have, frames.shape[1]), dtype=frames.dtype)
+        frames = np.concatenate([frames, silence], axis=0)
+    return frames.astype(np.float32, copy=False)
+# ---------- WAV encode ----------
+def wav_bytes_base64(x: np.ndarray, sr: int) -> tuple[str, int, int]:
+    """Encode `x` as a 32-bit float WAV at `sr` Hz and return it base64-encoded.
+    x: samples shaped (S, C); a 1-D array is treated as one channel.
+    Returns: (base64_wav, total_samples, channels).
+    """
+    frames = np.asarray(x)
+    if frames.ndim == 1:
+        # without this, mono input would be encoded and then fail on the channel count
+        frames = frames[:, None]
+    buf = io.BytesIO()
+    sf.write(buf, frames, sr, subtype="FLOAT", format="WAV")
+    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+    return b64, int(frames.shape[0]), int(frames.shape[1])