Commit
·
8cedcd0
1
Parent(s):
184daaa
reseed functionality
Browse files- app.py +94 -0
- jam_worker.py +164 -13
app.py
CHANGED
@@ -594,6 +594,100 @@ def jam_update(session_id: str = Form(...),
|
|
594 |
worker.update_knobs(guidance_weight=guidance_weight, temperature=temperature, topk=topk)
|
595 |
return {"ok": True}
|
596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
597 |
@app.get("/jam/status")
|
598 |
def jam_status(session_id: str):
|
599 |
with jam_lock:
|
|
|
594 |
worker.update_knobs(guidance_weight=guidance_weight, temperature=temperature, topk=topk)
|
595 |
return {"ok": True}
|
596 |
|
597 |
+
@app.post("/jam/update_styles")
def jam_update_styles(session_id: str = Form(...),
                      styles: str = Form(""),
                      style_weights: str = Form(""),
                      loop_weight: float = Form(1.0),
                      use_current_mix_as_style: bool = Form(False)):
    """Replace a running jam session's style vector with a weighted blend.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        styles: Comma-separated style prompts; blank entries are ignored.
        style_weights: Comma-separated numbers matched by position to the
            non-blank prompts. Prompts without a weight default to 1.0.
        loop_weight: Weight of the combined-loop embedding, when it is used.
        use_current_mix_as_style: If true and the worker has a
            ``params.combined_loop``, embed it as an additional style source.

    Returns:
        ``{"ok": True}`` once the new vector is stored on the worker.

    Raises:
        HTTPException: 404 if the session is missing or its worker is not
            alive; 400 if the weights are malformed or non-finite, or if the
            request yields no style source at all.
    """
    import math  # function-scope import keeps this block self-contained

    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    # Validate the cheap textual inputs before doing any embedding work.
    prompts = [p.strip() for p in styles.split(",") if p.strip()] if styles else []
    try:
        prompt_weights = [float(x) for x in style_weights.split(",")] if style_weights else []
    except ValueError as exc:
        raise HTTPException(
            status_code=400,
            detail="style_weights must be a comma-separated list of numbers",
        ) from exc
    if not all(math.isfinite(w) for w in prompt_weights) or not math.isfinite(float(loop_weight)):
        raise HTTPException(status_code=400, detail="Style weights must be finite numbers")

    embeds, weights = [], []

    # Optionally re-embed from the current combined loop.
    if use_current_mix_as_style and worker.params.combined_loop is not None:
        embeds.append(worker.mrt.embed_style(worker.params.combined_loop))
        weights.append(float(loop_weight))

    for i, prompt in enumerate(prompts):
        embeds.append(worker.mrt.embed_style(prompt))
        weights.append(prompt_weights[i] if i < len(prompt_weights) else 1.0)

    # Without this guard np.sum([]) collapses to the scalar 0.0, which would
    # silently overwrite the session's style vector with a non-vector.
    if not embeds:
        raise HTTPException(status_code=400, detail="No styles provided")

    # Normalise so the weights sum to 1; a zero sum falls back to a divisor of 1.
    wsum = sum(weights) or 1.0
    weights = [w / wsum for w in weights]
    style_vec = np.sum([w * e for w, e in zip(weights, embeds)], axis=0).astype(np.float32)

    with worker._lock:
        worker.params.style_vec = style_vec

    return {"ok": True}
|
628 |
+
|
629 |
+
@app.post("/jam/reseed")
def jam_reseed(session_id: str = Form(...), loop_audio: UploadFile = File(None)):
    """Reseed a running jam session's model context from audio.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        loop_audio: Optional uploaded audio to reseed from. When omitted, the
            worker's internal ``_stream`` buffer is used instead.

    Returns:
        ``{"ok": True}`` after ``worker.reseed_from_waveform`` has run.

    Raises:
        HTTPException: 404 if the session is missing or its worker is not
            alive; 400 if the upload is empty or there is no internal stream.
    """
    import os  # function-scope import: only needed for temp-file cleanup

    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    if loop_audio is not None:
        # Option 1: use the uploaded "combined" bounce from the app.
        data = loop_audio.file.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty file")

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        path = tmp.name
        try:
            # Close (and therefore flush) the file before decoding it by path.
            with tmp:
                tmp.write(data)
            wav = au.Waveform.from_file(path).resample(worker.mrt.sample_rate).as_stereo()
        finally:
            # delete=False leaves the file behind, so remove it ourselves.
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup; never mask the real outcome
    else:
        # Option 2: reseed from what the model side has been streaming.
        s = getattr(worker, "_stream", None)
        if s is None or s.shape[0] == 0:
            raise HTTPException(status_code=400, detail="No internal stream to reseed from")
        wav = au.Waveform(s.astype(np.float32, copy=False), int(worker.mrt.sample_rate)).as_stereo()

    worker.reseed_from_waveform(wav)
    return {"ok": True}
|
656 |
+
|
657 |
+
@app.post("/jam/reseed_splice")
def jam_reseed_splice(
    session_id: str = Form(...),
    anchor_bars: float = Form(2.0),           # how much of the original to re-inject
    combined_audio: UploadFile = File(None),  # preferred: client supplies the current combined mix
):
    """Reseed a running jam session by splicing original and recent context.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        anchor_bars: Number of bars forwarded to ``worker.reseed_splice``.
        combined_audio: Optional uploaded audio providing the "recent" side.
            When omitted, the worker's internal ``_stream`` buffer is used.

    Returns:
        ``{"ok": True, "anchor_bars": <float>}``.

    Raises:
        HTTPException: 404 if the session is missing or its worker is not
            alive; 400 if ``anchor_bars`` is not finite, the upload is empty,
            or no audio is available to reseed from.
    """
    import math  # function-scope imports keep this block self-contained
    import os

    # Guard the registry lookup the same way the other /jam endpoints do.
    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    # NaN/inf would otherwise fail inside the worker's int(round(...)) and
    # surface as an opaque 500.
    if not math.isfinite(float(anchor_bars)):
        raise HTTPException(status_code=400, detail="anchor_bars must be a finite number")

    # Build a waveform to reseed from.
    if combined_audio is not None:
        data = combined_audio.file.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty combined_audio")

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        path = tmp.name
        try:
            # Close (and therefore flush) the file before decoding it by path.
            with tmp:
                tmp.write(data)
            wav = au.Waveform.from_file(path).resample(worker.mrt.sample_rate).as_stereo()
        finally:
            # delete=False leaves the file behind, so remove it ourselves.
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup; never mask the real outcome
    else:
        # Fallback: reseed from the model's internal stream.
        s = getattr(worker, "_stream", None)
        if s is None or s.shape[0] == 0:
            raise HTTPException(status_code=400, detail="No audio available to reseed from")
        wav = au.Waveform(s.astype(np.float32, copy=False), int(worker.mrt.sample_rate)).as_stereo()

    # Perform the splice reseed.
    worker.reseed_splice(wav, anchor_bars=float(anchor_bars))
    return {"ok": True, "anchor_bars": float(anchor_bars)}
|
690 |
+
|
691 |
@app.get("/jam/status")
|
692 |
def jam_status(session_id: str):
|
693 |
with jam_lock:
|
jam_worker.py
CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass, field
|
|
4 |
import numpy as np
|
5 |
import soundfile as sf
|
6 |
from magenta_rt import audio as au
|
7 |
-
|
8 |
from utils import (
|
9 |
match_loudness_to_reference, stitch_generated, hard_trim_seconds,
|
10 |
apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
|
@@ -59,31 +59,38 @@ class JamWorker(threading.Thread):
|
|
59 |
"""Set up MRT context tokens from the combined loop audio"""
|
60 |
try:
|
61 |
from utils import make_bar_aligned_context, take_bar_aligned_tail
|
62 |
-
|
63 |
codec_fps = float(self.mrt.codec.frame_rate)
|
64 |
ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
|
65 |
-
|
66 |
loop_for_context = take_bar_aligned_tail(
|
67 |
-
self.params.combined_loop,
|
68 |
-
self.params.bpm,
|
69 |
-
self.params.beats_per_bar,
|
70 |
ctx_seconds
|
71 |
)
|
72 |
-
|
73 |
tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
|
74 |
tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
|
75 |
-
|
76 |
context_tokens = make_bar_aligned_context(
|
77 |
-
tokens,
|
78 |
-
bpm=self.params.bpm,
|
79 |
fps=int(self.mrt.codec.frame_rate),
|
80 |
-
ctx_frames=self.mrt.config.context_length_frames,
|
81 |
beats_per_bar=self.params.beats_per_bar
|
82 |
)
|
83 |
-
|
|
|
84 |
self.state.context_tokens = context_tokens
|
85 |
print(f"✅ JamWorker: Set up fresh context from combined loop")
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
except Exception as e:
|
88 |
print(f"❌ Failed to setup context from combined loop: {e}")
|
89 |
|
@@ -189,6 +196,150 @@ class JamWorker(threading.Thread):
|
|
189 |
|
190 |
self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
|
191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
def run(self):
|
193 |
"""Continuous stream + sliding 8-bar window emitter."""
|
194 |
sr_model = int(self.mrt.sample_rate)
|
|
|
4 |
import numpy as np
|
5 |
import soundfile as sf
|
6 |
from magenta_rt import audio as au
|
7 |
+
from threading import RLock
|
8 |
from utils import (
|
9 |
match_loudness_to_reference, stitch_generated, hard_trim_seconds,
|
10 |
apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
|
|
|
59 |
"""Set up MRT context tokens from the combined loop audio"""
|
60 |
try:
|
61 |
from utils import make_bar_aligned_context, take_bar_aligned_tail
|
62 |
+
|
63 |
codec_fps = float(self.mrt.codec.frame_rate)
|
64 |
ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
|
65 |
+
|
66 |
loop_for_context = take_bar_aligned_tail(
|
67 |
+
self.params.combined_loop,
|
68 |
+
self.params.bpm,
|
69 |
+
self.params.beats_per_bar,
|
70 |
ctx_seconds
|
71 |
)
|
72 |
+
|
73 |
tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
|
74 |
tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
|
75 |
+
|
76 |
context_tokens = make_bar_aligned_context(
|
77 |
+
tokens,
|
78 |
+
bpm=self.params.bpm,
|
79 |
fps=int(self.mrt.codec.frame_rate),
|
80 |
+
ctx_frames=self.mrt.config.context_length_frames,
|
81 |
beats_per_bar=self.params.beats_per_bar
|
82 |
)
|
83 |
+
|
84 |
+
# Install fresh context
|
85 |
self.state.context_tokens = context_tokens
|
86 |
print(f"✅ JamWorker: Set up fresh context from combined loop")
|
87 |
+
|
88 |
+
# NEW: keep a copy of the *original* context tokens for future splice-reseed
|
89 |
+
# (guard so we only set this once, at jam start)
|
90 |
+
with self._lock:
|
91 |
+
if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
|
92 |
+
self._original_context_tokens = np.copy(context_tokens) # shape: [T, depth]
|
93 |
+
|
94 |
except Exception as e:
|
95 |
print(f"❌ Failed to setup context from combined loop: {e}")
|
96 |
|
|
|
196 |
|
197 |
self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
|
198 |
|
199 |
+
def reseed_from_waveform(self, wav):
    """Restart generation from a fresh model state seeded with `wav`.

    A new state is created with ``self.mrt.init_state()``, its context tokens
    are built from the bar-aligned tail of `wav`, and the state is installed
    on the worker before the stream buffer is prepared for the handoff.

    Args:
        wav: Waveform to derive the new context from; it is passed to
            ``take_bar_aligned_tail`` and then to the codec encoder.
    """
    from utils import take_bar_aligned_tail, make_bar_aligned_context

    # 1) Re-init state
    new_state = self.mrt.init_state()

    # 2) Build bar-aligned context tokens from the provided audio. This is the
    #    slow part, so it runs before the lock is taken.
    codec_fps = float(self.mrt.codec.frame_rate)
    ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps

    tail = take_bar_aligned_tail(wav, self.params.bpm, self.params.beats_per_bar, ctx_seconds)
    tokens_full = self.mrt.codec.encode(tail).astype(np.int32)
    tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
    context_tokens = make_bar_aligned_context(
        tokens,
        bpm=self.params.bpm,
        fps=int(self.mrt.codec.frame_rate),
        ctx_frames=self.mrt.config.context_length_frames,
        beats_per_bar=self.params.beats_per_bar,
    )
    new_state.context_tokens = context_tokens

    # 3) Swap the state and trim the stream under the worker lock, as
    #    reseed_splice does, so both changes land together.
    # NOTE(review): assumes callers never hold a non-reentrant self._lock
    # when calling this method -- confirm against the worker's __init__.
    with self._lock:
        self.state = new_state
        self._prepare_stream_for_reseed_handoff()
|
219 |
+
|
220 |
+
def _frames_per_bar(self) -> int:
    """Return the number of codec frames in one bar, rounded to an int.

    Uses the codec frame rate together with the session's ``bpm`` and
    ``beats_per_bar``.
    """
    frame_rate = float(self.mrt.codec.frame_rate)
    beat_seconds = 60.0 / float(self.params.bpm)
    bar_seconds = beat_seconds * float(self.params.beats_per_bar)
    return int(round(frame_rate * bar_seconds))
|
225 |
+
|
226 |
+
def _ctx_frames(self) -> int:
    """Return the model's configured context length in codec frames, as an int."""
    config = self.mrt.config
    return int(config.context_length_frames)
|
229 |
+
|
230 |
+
def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
    """Encode `wav` into int32 codec tokens limited to the context window.

    The encoder output is cut to the first ``decoder_codec_rvq_depth`` columns
    and, if it has more rows than ``self._ctx_frames()``, to its last
    ``ctx_frames`` rows. No bar alignment is applied here.
    """
    encoded = self.mrt.codec.encode(wav).astype(np.int32)
    trimmed = encoded[:, :self.mrt.config.decoder_codec_rvq_depth]

    n_frames = trimmed.shape[0]
    window = self._ctx_frames()
    # Keep only the most recent `window` frames when there are more than fit.
    return trimmed[-window:] if n_frames > window else trimmed
|
244 |
+
|
245 |
+
def _bar_aligned_tail(self, tokens: np.ndarray, bars: float) -> np.ndarray:
    """Return the last `bars` bars of `tokens`, measured in codec frames.

    `bars` is rounded to a whole number with ``round`` before being multiplied
    by ``self._frames_per_bar()``. A non-positive frame count yields an empty
    slice, and `tokens` is returned unchanged when it is no longer than the
    requested tail.
    """
    n_frames = self._frames_per_bar() * int(round(bars))
    if n_frames <= 0:
        return tokens[:0]  # empty, keeps the remaining dimensions
    return tokens if tokens.shape[0] <= n_frames else tokens[-n_frames:]
|
257 |
+
|
258 |
+
def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray,
                    anchor_bars: float) -> np.ndarray:
    """Build a context window from an original anchor followed by recent tokens.

    The result is the concatenation of:
      anchor = the tail of `original_tokens` selected by
               ``self._bar_aligned_tail(original_tokens, anchor_bars)``
      recent = the tail of `recent_tokens` filling the frames that remain,
               in whole bars when at least one bar fits
    It is finally clamped to the last ``self._ctx_frames()`` rows.

    Args:
        original_tokens: 2-D token array; its column count sets the depth.
        recent_tokens: 2-D token array; surplus columns are dropped.
        anchor_bars: Number of bars of `original_tokens` to keep.

    Returns:
        The spliced token array, with at most ``ctx_frames`` rows.
    """
    # NOTE(review): the result can be shorter than ctx_frames (nothing pads
    # it) -- confirm the model accepts a short context.
    ctx_frames = self._ctx_frames()
    depth = original_tokens.shape[1]

    # Drop surplus columns BEFORE concatenating: np.concatenate rejects
    # mismatched widths, so trimming the result afterwards can never help.
    if recent_tokens.shape[1] > depth:
        recent_tokens = recent_tokens[:, :depth]

    # 1) Take the bar-aligned tail from the original.
    anchor = self._bar_aligned_tail(original_tokens, anchor_bars)

    # 2) Compute how many frames remain for the recent material.
    remain = max(ctx_frames - anchor.shape[0], 0)

    # 3) Take the recent tail: whole bars when at least one fits, otherwise
    #    exactly the frames that remain.
    if remain > 0:
        frames_per_bar = self._frames_per_bar()
        # frames_per_bar can round to 0; avoid dividing by it.
        bars_fit = remain // frames_per_bar if frames_per_bar > 0 else 0
        want = bars_fit * frames_per_bar if bars_fit >= 1 else remain
        recent = recent_tokens[-want:] if recent_tokens.shape[0] > want else recent_tokens
    else:
        recent = recent_tokens[:0]

    # 4) Concatenate and clamp again (exact).
    if anchor.size or recent.size:
        out = np.concatenate([anchor, recent], axis=0)
    else:
        out = recent_tokens[-ctx_frames:]
    if out.shape[0] > ctx_frames:
        out = out[-ctx_frames:]
    return out
|
298 |
+
|
299 |
+
def _prepare_stream_for_reseed_handoff(self):
    """Shrink the stream buffer to a crossfade-length tail and rewind emission.

    The tail length is ``crossfade_length`` seconds converted to samples at
    the model sample rate. A missing or empty stream becomes ``None``.
    ``_next_emit_start`` is reset to 0 in every case.
    """
    sample_rate = int(self.mrt.sample_rate)
    xfade_samples = int(round(float(self.mrt.config.crossfade_length) * sample_rate))

    stream = getattr(self, "_stream", None)
    if stream is None or stream.shape[0] == 0:
        self._stream = None
    elif stream.shape[0] > xfade_samples:
        # Keep just enough audio to crossfade with the first post-reseed chunk.
        self._stream = stream[-xfade_samples:].copy()
    else:
        self._stream = stream.copy()

    # Start a new emission sequence aligned to the new context.
    self._next_emit_start = 0
|
317 |
+
|
318 |
+
def reseed_splice(self, recent_wav, anchor_bars: float):
    """Reseed by splicing the stored original context with recent tokens.

    - original: ``self._original_context_tokens``; if it is missing or None,
      a copy of the current ``state.context_tokens`` is stored and used.
    - recent: tokens encoded from `recent_wav`.
    - anchor_bars: how many bars of the original are re-injected.

    The spliced context is installed on a state from ``self.mrt.init_state()``,
    the stream buffer is prepared for the handoff, and
    ``_pending_drop_intro_bars`` is incremented by one.
    """
    # NOTE(review): indentation was lost in the source this was rebuilt from;
    # the whole body is assumed to run under self._lock -- confirm.
    with self._lock:
        if getattr(self, "_original_context_tokens", None) is None:
            # Fallback: with no stored originals, treat the current context as original.
            self._original_context_tokens = np.copy(self.state.context_tokens)

        recent = self._make_recent_tokens_from_wave(recent_wav)
        spliced = self._splice_context(self._original_context_tokens, recent, anchor_bars)

        # Install the new context window on a fresh state.
        fresh_state = self.mrt.init_state()
        fresh_state.context_tokens = spliced
        self.state = fresh_state

        self._prepare_stream_for_reseed_handoff()

        # Counter bumped once per splice reseed.
        pending = getattr(self, "_pending_drop_intro_bars", 0)
        self._pending_drop_intro_bars = pending + 1
|
342 |
+
|
343 |
def run(self):
|
344 |
"""Continuous stream + sliding 8-bar window emitter."""
|
345 |
sr_model = int(self.mrt.sample_rate)
|