thecollabagepatch committed on
Commit
8cedcd0
·
1 Parent(s): 184daaa

reseed functionality

Browse files
Files changed (2) hide show
  1. app.py +94 -0
  2. jam_worker.py +164 -13
app.py CHANGED
@@ -594,6 +594,100 @@ def jam_update(session_id: str = Form(...),
594
  worker.update_knobs(guidance_weight=guidance_weight, temperature=temperature, topk=topk)
595
  return {"ok": True}
596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  @app.get("/jam/status")
598
  def jam_status(session_id: str):
599
  with jam_lock:
 
594
  worker.update_knobs(guidance_weight=guidance_weight, temperature=temperature, topk=topk)
595
  return {"ok": True}
596
 
597
@app.post("/jam/update_styles")
def jam_update_styles(session_id: str = Form(...),
                      styles: str = Form(""),
                      style_weights: str = Form(""),
                      loop_weight: float = Form(1.0),
                      use_current_mix_as_style: bool = Form(False)):
    """Replace a running jam's style vector with a new weighted blend.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        styles: Comma-separated style prompts; blank entries are ignored.
        style_weights: Comma-separated numbers matched to ``styles`` by
            position; blank entries are ignored and any style without a
            weight gets 1.0.
        loop_weight: Weight of the combined-loop embedding, used only when
            ``use_current_mix_as_style`` is true and the worker has a
            combined loop.
        use_current_mix_as_style: Also embed ``worker.params.combined_loop``.

    Returns:
        ``{"ok": True}`` once the new vector has been stored on the worker.

    Raises:
        HTTPException: 404 if the session is missing or its worker is dead;
            400 if ``style_weights`` is not numeric or nothing was supplied
            to blend.
    """
    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    # Validate the cheap textual input before doing any embedding work.
    # Blank entries are dropped here just as blank styles are dropped below,
    # so "1.0," no longer crashes on float("").
    try:
        sw = [float(x) for x in style_weights.split(",") if x.strip()] if style_weights else []
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail="style_weights must be comma-separated numbers",
        ) from None

    embeds, weights = [], []

    # Optionally re-embed from the current combined loop.
    if use_current_mix_as_style and worker.params.combined_loop is not None:
        embeds.append(worker.mrt.embed_style(worker.params.combined_loop))
        weights.append(float(loop_weight))

    extra = [s.strip() for s in (styles.split(",") if styles else []) if s.strip()]
    for i, s in enumerate(extra):
        embeds.append(worker.mrt.embed_style(s))
        weights.append(sw[i] if i < len(sw) else 1.0)

    # Without this guard np.sum([]) yields the scalar 0.0, which would be
    # installed as the worker's "style vector".
    if not embeds:
        raise HTTPException(status_code=400, detail="No styles provided")

    # Normalise so the weights sum to 1; a zero sum falls back to a divisor of 1.
    wsum = sum(weights) or 1.0
    weights = [w / wsum for w in weights]
    style_vec = np.sum([w * e for w, e in zip(weights, embeds)], axis=0).astype(np.float32)

    with worker._lock:
        worker.params.style_vec = style_vec

    return {"ok": True}
628
+
629
@app.post("/jam/reseed")
def jam_reseed(session_id: str = Form(...), loop_audio: UploadFile = File(None)):
    """Reseed a running jam's context from uploaded audio or its own stream.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        loop_audio: Optional uploaded audio file. When omitted, the worker's
            internal ``_stream`` buffer is used instead.

    Returns:
        ``{"ok": True}`` after ``worker.reseed_from_waveform`` has run.

    Raises:
        HTTPException: 404 if the session is missing or its worker is dead;
            400 if the upload is empty or there is no internal stream.
    """
    import os  # local import: only needed for temp-file cleanup below

    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    # Option 1: use the uploaded new "combined" bounce from the app.
    if loop_audio is not None:
        data = loop_audio.file.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty file")

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(data)
            path = tmp.name
        try:
            wav = au.Waveform.from_file(path).resample(worker.mrt.sample_rate).as_stereo()
        finally:
            # delete=False means nothing else removes the file; without this
            # every reseed request leaves a temp file behind.
            try:
                os.unlink(path)
            except OSError:
                pass
    else:
        # Option 2: reseed from what we've been streaming (the model side).
        # (Usually better to reseed from the Swift-side "combined" mix you trust.)
        s = getattr(worker, "_stream", None)
        if s is None or s.shape[0] == 0:
            raise HTTPException(status_code=400, detail="No internal stream to reseed from")
        wav = au.Waveform(s.astype(np.float32, copy=False), int(worker.mrt.sample_rate)).as_stereo()

    worker.reseed_from_waveform(wav)
    return {"ok": True}
656
+
657
@app.post("/jam/reseed_splice")
def jam_reseed_splice(
    session_id: str = Form(...),
    anchor_bars: float = Form(2.0),           # how much of the original to re-inject
    combined_audio: UploadFile = File(None),  # preferred: Swift supplies the current combined mix
):
    """Reseed a running jam by splicing original context with recent audio.

    Args:
        session_id: Key of the worker in ``jam_registry``.
        anchor_bars: Passed to ``worker.reseed_splice`` as the amount of the
            original context to keep.
        combined_audio: Optional uploaded audio file. When omitted, the
            worker's internal ``_stream`` buffer is used instead.

    Returns:
        ``{"ok": True, "anchor_bars": <float>}``.

    Raises:
        HTTPException: 404 if the session is missing or its worker is dead;
            400 if the upload is empty or no audio is available.
    """
    import os  # local import: only needed for temp-file cleanup below

    # Look the worker up under jam_lock, like the other /jam endpoints do.
    with jam_lock:
        worker = jam_registry.get(session_id)
        if worker is None or not worker.is_alive():
            raise HTTPException(status_code=404, detail="Session not found")

    # Build a waveform to reseed from.
    if combined_audio is not None:
        data = combined_audio.file.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty combined_audio")

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(data)
            path = tmp.name
        try:
            wav = au.Waveform.from_file(path).resample(worker.mrt.sample_rate).as_stereo()
        finally:
            # delete=False means nothing else removes the file; without this
            # every splice request leaves a temp file behind.
            try:
                os.unlink(path)
            except OSError:
                pass
    else:
        # Fallback: reseed from the model's internal stream (less ideal than the Swift-side bounce).
        s = getattr(worker, "_stream", None)
        if s is None or s.shape[0] == 0:
            raise HTTPException(status_code=400, detail="No audio available to reseed from")
        wav = au.Waveform(s.astype(np.float32, copy=False), int(worker.mrt.sample_rate)).as_stereo()

    # Perform the splice reseed.
    worker.reseed_splice(wav, anchor_bars=float(anchor_bars))
    return {"ok": True, "anchor_bars": float(anchor_bars)}
690
+
691
  @app.get("/jam/status")
692
  def jam_status(session_id: str):
693
  with jam_lock:
jam_worker.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass, field
4
  import numpy as np
5
  import soundfile as sf
6
  from magenta_rt import audio as au
7
-
8
  from utils import (
9
  match_loudness_to_reference, stitch_generated, hard_trim_seconds,
10
  apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
@@ -59,31 +59,38 @@ class JamWorker(threading.Thread):
59
  """Set up MRT context tokens from the combined loop audio"""
60
  try:
61
  from utils import make_bar_aligned_context, take_bar_aligned_tail
62
-
63
  codec_fps = float(self.mrt.codec.frame_rate)
64
  ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
65
-
66
  loop_for_context = take_bar_aligned_tail(
67
- self.params.combined_loop,
68
- self.params.bpm,
69
- self.params.beats_per_bar,
70
  ctx_seconds
71
  )
72
-
73
  tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
74
  tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
75
-
76
  context_tokens = make_bar_aligned_context(
77
- tokens,
78
- bpm=self.params.bpm,
79
  fps=int(self.mrt.codec.frame_rate),
80
- ctx_frames=self.mrt.config.context_length_frames,
81
  beats_per_bar=self.params.beats_per_bar
82
  )
83
-
 
84
  self.state.context_tokens = context_tokens
85
  print(f"✅ JamWorker: Set up fresh context from combined loop")
86
-
 
 
 
 
 
 
87
  except Exception as e:
88
  print(f"❌ Failed to setup context from combined loop: {e}")
89
 
@@ -189,6 +196,150 @@ class JamWorker(threading.Thread):
189
 
190
  self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  def run(self):
193
  """Continuous stream + sliding 8-bar window emitter."""
194
  sr_model = int(self.mrt.sample_rate)
 
4
  import numpy as np
5
  import soundfile as sf
6
  from magenta_rt import audio as au
7
+ from threading import RLock
8
  from utils import (
9
  match_loudness_to_reference, stitch_generated, hard_trim_seconds,
10
  apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
 
59
  """Set up MRT context tokens from the combined loop audio"""
60
  try:
61
  from utils import make_bar_aligned_context, take_bar_aligned_tail
62
+
63
  codec_fps = float(self.mrt.codec.frame_rate)
64
  ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
65
+
66
  loop_for_context = take_bar_aligned_tail(
67
+ self.params.combined_loop,
68
+ self.params.bpm,
69
+ self.params.beats_per_bar,
70
  ctx_seconds
71
  )
72
+
73
  tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
74
  tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
75
+
76
  context_tokens = make_bar_aligned_context(
77
+ tokens,
78
+ bpm=self.params.bpm,
79
  fps=int(self.mrt.codec.frame_rate),
80
+ ctx_frames=self.mrt.config.context_length_frames,
81
  beats_per_bar=self.params.beats_per_bar
82
  )
83
+
84
+ # Install fresh context
85
  self.state.context_tokens = context_tokens
86
  print(f"✅ JamWorker: Set up fresh context from combined loop")
87
+
88
+ # NEW: keep a copy of the *original* context tokens for future splice-reseed
89
+ # (guard so we only set this once, at jam start)
90
+ with self._lock:
91
+ if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
92
+ self._original_context_tokens = np.copy(context_tokens) # shape: [T, depth]
93
+
94
  except Exception as e:
95
  print(f"❌ Failed to setup context from combined loop: {e}")
96
 
 
196
 
197
  self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
198
 
199
def reseed_from_waveform(self, wav):
    """Rebuild the model state from ``wav`` and install it on this worker.

    A fresh state is created with ``self.mrt.init_state()``; its context
    tokens are the bar-aligned tail of ``wav``, encoded with the codec and
    cut to ``decoder_codec_rvq_depth`` codebooks. The retained stream is
    then trimmed by ``_prepare_stream_for_reseed_handoff``.

    Args:
        wav: Waveform to derive the new context from; passed to
            ``take_bar_aligned_tail`` and then ``self.mrt.codec.encode``.
    """
    from utils import take_bar_aligned_tail, make_bar_aligned_context

    # 1) Re-init state.
    new_state = self.mrt.init_state()

    # 2) Build bar-aligned context tokens from the provided audio. This part
    #    only reads codec/config/params, so it runs outside the lock.
    codec_fps = float(self.mrt.codec.frame_rate)
    ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps

    tail = take_bar_aligned_tail(wav, self.params.bpm, self.params.beats_per_bar, ctx_seconds)
    tokens_full = self.mrt.codec.encode(tail).astype(np.int32)
    tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
    context_tokens = make_bar_aligned_context(
        tokens,
        bpm=self.params.bpm,
        fps=int(self.mrt.codec.frame_rate),
        ctx_frames=self.mrt.config.context_length_frames,
        beats_per_bar=self.params.beats_per_bar,
    )
    new_state.context_tokens = context_tokens

    # 3) Swap state and trim the stream under the worker lock, the same way
    #    reseed_splice does, so both reseed paths mutate self.state and
    #    self._stream under identical locking.
    with self._lock:
        self.state = new_state
        self._prepare_stream_for_reseed_handoff()
219
+
220
def _frames_per_bar(self) -> int:
    """Return the number of codec frames in one bar, rounded to an integer.

    Derived from the codec frame rate and the session's ``bpm`` and
    ``beats_per_bar``.
    """
    frame_rate = float(self.mrt.codec.frame_rate)
    beat_seconds = 60.0 / float(self.params.bpm)
    bar_seconds = beat_seconds * float(self.params.beats_per_bar)
    return int(round(frame_rate * bar_seconds))
225
+
226
def _ctx_frames(self) -> int:
    """Return ``self.mrt.config.context_length_frames`` as an ``int``."""
    window = self.mrt.config.context_length_frames
    return int(window)
229
+
230
def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
    """Encode ``wav`` and return at most the last ``_ctx_frames()`` token rows.

    The codec output is cast to int32 and cut to the first
    ``decoder_codec_rvq_depth`` columns. No bar alignment is done here; the
    splicer handles that.
    """
    encoded = self.mrt.codec.encode(wav).astype(np.int32)          # [T, rvq_total]
    window = encoded[:, :self.mrt.config.decoder_codec_rvq_depth]  # [T, depth]
    limit = self._ctx_frames()
    # Shorter inputs are returned whole; longer ones keep only the tail.
    if window.shape[0] <= limit:
        return window
    return window[-limit:]
244
+
245
def _bar_aligned_tail(self, tokens: np.ndarray, bars: float) -> np.ndarray:
    """Return the last ``bars`` bars of ``tokens``, measured in codec frames.

    The frame count is ``_frames_per_bar() * bars`` rounded to the nearest
    whole frame, so fractional values such as 0.5 or 2.5 bars are honoured
    (previously the bar count itself was rounded, turning 0.5 into 0 bars).
    Integer bar counts give the same result as before.

    Args:
        tokens: Token array whose first axis is frames.
        bars: Number of bars to keep; zero or negative yields an empty slice.

    Returns:
        An empty slice, the tail slice, or ``tokens`` itself when it holds no
        more frames than requested.
    """
    frames_per_bar = self._frames_per_bar()
    want = max(int(round(frames_per_bar * float(bars))), 0)
    if want == 0:
        return tokens[:0]  # empty
    if tokens.shape[0] <= want:
        return tokens
    return tokens[-want:]
257
+
258
def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray,
                    anchor_bars: float) -> np.ndarray:
    """Build a context window from an original anchor plus recent tokens.

    The result is ``anchor`` followed by ``recent``:
      * anchor = ``_bar_aligned_tail(original_tokens, anchor_bars)``
      * recent = tail of ``recent_tokens`` filling the frames left over in
        the context window, cut to whole bars when at least one bar fits.
    The concatenation is then clamped to the last ``_ctx_frames()`` frames.

    Args:
        original_tokens: 2-D token array; its second axis sets the depth.
        recent_tokens: 2-D token array; extra depth columns are dropped.
        anchor_bars: Bars of ``original_tokens`` to keep at the front.

    Returns:
        The spliced token array.
        NOTE(review): it can be shorter than ``_ctx_frames()`` when the
        inputs are short or the remainder is not a whole number of bars;
        confirm the model accepts a shorter context.

    Raises:
        ValueError: From ``np.concatenate`` if ``recent_tokens`` has fewer
            depth columns than ``original_tokens``.
    """
    ctx_frames = self._ctx_frames()
    depth = original_tokens.shape[1]

    # Trim depth BEFORE concatenating: np.concatenate raises on a column
    # mismatch, so trimming afterwards (as before) could never take effect.
    if recent_tokens.ndim == 2 and recent_tokens.shape[1] > depth:
        recent_tokens = recent_tokens[:, :depth]

    # 1) Take bar-aligned tail from original.
    anchor = self._bar_aligned_tail(original_tokens, anchor_bars)  # [A, depth]

    # 2) Compute how many frames remain for recent.
    remain = max(ctx_frames - anchor.shape[0], 0)

    # 3) Take a recent tail not exceeding 'remain', rounded down to whole bars.
    if remain > 0:
        frames_per_bar = self._frames_per_bar()
        # Guard against a zero-frame bar so the floor division cannot fail.
        whole_bars = int(remain // frames_per_bar) if frames_per_bar > 0 else 0
        # If not even one bar fits, just take the exact frame remainder.
        want = whole_bars * frames_per_bar if whole_bars >= 1 else remain
        recent = recent_tokens[-want:] if recent_tokens.shape[0] > want else recent_tokens
    else:
        recent = recent_tokens[:0]

    # 4) Concat and clamp again (exact).
    if anchor.size or recent.size:
        out = np.concatenate([anchor, recent], axis=0)
    else:
        out = recent_tokens[-ctx_frames:]
    if out.shape[0] > ctx_frames:
        out = out[-ctx_frames:]
    # Final safety on depth (normally a no-op after the early trim).
    if out.shape[1] != depth:
        out = out[:, :depth]
    return out
298
+
299
def _prepare_stream_for_reseed_handoff(self):
    """Trim the retained stream to a crossfade-length tail and reset emission.

    The tail length is ``crossfade_length`` seconds at the model sample rate.
    If there is no stream, the stream is empty, or the crossfade length
    rounds to zero samples, ``self._stream`` is set to ``None``.
    ``self._next_emit_start`` is always reset to 0.
    """
    sr = int(self.mrt.sample_rate)
    xfade_s = float(self.mrt.config.crossfade_length)
    xfade_n = int(round(xfade_s * sr))

    stream = getattr(self, "_stream", None)
    if stream is not None and stream.shape[0] > 0 and xfade_n > 0:
        # Keep just a tail to crossfade with (the whole stream if it is shorter).
        tail = stream[-xfade_n:] if stream.shape[0] > xfade_n else stream
        self._stream = tail.copy()
    else:
        # xfade_n == 0 must land here: stream[-0:] is the WHOLE stream, which
        # would keep everything instead of "a tiny tail".
        self._stream = None

    # Start a new emission sequence aligned to the new context.
    self._next_emit_start = 0
317
+
318
def reseed_splice(self, recent_wav, anchor_bars: float):
    """Reseed by splicing saved original context tokens with recent audio.

    Runs entirely under ``self._lock``:
      - original = ``self._original_context_tokens`` (falls back to a copy
        of the current ``self.state.context_tokens`` if none was saved)
      - recent   = tokens encoded from ``recent_wav``
      - ``anchor_bars`` is forwarded to ``_splice_context`` to decide how
        much of the original is kept.
    A fresh model state carrying the spliced context replaces
    ``self.state``, the stream is trimmed for the handoff, and
    ``self._pending_drop_intro_bars`` is incremented by one.
    """
    with self._lock:
        # Fallback: if we somehow don't have originals, treat current as originals.
        if getattr(self, "_original_context_tokens", None) is None:
            self._original_context_tokens = np.copy(self.state.context_tokens)

        fresh_tokens = self._make_recent_tokens_from_wave(recent_wav)  # [T, depth]
        spliced = self._splice_context(
            self._original_context_tokens, fresh_tokens, anchor_bars
        )

        # Install the new context window on a brand-new state.
        replacement = self.mrt.init_state()
        replacement.context_tokens = spliced
        self.state = replacement

        self._prepare_stream_for_reseed_handoff()

        # Optional: ask streamer to drop an intro crossfade worth of audio right after reseed.
        already_pending = getattr(self, "_pending_drop_intro_bars", 0)
        self._pending_drop_intro_bars = already_pending + 1
342
+
343
  def run(self):
344
  """Continuous stream + sliding 8-bar window emitter."""
345
  sr_model = int(self.mrt.sample_rate)