Spaces:

thecollabagepatch
/

magenta-retry

Paused

App Files Community

thecollabagepatch committed 19 days ago

Commit

4bdf506

1 Parent(s): db53efe

ok one last try

Browse files

Files changed (2) hide show

jam_worker.py +404 -355
utils.py +62 -36

jam_worker.py CHANGED Viewed

@@ -1,5 +1,5 @@
-# jam_worker.py - SIMPLE FIX VERSION
-import threading, time, base64, io, uuid
 from dataclasses import dataclass, field
 import numpy as np
 import soundfile as sf
@@ -8,7 +8,7 @@ from threading import RLock
 from utils import (
     match_loudness_to_reference, stitch_generated, hard_trim_seconds,
     apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
-    resample_and_snap, wav_bytes_base64
 )
 @dataclass
@@ -32,6 +32,34 @@ class JamChunk:
     audio_base64: str
     metadata: dict
 class JamWorker(threading.Thread):
     def __init__(self, mrt, params: JamParams):
         super().__init__(daemon=True)
@@ -39,9 +67,32 @@ class JamWorker(threading.Thread):
         self.params = params
         self.state = mrt.init_state()
-        # ✅ init synchronization + placeholders FIRST
         self._lock = threading.Lock()
-        self._original_context_tokens = None   # so hasattr checks are cheap/clear
         if params.combined_loop is not None:
             self._setup_context_from_combined_loop()
@@ -50,28 +101,39 @@ class JamWorker(threading.Thread):
         self.outbox: list[JamChunk] = []
         self._stop_event = threading.Event()
         self._stream = None
-        self._next_emit_start = 0
-        # NEW: Track delivery state
         self._last_delivered_index = 0
         self._max_buffer_ahead = 5
         # Timing info
         self.last_chunk_started_at = None
         self.last_chunk_completed_at = None
-        self._pending_reseed = None        # {"ctx": np.ndarray, "ref": au.Waveform|None}
-        self._needs_bar_realign = False    # request a one-shot downbeat alignment
-        self._reseed_ref_loop = None       # which loop to align against after reseed
     def _setup_context_from_combined_loop(self):
         """Set up MRT context tokens from the combined loop audio"""
         try:
             from utils import make_bar_aligned_context, take_bar_aligned_tail
-            codec_fps = float(self.mrt.codec.frame_rate)
             ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
             loop_for_context = take_bar_aligned_tail(
@@ -84,452 +146,381 @@ class JamWorker(threading.Thread):
             tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
             tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
             context_tokens = make_bar_aligned_context(
                 tokens,
                 bpm=self.params.bpm,
-                fps=float(self.mrt.codec.frame_rate),  # keep fractional fps
                 ctx_frames=self.mrt.config.context_length_frames,
-                beats_per_bar=self.params.beats_per_bar
             )
-            # Install fresh context
             self.state.context_tokens = context_tokens
-            print(f"✅ JamWorker: Set up fresh context from combined loop")
-            # NEW: keep a copy of the *original* context tokens for future splice-reseed
-            # (guard so we only set this once, at jam start)
             with self._lock:
                 if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
-                    self._original_context_tokens = np.copy(context_tokens)  # shape: [T, depth]
         except Exception as e:
-            print(f"❌ Failed to setup context from combined loop: {e}")
     def stop(self):
         self._stop_event.set()
     def update_knobs(self, *, guidance_weight=None, temperature=None, topk=None):
         with self._lock:
-            if guidance_weight is not None: self.params.guidance_weight = float(guidance_weight)
-            if temperature is not None:     self.params.temperature     = float(temperature)
-            if topk is not None:            self.params.topk            = int(topk)
     def get_next_chunk(self) -> JamChunk | None:
         """Get the next sequential chunk (blocks/waits if not ready)"""
         target_index = self._last_delivered_index + 1
-        # Wait for the target chunk to be ready (with timeout)
-        max_wait = 30.0  # seconds
         start_time = time.time()
         while time.time() - start_time < max_wait and not self._stop_event.is_set():
             with self._lock:
-                # Look for the exact chunk we need
                 for chunk in self.outbox:
                     if chunk.index == target_index:
                         self._last_delivered_index = target_index
-                        print(f"📦 Delivered chunk {target_index}")
                         return chunk
-            # Not ready yet, wait a bit
             time.sleep(0.1)
-        # Timeout or stopped
         return None
     def mark_chunk_consumed(self, chunk_index: int):
         """Mark a chunk as consumed by the frontend"""
         with self._lock:
             self._last_delivered_index = max(self._last_delivered_index, chunk_index)
-            print(f"✅ Chunk {chunk_index} consumed")
     def _should_generate_next_chunk(self) -> bool:
-        """Check if we should generate the next chunk (don't get too far ahead)"""
         with self._lock:
-            # Don't generate if we're already too far ahead
-            if self.idx > self._last_delivered_index + self._max_buffer_ahead:
-                return False
-            return True
-    def _seconds_per_bar(self) -> float:
-        return self.params.beats_per_bar * (60.0 / self.params.bpm)
-    def _snap_and_encode(self, y, seconds, target_sr, bars):
-        cur_sr = int(self.mrt.sample_rate)
-        x = y.samples if y.samples.ndim == 2 else y.samples[:, None]
-        x = resample_and_snap(x, cur_sr=cur_sr, target_sr=target_sr, seconds=seconds)
-        b64, total_samples, channels = wav_bytes_base64(x, target_sr)
-        meta = {
-            "bpm": int(round(self.params.bpm)),
-            "bars": int(bars),
-            "beats_per_bar": int(self.params.beats_per_bar),
-            "sample_rate": int(target_sr),
-            "channels": channels,
-            "total_samples": total_samples,
-            "seconds_per_bar": self._seconds_per_bar(),
-            "loop_duration_seconds": bars * self._seconds_per_bar(),
-            "guidance_weight": self.params.guidance_weight,
-            "temperature": self.params.temperature,
-            "topk": self.params.topk,
-        }
-        return b64, meta
     def _append_model_chunk_to_stream(self, wav):
-        """Incrementally append a model chunk with equal-power crossfade."""
         xfade_s = float(self.mrt.config.crossfade_length)
-        sr = int(self.mrt.sample_rate)
         xfade_n = int(round(xfade_s * sr))
         s = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
-        if getattr(self, "_stream", None) is None:
-            # First chunk: drop model pre-roll (xfade head)
             if s.shape[0] > xfade_n:
                 self._stream = s[xfade_n:].astype(np.float32, copy=True)
             else:
                 self._stream = np.zeros((0, s.shape[1]), dtype=np.float32)
-            self._next_emit_start = 0  # pointer into _stream (model SR samples)
             return
-        # Crossfade last xfade_n samples of _stream with head of new s
         if s.shape[0] <= xfade_n or self._stream.shape[0] < xfade_n:
-            # Degenerate safeguard
             self._stream = np.concatenate([self._stream, s], axis=0)
             return
         tail = self._stream[-xfade_n:]
         head = s[:xfade_n]
-        # Equal-power envelopes
         t = np.linspace(0, np.pi/2, xfade_n, endpoint=False, dtype=np.float32)[:, None]
         eq_in, eq_out = np.sin(t), np.cos(t)
         mixed = tail * eq_out + head * eq_in
         self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
     def reseed_from_waveform(self, wav):
-        # 1) Re-init state
         new_state = self.mrt.init_state()
-        # 2) Build bar-aligned context tokens from provided audio
-        codec_fps   = float(self.mrt.codec.frame_rate)
         ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
-        from utils import take_bar_aligned_tail, make_bar_aligned_context
         tail = take_bar_aligned_tail(wav, self.params.bpm, self.params.beats_per_bar, ctx_seconds)
         tokens_full = self.mrt.codec.encode(tail).astype(np.int32)
         tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
-        context_tokens = make_bar_aligned_context(tokens,
-            bpm=self.params.bpm, fps=float(self.mrt.codec.frame_rate),
             ctx_frames=self.mrt.config.context_length_frames,
-            beats_per_bar=self.params.beats_per_bar
         )
         new_state.context_tokens = context_tokens
         self.state = new_state
-        self._prepare_stream_for_reseed_handoff()
-    def _frames_per_bar(self) -> int:
-        # codec frame-rate (frames/s) -> frames per musical bar
-        fps = float(self.mrt.codec.frame_rate)
-        sec_per_bar = (60.0 / float(self.params.bpm)) * float(self.params.beats_per_bar)
-        return int(round(fps * sec_per_bar))
-    def _ctx_frames(self) -> int:
-        # how many codec frames fit in the model’s conditioning window
-        return int(self.mrt.config.context_length_frames)
-    def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
-        """
-        Encode waveform and produce a BAR-ALIGNED context token window.
-        """
-        tokens_full = self.mrt.codec.encode(wav).astype(np.int32)           # [T, rvq_total]
-        tokens      = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
-        from utils import make_bar_aligned_context
-        ctx = make_bar_aligned_context(
-            tokens,
-            bpm=self.params.bpm,
-            fps=float(self.mrt.codec.frame_rate),  # keep fractional fps
-            ctx_frames=self.mrt.config.context_length_frames,
-            beats_per_bar=self.params.beats_per_bar
-        )
-        return ctx
-    def _bar_aligned_tail(self, tokens: np.ndarray, bars: float) -> np.ndarray:
-        """
-        Take a tail slice that is an integer number of codec frames corresponding to `bars`.
-        We round to nearest frame to stay phase-consistent with codec grid.
-        """
-        frames_per_bar = self._frames_per_bar()
-        want = max(frames_per_bar * int(round(bars)), 0)
-        if want == 0:
-            return tokens[:0]  # empty
-        if tokens.shape[0] <= want:
-            return tokens
-        return tokens[-want:]
-    def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray,
-                    anchor_bars: float) -> np.ndarray:
-        import math
-        ctx_frames = self._ctx_frames()
-        depth = original_tokens.shape[1]
-        frames_per_bar = self._frames_per_bar()
-        # 1) Anchor tail (whole bars)
-        anchor = self._bar_aligned_tail(original_tokens, math.floor(anchor_bars))
-        # 2) Fill remainder with recent (prefer whole bars)
-        a = anchor.shape[0]
-        remain = max(ctx_frames - a, 0)
-        recent = recent_tokens[:0]
-        used_recent = 0  # frames taken from the END of recent_tokens
-        if remain > 0:
-            bars_fit = remain // frames_per_bar
-            if bars_fit >= 1:
-                want_recent_frames = int(bars_fit * frames_per_bar)
-                used_recent = min(want_recent_frames, recent_tokens.shape[0])
-                recent = recent_tokens[-used_recent:] if used_recent > 0 else recent_tokens[:0]
-            else:
-                used_recent = min(remain, recent_tokens.shape[0])
-                recent = recent_tokens[-used_recent:] if used_recent > 0 else recent_tokens[:0]
-        # 3) Concat in order [anchor, recent]
-        if anchor.size or recent.size:
-            out = np.concatenate([anchor, recent], axis=0)
-        else:
-            # fallback: just take the last ctx window from recent
-            out = recent_tokens[-ctx_frames:]
-        # 4) Trim if we overshot
-        if out.shape[0] > ctx_frames:
-            out = out[-ctx_frames:]
-        # 5) Snap the **END** to the nearest LOWER bar boundary
-        if frames_per_bar > 0:
-            max_bar_aligned = (out.shape[0] // frames_per_bar) * frames_per_bar
-        else:
-            max_bar_aligned = out.shape[0]
-        if max_bar_aligned > 0 and out.shape[0] != max_bar_aligned:
-            out = out[-max_bar_aligned:]
-        # 6) Left-fill to reach ctx_frames **without moving the END**
-        deficit = ctx_frames - out.shape[0]
-        if deficit > 0:
-            left_parts = []
-            # Prefer frames immediately BEFORE the region we used from 'recent_tokens'
-            if used_recent < recent_tokens.shape[0]:
-                take = min(deficit, recent_tokens.shape[0] - used_recent)
-                if used_recent > 0:
-                    left_parts.append(recent_tokens[-(used_recent + take) : -used_recent])
-                else:
-                    left_parts.append(recent_tokens[-take:])
-            # Then take frames immediately BEFORE the 'anchor' in original_tokens
-            if sum(p.shape[0] for p in left_parts) < deficit and anchor.shape[0] > 0:
-                need = deficit - sum(p.shape[0] for p in left_parts)
-                a_len = anchor.shape[0]
-                avail = max(original_tokens.shape[0] - a_len, 0)
-                take2 = min(need, avail)
-                if take2 > 0:
-                    left_parts.append(original_tokens[-(a_len + take2) : -a_len])
-            # Still short? tile from what's available
-            have = sum(p.shape[0] for p in left_parts)
-            if have < deficit:
-                base = out if out.shape[0] > 0 else (recent_tokens if recent_tokens.shape[0] > 0 else original_tokens)
-                reps = int(np.ceil((deficit - have) / max(1, base.shape[0])))
-                left_parts.append(np.tile(base, (reps, 1))[: (deficit - have)])
-            left = np.concatenate(left_parts, axis=0)
-            out = np.concatenate([left[-deficit:], out], axis=0)
-        # 7) Final guard to exact length
-        if out.shape[0] > ctx_frames:
-            out = out[-ctx_frames:]
-        elif out.shape[0] < ctx_frames:
-            reps = int(np.ceil(ctx_frames / max(1, out.shape[0])))
-            out = np.tile(out, (reps, 1))[-ctx_frames:]
-        # 8) Depth guard
-        if out.shape[1] != depth:
-            out = out[:, :depth]
-        return out
-    def _realign_emit_pointer_to_bar(self, sr_model: int):
-        """Advance _next_emit_start to the next bar boundary in model-sample space."""
-        bar_samps = int(round(self._seconds_per_bar() * sr_model))
-        if bar_samps <= 0:
-            return
-        phase = self._next_emit_start % bar_samps
-        if phase != 0:
-            self._next_emit_start += (bar_samps - phase)
-    def _prepare_stream_for_reseed_handoff(self):
-        # OLD: keep crossfade tail -> causes phase offset
-        # sr = int(self.mrt.sample_rate)
-        # xfade_s = float(self.mrt.config.crossfade_length)
-        # xfade_n = int(round(xfade_s * sr))
-        # if getattr(self, "_stream", None) is not None and self._stream.shape[0] > 0:
-        #     tail = self._stream[-xfade_n:] if self._stream.shape[0] > xfade_n else self._stream
-        #     self._stream = tail.copy()
-        # else:
-        #     self._stream = None
-        # NEW: throw away the tail completely; start fresh
         self._stream = None
-        self._next_emit_start = 0
         self._needs_bar_realign = True
     def reseed_splice(self, recent_wav, anchor_bars: float):
-        """
-        Token-splice reseed queued for the next bar boundary between chunks.
-        """
         with self._lock:
             if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
                 self._original_context_tokens = np.copy(self.state.context_tokens)
-            recent_tokens = self._make_recent_tokens_from_wave(recent_wav)  # [T, depth]
             new_ctx = self._splice_context(self._original_context_tokens, recent_tokens, anchor_bars)
-            # Queue it; the run loop will install right after we finish the current slice
             self._pending_reseed = {"ctx": new_ctx, "ref": recent_wav}
-            # install the new context window
             new_state = self.mrt.init_state()
             new_state.context_tokens = new_ctx
             self.state = new_state
-            self._prepare_stream_for_reseed_handoff()
-            # optional: ask streamer to drop an intro crossfade worth of audio right after reseed
-            self._pending_drop_intro_bars = getattr(self, "_pending_drop_intro_bars", 0) + 1
     def run(self):
-        """Main worker loop — generate into a continuous stream, then emit bar-aligned slices."""
-        spb = self._seconds_per_bar()                     # seconds per bar
-        chunk_secs = self.params.bars_per_chunk * spb
-        xfade = float(self.mrt.config.crossfade_length)   # seconds
-        sr = int(self.mrt.sample_rate)
-        chunk_samps = int(round(chunk_secs * sr))
-        def _need(first_chunk_extra=False):
-            """How many more samples we still need in the stream to emit next slice."""
-            have = 0 if getattr(self, "_stream", None) is None else self._stream.shape[0] - getattr(self, "_next_emit_start", 0)
-            want = chunk_samps
             if first_chunk_extra:
-                # reserve two bars extra so first-chunk onset alignment has material
-                want += int(round(2 * spb * sr))
-            return max(0, want - have)
-        def _mono_env(x: np.ndarray, sr: int, win_ms: float = 10.0) -> np.ndarray:
-            if x.ndim == 2: x = x.mean(axis=1)
-            x = np.abs(x).astype(np.float32)
-            w = max(1, int(round(win_ms * 1e-3 * sr)))
-            if w > 1:
-                kern = np.ones(w, dtype=np.float32) / float(w)
-                x = np.convolve(x, kern, mode="same")
-            d = np.diff(x, prepend=x[:1])
-            d[d < 0] = 0.0
-            return d
-        def _estimate_first_offset_samples(ref_loop_wav, gen_head_wav, sr: int, spb: float) -> int:
-            """Tempo-aware first-downbeat offset (positive => model late)."""
-            try:
-                max_ms = int(max(160.0, min(0.25 * spb * 1000.0, 450.0)))
-                ref = ref_loop_wav if ref_loop_wav.sample_rate == sr else ref_loop_wav.resample(sr)
-                n_bar = int(round(spb * sr))
-                ref_tail = ref.samples[-n_bar:, :] if ref.samples.shape[0] >= n_bar else ref.samples
-                gen_head = gen_head_wav.samples[: int(2 * n_bar), :]
-                if ref_tail.size == 0 or gen_head.size == 0:
-                    return 0
-                # envelopes + z-score
-                import numpy as np
-                def _z(a):
-                    m, s = float(a.mean()), float(a.std() or 1.0); return (a - m) / s
-                e_ref = _z(_mono_env(ref_tail, sr)).astype(np.float32)
-                e_gen = _z(_mono_env(gen_head, sr)).astype(np.float32)
-                # upsample x4 for finer lag
-                def _upsample(a, r=4):
-                    n = len(a); grid = np.arange(n, dtype=np.float32)
-                    fine = np.linspace(0, n - 1, num=n * r, dtype=np.float32)
-                    return np.interp(fine, grid, a).astype(np.float32)
-                up = 4
-                e_ref_u, e_gen_u = _upsample(e_ref, up), _upsample(e_gen, up)
-                max_lag_u = int(round((max_ms / 1000.0) * sr * up))
-                seg = min(len(e_ref_u), len(e_gen_u))
-                e_ref_u = e_ref_u[-seg:]
-                pad = np.zeros(max_lag_u, dtype=np.float32)
-                e_gen_u_pad = np.concatenate([pad, e_gen_u, pad])
-                best_lag_u, best_score = 0, -1e9
-                for lag_u in range(-max_lag_u, max_lag_u + 1):
-                    start = max_lag_u + lag_u
-                    b = e_gen_u_pad[start : start + seg]
-                    denom = (np.linalg.norm(e_ref_u) * np.linalg.norm(b)) or 1.0
-                    score = float(np.dot(e_ref_u, b) / denom)
-                    if score > best_score:
-                        best_score, best_lag_u = score, lag_u
-                return int(round(best_lag_u / up))
-            except Exception:
-                return 0
-        print("🚀 JamWorker started (bar-aligned streaming)…")
         while not self._stop_event.is_set():
             if not self._should_generate_next_chunk():
                 time.sleep(0.25)
                 continue
-            # 1) Generate until we have enough material in the stream
-            need = _need(first_chunk_extra=(self.idx == 0))
-            while need > 0 and not self._stop_event.is_set():
                 with self._lock:
                     style_vec = self.params.style_vec
                     self.mrt.guidance_weight = float(self.params.guidance_weight)
-                    self.mrt.temperature     = float(self.params.temperature)
-                    self.mrt.topk            = int(self.params.topk)
                 wav, self.state = self.mrt.generate_chunk(state=self.state, style=style_vec)
-                self._append_model_chunk_to_stream(wav)   # equal-power xfade into a persistent stream
-                need = _need(first_chunk_extra=(self.idx == 0))
             if self._stop_event.is_set():
                 break
-            # 2) One-time: align the emit pointer to the groove
             if (self.idx == 0 and self.params.combined_loop is not None) or self._needs_bar_realign:
                 ref_loop = self._reseed_ref_loop or self.params.combined_loop
                 if ref_loop is not None:
-                    head_len = min(self._stream.shape[0] - self._next_emit_start, int(round(2 * spb * sr)))
-                    seg = self._stream[self._next_emit_start : self._next_emit_start + head_len]
-                    gen_head = au.Waveform(seg.astype(np.float32, copy=False), sr).as_stereo()
-                    offs = _estimate_first_offset_samples(ref_loop, gen_head, sr, spb)
-                    if offs != 0:
-                        self._next_emit_start = max(0, self._next_emit_start + offs)
-                        print(f"🎯 Offset compensation: {offs/sr:+.3f}s")
-                    self._realign_emit_pointer_to_bar(sr)
                 self._needs_bar_realign = False
                 self._reseed_ref_loop = None
-            # 3) Emit exactly bars_per_chunk × spb from the stream
-            start = self._next_emit_start
-            end = start + chunk_samps
-            if end > self._stream.shape[0]:
-                # shouldn't happen often; generate a bit more and loop
-                continue
-            slice_ = self._stream[start:end]
-            self._next_emit_start = end
-            y = au.Waveform(slice_.astype(np.float32, copy=False), sr).as_stereo()
-            # 4) Post-processing / loudness
             if self.idx == 0 and self.params.ref_loop is not None:
                 y, _ = match_loudness_to_reference(
                     self.params.ref_loop, y,
@@ -539,38 +530,96 @@ class JamWorker(threading.Thread):
             else:
                 apply_micro_fades(y, 3)
-            # 5) Resample + exact-length snap + encode
-            b64, meta = self._snap_and_encode(
-                y, seconds=chunk_secs, target_sr=self.params.target_sr, bars=self.params.bars_per_chunk
-            )
-            meta["xfade_seconds"] = xfade
-            # 6) Publish
             with self._lock:
                 self.idx += 1
-                self.outbox.append(JamChunk(index=self.idx, audio_base64=b64, metadata=meta))
                 if len(self.outbox) > 10:
                     cutoff = self._last_delivered_index - 5
                     self.outbox = [ch for ch in self.outbox if ch.index > cutoff]
-                # 👉 If a reseed was requested, apply it *now*, between chunks
                 if self._pending_reseed is not None:
                     pkg = self._pending_reseed
                     self._pending_reseed = None
                     new_state = self.mrt.init_state()
-                    new_state.context_tokens = pkg["ctx"]          # exact (ctx_frames, depth)
                     self.state = new_state
-                    # start a fresh stream and schedule one-time alignment
                     self._stream = None
-                    self._next_emit_start = 0
-                    self._reseed_ref_loop = pkg.get("ref") or self.params.combined_loop
                     self._needs_bar_realign = True
-                    print("🔁 Reseed installed at bar boundary; will realign before next slice")
-            print(f"✅ Completed chunk {self.idx}")
-        print("🛑 JamWorker stopped")

+# jam_worker.py - COMPREHENSIVE REWRITE FOR PRECISE TIMING
+import threading, time, base64, io, uuid, math
 from dataclasses import dataclass, field
 import numpy as np
 import soundfile as sf
 from utils import (
     match_loudness_to_reference, stitch_generated, hard_trim_seconds,
     apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
+    resample_and_snap, wav_bytes_base64, StreamingResampler
 )
 @dataclass
     audio_base64: str
     metadata: dict
+@dataclass
+class TimingState:
+    """Precise timing state tracking"""
+    # Fractional bar position (never rounded until final emission)
+    emit_position_bars: float = 0.0
+    # Sample-accurate positions in the stream
+    stream_position_samples: int = 0
+    # Accumulated timing error for correction
+    fractional_error_bars: float = 0.0
+    # Codec frame timing
+    frames_per_bar: float = 0.0
+    samples_per_bar: float = 0.0
+    def advance_by_bars(self, bars: float):
+        """Advance timing by exact fractional bars"""
+        self.emit_position_bars += bars
+        self.fractional_error_bars += bars - int(bars)
+        # Correct for accumulated error when it gets significant
+        if abs(self.fractional_error_bars) > 0.5:
+            correction = int(round(self.fractional_error_bars))
+            self.fractional_error_bars -= correction
+            return correction  # bars to skip/rewind
+        return 0
 class JamWorker(threading.Thread):
     def __init__(self, mrt, params: JamParams):
         super().__init__(daemon=True)
         self.params = params
         self.state = mrt.init_state()
+        # Core timing calculations (keep as floats for precision)
+        self._codec_fps = float(self.mrt.codec.frame_rate)  # 25.0
+        self._model_sr = int(self.mrt.sample_rate)          # 48000
+        self._target_sr = int(params.target_sr)
+        # Critical: these stay as floats to preserve fractional precision
+        self._seconds_per_bar = float(params.beats_per_bar * 60.0 / params.bpm)
+        self._frames_per_bar = self._seconds_per_bar * self._codec_fps
+        self._samples_per_bar_model = self._seconds_per_bar * self._model_sr
+        self._samples_per_bar_target = self._seconds_per_bar * self._target_sr
+        # Timing state
+        self._timing = TimingState(
+            frames_per_bar=self._frames_per_bar,
+            samples_per_bar=self._samples_per_bar_model
+        )
+        # Warn about problematic BPMs
+        frame_error = abs(self._frames_per_bar - round(self._frames_per_bar))
+        if frame_error > 0.01:
+            print(f"⚠️ Warning: {params.bpm} BPM creates {frame_error:.3f} frame drift per bar")
+            print(f"   This may cause gradual timing drift in long jams")
+        # Synchronization + placeholders
         self._lock = threading.Lock()
+        self._original_context_tokens = None
         if params.combined_loop is not None:
             self._setup_context_from_combined_loop()
         self.outbox: list[JamChunk] = []
         self._stop_event = threading.Event()
+        # Stream state
         self._stream = None
+        self._stream_write_pos = 0  # Where we append new model output
+        # Delivery tracking
         self._last_delivered_index = 0
         self._max_buffer_ahead = 5
+        # Streaming resampler for precise SR conversion
+        self._resampler = None
+        if self._target_sr != self._model_sr:
+            self._resampler = StreamingResampler(
+                in_sr=self._model_sr,
+                out_sr=self._target_sr,
+                channels=2,
+                quality="VHQ"
+            )
         # Timing info
         self.last_chunk_started_at = None
         self.last_chunk_completed_at = None
+        # Control flags
+        self._pending_reseed = None
+        self._needs_bar_realign = False
+        self._reseed_ref_loop = None
     def _setup_context_from_combined_loop(self):
         """Set up MRT context tokens from the combined loop audio"""
         try:
             from utils import make_bar_aligned_context, take_bar_aligned_tail
+            codec_fps = self._codec_fps
             ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
             loop_for_context = take_bar_aligned_tail(
             tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
             tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
+            # Use enhanced context alignment for fractional BPMs
             context_tokens = make_bar_aligned_context(
                 tokens,
                 bpm=self.params.bpm,
+                fps=self._codec_fps,
                 ctx_frames=self.mrt.config.context_length_frames,
+                beats_per_bar=self.params.beats_per_bar,
+                precise_timing=True  # Use new precise mode
             )
             self.state.context_tokens = context_tokens
+            print(f"Context setup: {context_tokens.shape[0]} frames, {self._frames_per_bar:.3f} frames/bar")
+            # Store original context for splice reseeding
             with self._lock:
                 if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
+                    self._original_context_tokens = np.copy(context_tokens)
         except Exception as e:
+            print(f"Failed to setup context from combined loop: {e}")
     def stop(self):
         self._stop_event.set()
     def update_knobs(self, *, guidance_weight=None, temperature=None, topk=None):
         with self._lock:
+            if guidance_weight is not None:
+                self.params.guidance_weight = float(guidance_weight)
+            if temperature is not None:
+                self.params.temperature = float(temperature)
+            if topk is not None:
+                self.params.topk = int(topk)
     def get_next_chunk(self) -> JamChunk | None:
         """Get the next sequential chunk (blocks/waits if not ready)"""
         target_index = self._last_delivered_index + 1
+        max_wait = 30.0
         start_time = time.time()
         while time.time() - start_time < max_wait and not self._stop_event.is_set():
             with self._lock:
                 for chunk in self.outbox:
                     if chunk.index == target_index:
                         self._last_delivered_index = target_index
+                        print(f"Delivered chunk {target_index} (bars {chunk.metadata.get('bar_range', 'unknown')})")
                         return chunk
             time.sleep(0.1)
         return None
     def mark_chunk_consumed(self, chunk_index: int):
         """Mark a chunk as consumed by the frontend"""
         with self._lock:
             self._last_delivered_index = max(self._last_delivered_index, chunk_index)
     def _should_generate_next_chunk(self) -> bool:
         """Tell whether generation may run further ahead of delivery.

         Returns ``True`` while the produced-chunk counter ``idx`` is no more
         than ``_max_buffer_ahead`` past the last delivered index.
         """
         with self._lock:
             furthest_allowed = self._last_delivered_index + self._max_buffer_ahead
             produced = self.idx
             return produced <= furthest_allowed
def _get_precise_chunk_samples(self, bars: float) -> int:
    """Convert a (possibly fractional) bar count to samples at the model rate.

    The duration in seconds is ``bars * _seconds_per_bar``; it is scaled by
    ``_model_sr`` and rounded to the nearest integer sample count.
    """
    duration_seconds = bars * self._seconds_per_bar
    sample_count = duration_seconds * self._model_sr
    return int(round(sample_count))
     def _append_model_chunk_to_stream(self, wav):
         """Append one model output chunk to the continuous stream.

         The first chunk after the stream is (re)created has its leading
         crossfade-length region dropped. Later chunks are joined to the stream
         with an equal-power (sin/cos) crossfade over that same length. When no
         crossfade is possible (zero crossfade length, or either side shorter
         than the crossfade) the chunk is simply concatenated.

         Args:
             wav: Object exposing ``samples`` as a 1-D or 2-D array; 1-D input
                 is promoted to a single-column 2-D array.

         Side effects:
             Updates ``self._stream`` and sets ``self._stream_write_pos`` to the
             new stream length.
         """
         sr = self._model_sr
         xfade_s = float(self.mrt.config.crossfade_length)
         # Clamp so a zero/negative configured length can never produce a
         # negative slice size.
         xfade_n = max(0, int(round(xfade_s * sr)))
         s = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]

         if self._stream is None:
             # First chunk: drop model pre-roll
             if s.shape[0] > xfade_n:
                 self._stream = s[xfade_n:].astype(np.float32, copy=True)
             else:
                 self._stream = np.zeros((0, s.shape[1]), dtype=np.float32)
         elif xfade_n == 0 or s.shape[0] <= xfade_n or self._stream.shape[0] < xfade_n:
             # No usable overlap. The xfade_n == 0 case must be handled here:
             # `self._stream[-0:]` would select the whole stream, not an empty
             # tail, and the blend below would fail to broadcast.
             self._stream = np.concatenate([self._stream, s], axis=0)
         else:
             # Equal-power crossfade between the stream tail and the chunk head
             tail = self._stream[-xfade_n:]
             head = s[:xfade_n]
             t = np.linspace(0, np.pi / 2, xfade_n, endpoint=False, dtype=np.float32)[:, None]
             fade_in, fade_out = np.sin(t), np.cos(t)
             mixed = tail * fade_out + head * fade_in
             self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)

         self._stream_write_pos = self._stream.shape[0]
+    def _extract_precise_chunk(self, start_bars: float, chunk_bars: float) -> np.ndarray:
+        """Extract exactly chunk_bars worth of audio starting at start_bars"""
+        start_samples = self._get_precise_chunk_samples(start_bars)
+        chunk_samples = self._get_precise_chunk_samples(chunk_bars)
+        end_samples = start_samples + chunk_samples
+        if end_samples > self._stream.shape[0]:
+            return None  # Not enough audio generated yet
+        return self._stream[start_samples:end_samples]
def _perform_onset_alignment(self, ref_loop: au.Waveform) -> float:
    """Estimate how far the generated stream lags the reference loop.

    The smoothed amplitude envelope of the first ~2 seconds of the stream is
    compared with the envelope of the reference loop's last bar (the whole
    loop when it is shorter than one bar). Lags from 0 up to 200 ms are tried
    in 4-sample steps and the lag with the highest correlation coefficient
    is kept.

    Returns:
        The estimated offset in seconds (never negative). ``0.0`` is returned
        when less than one second of stream exists or when anything raises.
    """
    sr = self._model_sr
    if self._stream is None or self._stream.shape[0] < sr:
        return 0.0

    def smoothed_envelope(signal, rate):
        # Mono mix-down, rectify, then a 10 ms moving average.
        mono = signal.mean(axis=1) if signal.ndim == 2 else signal
        rectified = np.abs(mono).astype(np.float32)
        width = max(1, int(0.01 * rate))
        if width <= 1:
            return rectified
        return np.convolve(rectified, np.ones(width) / width, mode='same')

    def standardize(values):
        # Zero mean / unit variance; the epsilon guards a silent signal.
        return (values - values.mean()) / (values.std() + 1e-8)

    try:
        # Head of the generated audio (about two seconds).
        head_len = min(int(2.0 * sr), self._stream.shape[0])
        generated = au.Waveform(
            self._stream[:head_len].astype(np.float32, copy=False),
            sr
        ).as_stereo()

        # Tail of the reference: its last bar, resampled to the model rate.
        bar_len = int(self._seconds_per_bar * ref_loop.sample_rate)
        if ref_loop.samples.shape[0] < bar_len:
            reference = ref_loop.resample(sr).as_stereo()
        else:
            reference = au.Waveform(
                ref_loop.samples[-bar_len:],
                ref_loop.sample_rate
            ).resample(sr).as_stereo()

        env_ref = smoothed_envelope(reference.samples, sr)
        env_gen = smoothed_envelope(generated.samples, sr)

        max_lag = int(0.2 * sr)  # search no further than 200 ms

        env_ref = standardize(env_ref)
        env_gen = standardize(env_gen)

        best_lag = 0
        best_score = -1.0
        window = min(len(env_ref), len(env_gen) - max_lag)
        if window > 0:
            ref_segment = env_ref[:window]
            for lag in range(0, max_lag, 4):  # coarse step keeps the search cheap
                if lag + window >= len(env_gen):
                    break
                score = np.corrcoef(ref_segment, env_gen[lag:lag + window])[0, 1]
                if np.isnan(score) or score <= best_score:
                    continue
                best_score = score
                best_lag = lag

        lag_seconds = best_lag / sr
        print(f"Onset alignment: {lag_seconds:.3f}s offset (correlation: {best_score:.3f})")
        return lag_seconds
    except Exception as e:
        print(f"Onset alignment failed: {e}")
        return 0.0
def _align_to_bar_boundary(self):
    """Advance the timing state to the next whole-bar position.

    When the emit position is more than 1e-6 bars short of the next integer
    bar, the stream position is moved forward by the matching sample count
    and the emit position is set to that bar. Otherwise nothing changes.
    """
    position = self._timing.emit_position_bars
    boundary = math.ceil(position)
    gap_bars = boundary - position
    if abs(gap_bars) <= 1e-6:
        return  # already on a bar line

    gap_samples = self._get_precise_chunk_samples(gap_bars)
    self._timing.stream_position_samples += gap_samples
    self._timing.emit_position_bars = boundary
    print(f"Aligned to bar {boundary:.0f}, skipped {gap_bars:.4f} bars")
     def reseed_from_waveform(self, wav):
         """Replace the whole model context with one built from ``wav``.

         A fresh model state is created, a tail of ``wav`` sized to the context
         window is taken via ``take_bar_aligned_tail`` and encoded with the
         codec, and only the first ``decoder_codec_rvq_depth`` token columns are
         passed to ``make_bar_aligned_context``. The output stream and timing
         state are then reset, and a bar realignment against ``wav`` is
         requested.
         """
         fresh_state = self.mrt.init_state()

         # Context window length in seconds at the codec frame rate.
         cfg = self.mrt.config
         fps = self._codec_fps
         window_seconds = float(cfg.context_length_frames) / fps

         aligned_tail = take_bar_aligned_tail(
             wav, self.params.bpm, self.params.beats_per_bar, window_seconds
         )
         encoded = self.mrt.codec.encode(aligned_tail).astype(np.int32)
         decoder_tokens = encoded[:, :self.mrt.config.decoder_codec_rvq_depth]

         fresh_state.context_tokens = make_bar_aligned_context(
             decoder_tokens,
             bpm=self.params.bpm,
             fps=self._codec_fps,
             ctx_frames=self.mrt.config.context_length_frames,
             beats_per_bar=self.params.beats_per_bar,
             precise_timing=True
         )
         self.state = fresh_state

         # Start a brand-new output stream with fresh timing.
         self._stream, self._stream_write_pos = None, 0
         self._timing = TimingState(
             frames_per_bar=self._frames_per_bar,
             samples_per_bar=self._samples_per_bar_model
         )
         self._needs_bar_realign = True
         self._reseed_ref_loop = wav
     def reseed_splice(self, recent_wav, anchor_bars: float):
         """Reseed by splicing tokens of ``recent_wav`` with the original context.

         Under the lock: the original context is captured on first use, a new
         context is built with ``_splice_context``, the result is queued in
         ``_pending_reseed`` and also installed immediately as a fresh model
         state, and the stream and timing state are reset with a bar
         realignment requested.
         """
         with self._lock:
             # Capture the context to splice against the first time through.
             if getattr(self, "_original_context_tokens", None) is None:
                 self._original_context_tokens = np.copy(self.state.context_tokens)

             tokens_from_recent = self._make_recent_tokens_from_wave(recent_wav)
             spliced_ctx = self._splice_context(
                 self._original_context_tokens, tokens_from_recent, anchor_bars
             )
             self._pending_reseed = {"ctx": spliced_ctx, "ref": recent_wav}

             # NOTE(review): the context is installed here *and* left queued in
             # _pending_reseed, which run() applies again after publishing its
             # next chunk -- confirm the second application is intended.
             fresh_state = self.mrt.init_state()
             fresh_state.context_tokens = spliced_ctx
             self.state = fresh_state

             # Start a brand-new output stream with fresh timing.
             self._stream, self._stream_write_pos = None, 0
             self._timing = TimingState(
                 frames_per_bar=self._frames_per_bar,
                 samples_per_bar=self._samples_per_bar_model
             )
             self._needs_bar_realign = True
def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
    """Encode ``wav`` and shape the result into a context-sized token window.

    The codec output is cast to int32, only the first
    ``decoder_codec_rvq_depth`` token columns are kept, and the result is
    passed through ``make_bar_aligned_context`` with ``precise_timing=True``.
    """
    encoded = self.mrt.codec.encode(wav).astype(np.int32)
    decoder_tokens = encoded[:, :self.mrt.config.decoder_codec_rvq_depth]
    return make_bar_aligned_context(
        decoder_tokens,
        bpm=self.params.bpm,
        fps=self._codec_fps,
        ctx_frames=self.mrt.config.context_length_frames,
        beats_per_bar=self.params.beats_per_bar,
        precise_timing=True
    )
def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray, anchor_bars: float) -> np.ndarray:
    """Build a context window from recent tokens followed by an anchor of original tokens.

    The anchor is the last ``anchor_bars`` worth of frames from
    ``original_tokens`` and is placed at the end of the window; the frames
    before it are filled with the tail of ``recent_tokens``.

    Args:
        original_tokens: Token array, frames along axis 0.
        recent_tokens: Token array, frames along axis 0.
        anchor_bars: Anchor length in bars (may be fractional). Values at or
            below zero yield no anchor; values longer than ``original_tokens``
            use all of it.

    Returns:
        An array of ``context_length_frames`` frames. Shorter material is
        tiled to fill the window; if both inputs are empty the result is
        empty.
    """
    ctx_frames = int(self.mrt.config.context_length_frames)

    # Anchor length in codec frames, clamped to what the original can supply.
    n_original = original_tokens.shape[0]
    anchor_frames = int(round(anchor_bars * self._frames_per_bar))
    anchor_frames = min(max(anchor_frames, 0), n_original)
    # Slice with an explicit start index: `tokens[-0:]` would return the whole
    # array instead of an empty anchor.
    anchor = original_tokens[n_original - anchor_frames:]

    # Fill the rest of the window from the end of the recent tokens.
    n_recent = recent_tokens.shape[0]
    fill_frames = min(max(ctx_frames - anchor.shape[0], 0), n_recent)
    recent = recent_tokens[n_recent - fill_frames:]

    if anchor.size > 0 and recent.size > 0:
        spliced = np.concatenate([recent, anchor], axis=0)
    elif anchor.size > 0:
        spliced = anchor
    else:
        spliced = recent

    # Ensure exact length
    if spliced.shape[0] > ctx_frames:
        spliced = spliced[spliced.shape[0] - ctx_frames:]
    elif spliced.shape[0] < ctx_frames:
        # Tile to fill, keeping the newest frames at the end.
        reps = int(np.ceil(ctx_frames / max(1, spliced.shape[0])))
        spliced = np.tile(spliced, (reps, 1))[-ctx_frames:]
    return spliced
     def run(self):
+        """Main generation loop with precise timing"""
         # Thread body. Each pass of the outer loop: wait until delivery has
         # caught up enough, generate model audio until the stream holds a full
         # chunk, optionally align timing, slice the chunk out, post-process and
         # resample it, encode it, publish it to the outbox, and finally apply
         # any reseed package left in `_pending_reseed`.
         # NOTE(review): this excerpt comes from a diff view that omits some
         # lines (see the unterminated call in the loudness step below).
+        chunk_bars = float(self.params.bars_per_chunk)
+        chunk_samples = self._get_precise_chunk_samples(chunk_bars)
+        xfade_s = float(self.mrt.config.crossfade_length)
+        def _samples_needed(first_chunk_extra=False):
+            """Calculate samples needed in stream for next emission"""
             # Audio already in the stream past `stream_position_samples`
             # counts as available; a missing stream counts as nothing.
+            available = 0 if self._stream is None else (
+                self._stream.shape[0] - self._timing.stream_position_samples
+            )
+            required = chunk_samples
             if first_chunk_extra:
+                # Extra material for onset alignment
                 # (two additional bars' worth of samples)
+                extra_samples = self._get_precise_chunk_samples(2.0)
+                required += extra_samples
+            return max(0, required - available)
+        print(f"JamWorker started: {self.params.bpm} BPM, {self._frames_per_bar:.3f} frames/bar, {chunk_bars} bars/chunk")
         while not self._stop_event.is_set():
             # Back-pressure: idle while generation is far enough ahead of
             # what has been delivered.
             if not self._should_generate_next_chunk():
                 time.sleep(0.25)
                 continue
+            # 1) Generate until we have enough audio
+            needed = _samples_needed(first_chunk_extra=(self.idx == 0))
+            while needed > 0 and not self._stop_event.is_set():
                 # Knob values are copied onto the model under the lock; the
                 # generation call itself runs outside it.
                 with self._lock:
                     style_vec = self.params.style_vec
                     self.mrt.guidance_weight = float(self.params.guidance_weight)
+                    self.mrt.temperature = float(self.params.temperature)
+                    self.mrt.topk = int(self.params.topk)
                 wav, self.state = self.mrt.generate_chunk(state=self.state, style=style_vec)
+                self._append_model_chunk_to_stream(wav)
+                needed = _samples_needed(first_chunk_extra=(self.idx == 0))
             if self._stop_event.is_set():
                 break
+            # 2) First chunk: perform onset alignment
             # Also runs whenever a reseed has set `_needs_bar_realign`.
             if (self.idx == 0 and self.params.combined_loop is not None) or self._needs_bar_realign:
                 ref_loop = self._reseed_ref_loop or self.params.combined_loop
                 if ref_loop is not None:
+                    offset_seconds = self._perform_onset_alignment(ref_loop)
+                    if abs(offset_seconds) > 0.01:  # More than 10ms
+                        offset_samples = int(round(offset_seconds * self._model_sr))
                         # NOTE(review): the offset is written to
                         # `stream_position_samples`, but the slice in step 3 is
                         # located from `emit_position_bars` -- confirm the
                         # offset actually shifts the extracted audio.
+                        self._timing.stream_position_samples = max(0, offset_samples)
+                        print(f"Applied onset offset: {offset_seconds:.3f}s")
+                self._align_to_bar_boundary()
                 # One-shot: clear the request and its reference loop.
                 self._needs_bar_realign = False
                 self._reseed_ref_loop = None
+            # 3) Extract precise chunk
+            chunk_start_bars = self._timing.emit_position_bars
+            slice_audio = self._extract_precise_chunk(chunk_start_bars, chunk_bars)
+            if slice_audio is None:
+                continue  # Need more generation
+            # Update timing state
+            correction = self._timing.advance_by_bars(chunk_bars)
+            if correction != 0:
+                print(f"Applied {correction} bar timing correction")
+            self._timing.stream_position_samples += chunk_samples
+            # 4) Create waveform and process
+            y = au.Waveform(slice_audio.astype(np.float32, copy=False), self._model_sr).as_stereo()
+            # Loudness matching and fades
             # NOTE(review): the diff view elides lines here; the
             # match_loudness_to_reference( call below is not closed before
             # `else:` in this excerpt.
             if self.idx == 0 and self.params.ref_loop is not None:
                 y, _ = match_loudness_to_reference(
                     self.params.ref_loop, y,
             else:
                 apply_micro_fades(y, 3)
+            # 5) Sample rate conversion
+            if self._resampler is not None:
+                # Use streaming resampler for precise conversion
+                resampled = self._resampler.process(y.samples, final=False)
+                # Ensure exact target length
                 # Short output is zero-padded at the end; long output is
                 # truncated.
+                target_samples = int(round(chunk_bars * self._samples_per_bar_target))
+                if resampled.shape[0] != target_samples:
+                    if resampled.shape[0] < target_samples:
+                        pad_samples = target_samples - resampled.shape[0]
+                        pad = np.zeros((pad_samples, resampled.shape[1]), dtype=resampled.dtype)
+                        resampled = np.vstack([resampled, pad])
+                    else:
+                        resampled = resampled[:target_samples]
+                final_audio = resampled
+                final_sr = self._target_sr
+            else:
+                # No resampling needed
+                final_audio = y.samples
+                final_sr = self._model_sr
+            # 6) Encode to base64
+            b64, total_samples, channels = wav_bytes_base64(final_audio, final_sr)
+            # 7) Create metadata with timing info
+            actual_duration = total_samples / final_sr
+            bar_range = f"{chunk_start_bars:.2f}-{self._timing.emit_position_bars:.2f}"
+            meta = {
+                "bpm": int(round(self.params.bpm)),
+                "bars": int(self.params.bars_per_chunk),
+                "beats_per_bar": int(self.params.beats_per_bar),
+                "sample_rate": int(final_sr),
+                "channels": int(channels),
+                "total_samples": int(total_samples),
+                "seconds_per_bar": self._seconds_per_bar,
+                "loop_duration_seconds": actual_duration,
+                "bar_range": bar_range,
+                "timing_state": {
+                    "emit_position_bars": self._timing.emit_position_bars,
+                    "frames_per_bar": self._frames_per_bar,
+                    "fractional_error": self._timing.fractional_error_bars,
+                },
+                "xfade_seconds": xfade_s,
+                "guidance_weight": self.params.guidance_weight,
+                "temperature": self.params.temperature,
+                "topk": self.params.topk,
+            }
+            # 8) Publish chunk
             with self._lock:
                 # The chunk index is the value of `idx` after this increment.
                 self.idx += 1
+                chunk = JamChunk(index=self.idx, audio_base64=b64, metadata=meta)
+                self.outbox.append(chunk)
+                # Cleanup old chunks
                 # Once more than 10 are queued, keep only chunks newer than
                 # five behind the last delivered index.
                 if len(self.outbox) > 10:
                     cutoff = self._last_delivered_index - 5
                     self.outbox = [ch for ch in self.outbox if ch.index > cutoff]
+                # Handle pending reseeds
                 # A package queued by reseed_splice() is installed here, after
                 # the chunk has been published.
                 if self._pending_reseed is not None:
                     pkg = self._pending_reseed
                     self._pending_reseed = None
                     new_state = self.mrt.init_state()
+                    new_state.context_tokens = pkg["ctx"]
                     self.state = new_state
+                    # Reset timing and stream
                     self._stream = None
+                    self._stream_write_pos = 0
+                    self._timing = TimingState(
+                        frames_per_bar=self._frames_per_bar,
+                        samples_per_bar=self._samples_per_bar_model
+                    )
+                    self._reseed_ref_loop = pkg.get("ref")
                     self._needs_bar_realign = True
+                    print("Reseed applied at bar boundary")
+            drift_ms = abs(self._timing.fractional_error_bars) * self._seconds_per_bar * 1000
+            print(f"Completed chunk {self.idx} ({bar_range} bars, {drift_ms:.1f}ms drift)")
+        print("JamWorker stopped")
+        # Clean up resampler
         # NOTE(review): the bare `except:` below also swallows
         # KeyboardInterrupt/SystemExit; consider `except Exception:`.
+        if self._resampler is not None:
+            try:
+                self._resampler.flush()
+            except:
+                pass

utils.py CHANGED Viewed

@@ -109,55 +109,81 @@ def apply_micro_fades(wav: au.Waveform, ms: int = 5) -> None:
 # ---------- Token context helpers ----------
-def make_bar_aligned_context(tokens, bpm, fps=25.0, ctx_frames=250, beats_per_bar=4):
     """
     Return a ctx_frames-long slice of `tokens` whose **end** lands on the nearest
-    whole-bar boundary in codec-frame space, even when frames_per_bar is fractional.
-    tokens: np.ndarray of shape (T, D) or (T,) where T = codec frames
-    bpm: float
-    fps: float (codec frames per second; keep this as float)
-    ctx_frames: int (length of context window in codec frames)
-    beats_per_bar: int
     """
     if tokens is None:
         raise ValueError("tokens is None")
     tokens = np.asarray(tokens)
     if tokens.ndim == 1:
-        tokens = tokens[:, None]  # promote to (T, 1) for uniform tiling
     T = tokens.shape[0]
     if T == 0:
         return tokens
     fps = float(fps)
-    frames_per_bar_f = (beats_per_bar * 60.0 / float(bpm)) * fps  # float frames per bar
-    # Tile a little more than we need so we can always snap the END to a bar boundary
-    reps = int(np.ceil((ctx_frames + T) / float(T))) + 1
-    tiled = np.tile(tokens, (reps, 1))
-    total = tiled.shape[0]
-    # How many whole bars fit?
-    k_bars = int(np.floor(total / frames_per_bar_f))
-    if k_bars <= 0:
-        # Fallback: just take the last ctx_frames
-        window = tiled[-ctx_frames:]
-        return window
-    # Snap END index to the nearest integer frame at a whole-bar boundary
-    end_idx = int(round(k_bars * frames_per_bar_f))
-    end_idx = min(max(end_idx, ctx_frames), total)
-    start_idx = end_idx - ctx_frames
-    if start_idx < 0:
-        start_idx = 0
-        end_idx = ctx_frames
-    window = tiled[start_idx:end_idx]
-    # Guard against rare off-by-one due to rounding
     if window.shape[0] < ctx_frames:
         pad = np.tile(tokens, (int(np.ceil((ctx_frames - window.shape[0]) / T)), 1))
         window = np.vstack([window, pad])[:ctx_frames]

 # ---------- Token context helpers ----------
+def make_bar_aligned_context(tokens, bpm, fps=25.0, ctx_frames=250, beats_per_bar=4, precise_timing=False):
     """
     Return a ctx_frames-long slice of `tokens` whose **end** lands on the nearest
+    whole-bar boundary in codec-frame space.
+    NEW: precise_timing mode handles fractional frames per bar more carefully.
     """
     if tokens is None:
         raise ValueError("tokens is None")
     tokens = np.asarray(tokens)
     if tokens.ndim == 1:
+        tokens = tokens[:, None]
     T = tokens.shape[0]
     if T == 0:
         return tokens
     fps = float(fps)
+    frames_per_bar_f = (beats_per_bar * 60.0 / float(bpm)) * fps
+    if precise_timing and abs(frames_per_bar_f - round(frames_per_bar_f)) > 1e-6:
+        # We have fractional frames per bar - use a different strategy
+        # Instead of trying to align to exact bar boundaries, align to the closest
+        # multiple of frames_per_bar_f that gives us integer frame positions
+        # Tile enough to work with
+        reps = max(2, int(np.ceil((ctx_frames + T) / float(T))))
+        tiled = np.tile(tokens, (reps, 1))
+        total = tiled.shape[0]
+        # Find the best integer end position that's close to a bar boundary
+        best_end = ctx_frames
+        best_error = float('inf')
+        # Check positions around the naive ctx_frames endpoint
+        for candidate_end in range(max(ctx_frames - 50, ctx_frames), min(total, ctx_frames + 50)):
+            # How many fractional bars does this represent?
+            fractional_bars = candidate_end / frames_per_bar_f
+            # How far from an integer number of bars?
+            bar_error = abs(fractional_bars - round(fractional_bars))
+            if bar_error < best_error:
+                best_error = bar_error
+                best_end = candidate_end
+        end_idx = best_end
+        start_idx = max(0, end_idx - ctx_frames)
+        window = tiled[start_idx:end_idx]
+        # Report timing info for debugging
+        actual_bars = end_idx / frames_per_bar_f
+        print(f"Context aligned to {actual_bars:.3f} bars (error: {best_error:.4f})")
+    else:
+        # Original logic for integer frames per bar
+        reps = int(np.ceil((ctx_frames + T) / float(T))) + 1
+        tiled = np.tile(tokens, (reps, 1))
+        total = tiled.shape[0]
+        k_bars = int(np.floor(total / frames_per_bar_f))
+        if k_bars <= 0:
+            window = tiled[-ctx_frames:]
+            return window
+        end_idx = int(round(k_bars * frames_per_bar_f))
+        end_idx = min(max(end_idx, ctx_frames), total)
+        start_idx = end_idx - ctx_frames
+        if start_idx < 0:
+            start_idx = 0
+            end_idx = ctx_frames
+        window = tiled[start_idx:end_idx]
+    # Ensure exact length
     if window.shape[0] < ctx_frames:
         pad = np.tile(tokens, (int(np.ceil((ctx_frames - window.shape[0]) / T)), 1))
         window = np.vstack([window, pad])[:ctx_frames]