Spaces:

thecollabagepatch
/

magenta

Running

App Files Files Community

thecollabagepatch committed 5 days ago

Commit

f8b3793

1 Parent(s): 8cf69d0

ofc sample rates

Browse files

Files changed (1) hide show

app.py +57 -32

app.py CHANGED Viewed

@@ -4,6 +4,10 @@ from fastapi import FastAPI, UploadFile, File, Form
 import tempfile, io, base64, math, threading
 from fastapi.middleware.cors import CORSMiddleware
 from contextlib import contextmanager
 @contextmanager
 def mrt_overrides(mrt, **kwargs):
@@ -257,10 +261,10 @@ def generate(
     loop_weight: float = Form(1.0),
     loudness_mode: str = Form("auto"),
     loudness_headroom_db: float = Form(1.0),
-    # NEW per-request knobs
     guidance_weight: float = Form(5.0),
     temperature: float = Form(1.1),
     topk: int = Form(40),
 ):
     # Read file
     data = loop_audio.file.read()
@@ -293,45 +297,66 @@ def generate(
             loudness_headroom_db=loudness_headroom_db,
         )
-    total_samples = int(wav.samples.shape[0])
-    sample_rate = int(get_mrt().sample_rate)  # or mrt.sample_rate (same instance here)
-    loop_duration_seconds = total_samples / float(sample_rate)
-    # Also include the bar math (useful for sanity checks downstream)
     seconds_per_bar = (60.0 / float(bpm)) * int(beats_per_bar)
-    # Return base64 WAV + minimal metadata
     buf = io.BytesIO()
-    # add format="WAV" when writing to a file-like object
-    wav.write(buf, subtype="FLOAT", format="WAV")
     buf.seek(0)
     audio_b64 = base64.b64encode(buf.read()).decode("utf-8")
-    return {
-        "audio_base64": audio_b64,
-        "metadata": {
-            "bpm": int(round(bpm)),
-            "bars": int(bars),
-            "beats_per_bar": int(beats_per_bar),
-            "styles": extra_styles,
-            "style_weights": weights,
-            "loop_weight": loop_weight,
-            "loudness": loud_stats,
-            "sample_rate": sample_rate,
-            "channels": mrt.num_channels,
-            "crossfade_seconds": mrt.config.crossfade_length,
-            # New timing fields
-            "total_samples": total_samples,
-            "seconds_per_bar": seconds_per_bar,
-            "loop_duration_seconds": loop_duration_seconds,
-            # Echo the actual knobs used
-            "guidance_weight": guidance_weight,
-            "temperature": temperature,
-            "topk": topk,
-        },
     }
 @app.get("/health")
 def health():

 import tempfile, io, base64, math, threading
 from fastapi.middleware.cors import CORSMiddleware
 from contextlib import contextmanager
+import soundfile as sf
+import numpy as np
+from math import gcd
+from scipy.signal import resample_poly
 @contextmanager
 def mrt_overrides(mrt, **kwargs):
     loop_weight: float = Form(1.0),
     loudness_mode: str = Form("auto"),
     loudness_headroom_db: float = Form(1.0),
     guidance_weight: float = Form(5.0),
     temperature: float = Form(1.1),
     topk: int = Form(40),
+    target_sample_rate: int | None = Form(None),  # <-- add this
 ):
     # Read file
     data = loop_audio.file.read()
             loudness_headroom_db=loudness_headroom_db,
         )
+    # 1) Figure out the desired SR
+    inp_info = sf.info(tmp_path)
+    input_sr = int(inp_info.samplerate)
+    target_sr = int(target_sample_rate or input_sr)
+    # 2) Convert magenta output to target_sr if needed
+    # wav.samples: shape [num_samples, num_channels], float32/-1..1 (per your code)
+    cur_sr = int(mrt.sample_rate)
+    x = wav.samples  # np.ndarray (S, C)
+    if cur_sr != target_sr:
+        g = gcd(cur_sr, target_sr)
+        up, down = target_sr // g, cur_sr // g
+        # ensure 2D shape (S, C)
+        x = wav.samples
+        if x.ndim == 1:
+            x = x[:, None]
+        y = np.column_stack([resample_poly(x[:, ch], up, down) for ch in range(x.shape[1])])
+    else:
+        y = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
+    # 3) Snap to exact frame count for loop-perfect length
     seconds_per_bar = (60.0 / float(bpm)) * int(beats_per_bar)
+    expected_len = int(round(float(bars) * seconds_per_bar * target_sr))
+    if y.shape[0] < expected_len:
+        pad = np.zeros((expected_len - y.shape[0], y.shape[1]), dtype=y.dtype)
+        y = np.vstack([y, pad])
+    elif y.shape[0] > expected_len:
+        y = y[:expected_len, :]
+    total_samples = int(y.shape[0])
+    loop_duration_seconds = total_samples / float(target_sr)
+    # 4) Write y into buf as WAV @ target_sr
     buf = io.BytesIO()
+    sf.write(buf, y, target_sr, subtype="FLOAT", format="WAV")
     buf.seek(0)
     audio_b64 = base64.b64encode(buf.read()).decode("utf-8")
+    # 5) Update metadata to be authoritative
+    metadata = {
+        "bpm": int(round(bpm)),
+        "bars": int(bars),
+        "beats_per_bar": int(beats_per_bar),
+        "styles": extra_styles,
+        "style_weights": weights,
+        "loop_weight": loop_weight,
+        "loudness": loud_stats,
+        "sample_rate": int(target_sr),
+        "channels": int(y.shape[1]),
+        "crossfade_seconds": mrt.config.crossfade_length,
+        "total_samples": total_samples,
+        "seconds_per_bar": seconds_per_bar,
+        "loop_duration_seconds": loop_duration_seconds,
+        "guidance_weight": guidance_weight,
+        "temperature": temperature,
+        "topk": topk,
     }
+    return {"audio_base64": audio_b64, "metadata": metadata}
 @app.get("/health")
 def health():