piotrzelasko committed on
Commit
ea54579
·
1 Parent(s): 954cfbc

Reduce peak GPU memory use during model init

Browse files

Signed-off-by: Piotr Żelasko <[email protected]>

Files changed (1) hide show
  1. app.py +2 -4
app.py CHANGED
@@ -16,10 +16,8 @@ MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
16
  CHUNK_SECONDS = 40.0 # max audio length seen by the model
17
  BATCH_SIZE = 8 # for parallel transcription of audio longer than CHUNK_SECONDS
18
 
19
- with device:
20
- torch.set_default_dtype(torch.bfloat16) # speed up start-up time
21
- model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)
22
- torch.set_default_dtype(torch.float32)
23
 
24
 
25
  def timestamp(idx: int):
 
16
  CHUNK_SECONDS = 40.0 # max audio length seen by the model
17
  BATCH_SIZE = 8 # for parallel transcription of audio longer than CHUNK_SECONDS
18
 
19
+
20
+ model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)
 
 
21
 
22
 
23
  def timestamp(idx: int):