canary-qwen-2.5b

Running

App Files Files Community

piotrzelasko committed on Jul 3

Commit

1a71365

1 Parent(s): ea54579

Add LLM capabilities in the demo

Browse files

Signed-off-by: Piotr Żelasko <[email protected]>

Files changed (1) hide show

app.py +67 -10

app.py CHANGED Viewed

@@ -45,6 +45,7 @@ def transcribe(audio_filepath):
         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
     utt_id = uuid.uuid4()
     pred_text = []
     chunk_idx = 0
     for batch in as_batches(audio_filepath, str(utt_id)):
         audio, audio_lens = batch.load_audio(collate=True)
@@ -57,16 +58,33 @@ def transcribe(audio_filepath):
             )
         texts = [model.tokenizer.ids_to_text(oids) for oids in output_ids.cpu()]
         for t in texts:
-            pred_text.append(f"{timestamp(chunk_idx)} {t}\n\n")
             chunk_idx += 1
-    return ' '.join(pred_text)
 with gr.Blocks(
     title="NeMo Canary-Qwen-2.5B Model",
     css="""
         textarea { font-size: 18px;}
-        #model_output_text_box span {
             font-size: 18px;
             font-weight: bold;
         }
@@ -89,17 +107,50 @@ with gr.Blocks(
         with gr.Column():
-            gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
-            go_button = gr.Button(
                 value="Run model",
                 variant="primary", # make "primary" so it stands out (default is "secondary")
             )
-            model_output_text_box = gr.Textbox(
-                label="Model Output",
-                elem_id="model_output_text_box",
             )
     with gr.Row():
@@ -110,10 +161,16 @@ with gr.Blocks(
             "</p>"
         )
-    go_button.click(
         fn=transcribe,
         inputs=[audio_file],
-        outputs=[model_output_text_box]
     )

         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
     utt_id = uuid.uuid4()
     pred_text = []
+    pred_text_ts = []
     chunk_idx = 0
     for batch in as_batches(audio_filepath, str(utt_id)):
         audio, audio_lens = batch.load_audio(collate=True)
             )
         texts = [model.tokenizer.ids_to_text(oids) for oids in output_ids.cpu()]
         for t in texts:
+            pred_text.append(t)
+            pred_text_ts.append(f"{timestamp(chunk_idx)} {t}\n\n")
             chunk_idx += 1
+    return ''.join(pred_text_ts), ' '.join(pred_text)
+def postprocess(transcript, prompt):
+    """Apply a user prompt to a transcript with the LLM and split out its reasoning.
+
+    The prompt and the transcript are joined (separated by a blank line) into a
+    single user turn and passed to ``model.generate`` inside
+    ``torch.inference_mode()`` and ``model.llm.disable_adapter()``.
+
+    Args:
+        transcript: Transcript text appended after the prompt.
+        prompt: Instruction text placed before the transcript.
+
+    Returns:
+        A ``(answer, thoughts)`` tuple. ``answer`` is the whitespace-stripped
+        text after the ``</think>`` tag (or the whole assistant turn when no
+        ``<think>`` tag is present). ``thoughts`` is the text between
+        ``<think>`` and ``</think>``, or ``""`` when there is no ``<think>`` tag.
+    """
+    with torch.inference_mode(), model.llm.disable_adapter():
+        output_ids = model.generate(
+            prompts=[[{"role": "user", "content": f"{prompt}\n\n{transcript}"}]],
+            max_new_tokens=2048,
+        )
+    ans = model.tokenizer.ids_to_text(output_ids[0].cpu())
+    ans = ans.split("<|im_start|>assistant")[-1]  # get rid of the prompt
+    if "<think>" in ans:
+        ans = ans.split("<think>")[-1]
+        # Separate the thinking from the answer. partition() is used rather than
+        # unpacking split("</think>")[-1]: that expression is a single string,
+        # so unpacking it into two names iterates its characters and raises
+        # ValueError. partition() also tolerates a missing closing tag (e.g.
+        # generation stopped at max_new_tokens): everything becomes thoughts
+        # and the answer is empty.
+        thoughts, _, ans = ans.partition("</think>")
+    else:
+        thoughts = ""
+    return ans.strip(), thoughts
 with gr.Blocks(
     title="NeMo Canary-Qwen-2.5B Model",
     css="""
         textarea { font-size: 18px;}
+        #transcript_box span {
             font-size: 18px;
             font-weight: bold;
         }
         with gr.Column():
+            gr.HTML("<p><b>Step 2:</b> Transcribe the audio.</p>")
+            asr_button = gr.Button(
                 value="Run model",
                 variant="primary", # make "primary" so it stands out (default is "secondary")
             )
+            transcript_box = gr.Textbox(
+                label="Model Transcript",
+                elem_id="transcript_box",
             )
+            raw_transcript = gr.State()
+    with gr.Row():
+        with gr.Column():
+            gr.HTML("<p><b>Step 3:</b> Prompt the model.</p>")
+            prompt_box = gr.Textbox(
+                "Summarize the following:",
+                label="Prompt",
+                elem_id="prompt_box",
+            )
+        with gr.Column():
+            gr.HTML("<p><b>Step 4:</b> See the outcome!</p>")
+            llm_button = gr.Button(
+                value="Apply the prompt",
+                variant="primary", # make "primary" so it stands out (default is "secondary")
+            )
+            think_box = gr.Textbox(
+                label="Assistant's Thinking",
+                elem_id="think_box",
+            )
+            magic_box = gr.Textbox(
+                label="Assistant's Response",
+                elem_id="magic_box",
+            )
     with gr.Row():
             "</p>"
         )
+    asr_button.click(
         fn=transcribe,
         inputs=[audio_file],
+        outputs=[transcript_box, raw_transcript]
+    )
+    llm_button.click(
+        fn=postprocess,
+        inputs=[raw_transcript, prompt_box],
+        outputs=[magic_box, think_box]
     )