piotrzelasko committed
Commit 1a71365 · 1 Parent(s): ea54579

Add LLM capabilities in the demo


Signed-off-by: Piotr Żelasko <[email protected]>

Files changed (1)
  1. app.py +67 -10
app.py CHANGED
@@ -45,6 +45,7 @@ def transcribe(audio_filepath):
         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
     utt_id = uuid.uuid4()
     pred_text = []
+    pred_text_ts = []
     chunk_idx = 0
     for batch in as_batches(audio_filepath, str(utt_id)):
         audio, audio_lens = batch.load_audio(collate=True)
@@ -57,16 +58,33 @@ def transcribe(audio_filepath):
         )
         texts = [model.tokenizer.ids_to_text(oids) for oids in output_ids.cpu()]
         for t in texts:
-            pred_text.append(f"{timestamp(chunk_idx)} {t}\n\n")
+            pred_text.append(t)
+            pred_text_ts.append(f"{timestamp(chunk_idx)} {t}\n\n")
             chunk_idx += 1
-    return ' '.join(pred_text)
+    return ''.join(pred_text_ts), ' '.join(pred_text)
+
+
+def postprocess(transcript, prompt):
+    with torch.inference_mode(), model.llm.disable_adapter():
+        output_ids = model.generate(
+            prompts=[[{"role": "user", "content": f"{prompt}\n\n{transcript}"}]],
+            max_new_tokens=2048,
+        )
+    ans = model.tokenizer.ids_to_text(output_ids[0].cpu())
+    ans = ans.split("<|im_start|>assistant")[-1]  # get rid of the prompt
+    if "<think>" in ans:
+        ans = ans.split("<think>")[-1]
+        thoughts, ans = ans.split("</think>", 1)  # get rid of the thinking
+    else:
+        thoughts = ""
+    return ans.strip(), thoughts


 with gr.Blocks(
     title="NeMo Canary-Qwen-2.5B Model",
     css="""
     textarea { font-size: 18px;}
-    #model_output_text_box span {
+    #transcript_box span {
         font-size: 18px;
         font-weight: bold;
     }
@@ -89,17 +107,50 @@ with gr.Blocks(

         with gr.Column():

-            gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
+            gr.HTML("<p><b>Step 2:</b> Transcribe the audio.</p>")

-            go_button = gr.Button(
+            asr_button = gr.Button(
                 value="Run model",
                 variant="primary",  # make "primary" so it stands out (default is "secondary")
             )

-            model_output_text_box = gr.Textbox(
-                label="Model Output",
-                elem_id="model_output_text_box",
+            transcript_box = gr.Textbox(
+                label="Model Transcript",
+                elem_id="transcript_box",
             )
+            raw_transcript = gr.State()
+
+    with gr.Row():
+
+        with gr.Column():
+
+            gr.HTML("<p><b>Step 3:</b> Prompt the model.</p>")
+
+            prompt_box = gr.Textbox(
+                "Summarize the following:",
+                label="Prompt",
+                elem_id="prompt_box",
+            )
+
+        with gr.Column():
+
+            gr.HTML("<p><b>Step 4:</b> See the outcome!</p>")
+
+            llm_button = gr.Button(
+                value="Apply the prompt",
+                variant="primary",  # make "primary" so it stands out (default is "secondary")
+            )
+
+            think_box = gr.Textbox(
+                label="Assistant's Thinking",
+                elem_id="think_box",
+            )
+
+            magic_box = gr.Textbox(
+                label="Assistant's Response",
+                elem_id="magic_box",
+            )
+

     with gr.Row():

@@ -110,10 +161,16 @@ with gr.Blocks(
             "</p>"
         )

-        go_button.click(
+        asr_button.click(
             fn=transcribe,
             inputs=[audio_file],
-            outputs=[model_output_text_box]
+            outputs=[transcript_box, raw_transcript]
+        )
+
+        llm_button.click(
+            fn=postprocess,
+            inputs=[raw_transcript, prompt_box],
+            outputs=[magic_box, think_box]
         )

 
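The postprocess helper runs the checkpoint's underlying Qwen LLM in text-only mode: model.llm.disable_adapter() temporarily bypasses the speech adapter during generation, and the decoded string is then stripped of the echoed chat template and the optional <think>...</think> reasoning span. If the adapter is a PEFT LoRA (an assumption, not confirmed by this diff), the same bypass pattern looks like this outside NeMo; the checkpoint name and adapter path are hypothetical placeholders.

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")  # hypothetical base model
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = PeftModel.from_pretrained(base, "path/to/adapter")               # hypothetical adapter path

inputs = tok("Summarize the following:\n\n...", return_tensors="pt")
with torch.inference_mode(), model.disable_adapter():
    # With the adapter bypassed, generation comes from the base LLM alone.
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tok.decode(output_ids[0], skip_special_tokens=True))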