VanYsa committed
Commit 7143888 · 1 Parent(s): 9279bb0

Update app.py

Transcription and chat input are now working; no bot LLM response yet.

Files changed (1)
  1. app.py +23 -32
app.py CHANGED
@@ -1,3 +1,5 @@
+
+
 import gradio as gr
 import json
 import librosa
@@ -15,13 +17,13 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTask
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
 SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_MINUTES = 10 # won't try to transcribe if longer than this
+MAX_AUDIO_SECONDS = 40 # won't try to transcribe if longer than this
 DESCRIPTION = '''
 <div>
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
-<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to 10 minutes long. Translation and text are limited to the English language.</p>
+<p style='text-align: center'>This is a demo of a voice chat that accepts audio input up to 40 seconds long. Transcription and responses are limited to the English language.</p>
 <p>This Space uses NVIDIA's canary-1b model to transcribe input audio, Llama3 for the LLM and VITS for TTS <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate it to run privately!</p>
-<p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
+<p>May not work with audio files longer than 40 seconds if CUDA is not available. See <a href="https://huggingface.co/blog/llama3">the Llama3 blog post</a> for more details.</p>
 <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
 </div>
 '''
@@ -32,7 +34,8 @@ PLACEHOLDER = """
 </div>
 """
 
-model = ASRModel.from_pretrained("nvidia/canary-1b")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
 model.eval()
 
 # make sure beam size always 1 for consistency
@@ -68,9 +71,9 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 
     duration = librosa.get_duration(y=data, sr=sr)
 
-    if duration / 60.0 > MAX_AUDIO_MINUTES:
+    if duration > MAX_AUDIO_SECONDS:
         raise gr.Error(
-            f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+            f"This demo can transcribe up to {MAX_AUDIO_SECONDS} seconds of audio. "
             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
             "(click on the scissors icon to start trimming audio)."
         )
@@ -113,33 +116,16 @@ def transcribe(audio_filepath):
         fout.write(line + '\n')
 
     # call transcribe, passing in manifest filepath
-    if duration < 40:
-        output_text = model.transcribe(manifest_filepath)[0]
-    else: # do buffered inference
-        with torch.cuda.amp.autocast(dtype=amp_dtype): # TODO: make it work if no cuda
-            with torch.no_grad():
-                hyps = get_buffered_pred_feat_multitaskAED(
-                    frame_asr,
-                    model.cfg.preprocessor,
-                    model_stride_in_secs,
-                    model.device,
-                    manifest=manifest_filepath,
-                    filepaths=None,
-                )
-
-        output_text = hyps[0].text
+    output_text = model.transcribe(manifest_filepath)[0]
 
     return output_text
 
 def add_message(history, message):
-    for x in message["files"]:
-        history.append(((x,), None))
-    if message["text"] is not None:
-        history.append((message["text"], None))
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
+    history.append((message, None))
+    return history, gr.Textbox(value="", interactive=False)
 
 def bot(history):
-    response = "**That's cool!**"
+    response = "**That's cool!**"  # TODO: Llama3 response
     history[-1][1] = ""
     for character in response:
         history[-1][1] += character
@@ -186,11 +172,16 @@ with gr.Blocks(
 
     chat_input = gr.Textbox(
         label="Transcribed text:",
+        interactive=True,
+        placeholder="Enter message",
         elem_id="chat_input",
+        visible=False
     )
-    # chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
-    # bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
-    # bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
+    clear = gr.ClearButton([chatbot])
+
+    chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
+    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
+    bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
 
     go_button.click(
         fn=transcribe,
@@ -198,6 +189,6 @@
         outputs = [chat_input]
     )
 
-    print(torch.cuda.is_available())
     demo.queue()
-    demo.launch()
+if __name__ == "__main__":
+    demo.launch()
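The hard-coded "**That's cool!**" reply in bot() is the gap the commit message flags. Below is a minimal sketch of how that TODO might later be filled in, assuming the Space streams completions from the gated meta-llama/Meta-Llama-3-8B-Instruct model (already linked in DESCRIPTION) through huggingface_hub's InferenceClient; the commit itself ships only the placeholder, so the wiring here is an assumption, not the author's implementation.

from huggingface_hub import InferenceClient

# Hypothetical client; requires a token with access to the gated Llama3 weights.
client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

def bot(history):
    user_message = history[-1][0]  # transcribed text appended by add_message
    history[-1][1] = ""
    # Stream the completion so the Chatbot fills in incrementally,
    # mirroring the character-by-character loop of the placeholder.
    for chunk in client.chat_completion(
        messages=[{"role": "user", "content": user_message}],
        max_tokens=256,
        stream=True,
    ):
        history[-1][1] += chunk.choices[0].delta.content or ""
        yield history

Because bot() is already chained to chat_input.change via chat_msg.then, a generator like this would drop in without changing the event wiring.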