Update app.py
Transcription and chat input are now working. No bot LLM response yet.
app.py
CHANGED
@@ -1,3 +1,5 @@
+
+
 import gradio as gr
 import json
 import librosa
@@ -15,13 +17,13 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTask
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
 SAMPLE_RATE = 16000  # Hz
-
+MAX_AUDIO_SECONDS = 40  # won't try to transcribe if longer than this
 DESCRIPTION = '''
 <div>
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
-<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to
+<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to 40 seconds long. Transcription and responses are limited to the English language.</p>
 <p>This Space uses NVIDIA's canary-1b model to transcribe input audio, Llama3 for the LLM and VITS for TTS <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
-<p
+<p>May not work with audio files longer than 40 seconds if CUDA is not available. Learn more <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
 <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
 </div>
 '''
@@ -32,7 +34,8 @@ PLACEHOLDER = """
 </div>
 """
 
-
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
 model.eval()
 
 # make sure beam size always 1 for consistency
@@ -68,9 +71,9 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 
     duration = librosa.get_duration(y=data, sr=sr)
 
-    if duration
+    if duration > MAX_AUDIO_SECONDS:
         raise gr.Error(
-            f"This demo can transcribe up to {
+            f"This demo can transcribe up to {MAX_AUDIO_SECONDS} seconds of audio. "
             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
             "(click on the scissors icon to start trimming audio)."
         )
@@ -113,33 +116,16 @@ def transcribe(audio_filepath):
            fout.write(line + '\n')
 
    # call transcribe, passing in manifest filepath
-
-        output_text = model.transcribe(manifest_filepath)[0]
-    else:  # do buffered inference
-        with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
-            with torch.no_grad():
-                hyps = get_buffered_pred_feat_multitaskAED(
-                    frame_asr,
-                    model.cfg.preprocessor,
-                    model_stride_in_secs,
-                    model.device,
-                    manifest=manifest_filepath,
-                    filepaths=None,
-                )
-
-        output_text = hyps[0].text
+    output_text = model.transcribe(manifest_filepath)[0]
 
    return output_text
 
 def add_message(history, message):
-
-
-    if message["text"] is not None:
-        history.append((message["text"], None))
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
+    history.append((message, None))
+    return history, gr.Textbox(value="", interactive=False)
 
 def bot(history):
-    response = "**That's cool!**"
+    response = "**That's cool!**"  # TODO: Llama3 response
    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
@@ -186,11 +172,16 @@ with gr.Blocks(
 
    chat_input = gr.Textbox(
        label="Transcribed text:",
+        interactive=True,
+        placeholder="Enter message",
        elem_id="chat_input",
+        visible=False
    )
-
-
-
+    clear = gr.ClearButton([chatbot])
+
+    chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
+    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
+    bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
 
    go_button.click(
        fn=transcribe,
@@ -198,6 +189,6 @@ with gr.Blocks(
        outputs = [chat_input]
    )
 
-    print(torch.cuda.is_available())
    demo.queue()
-
+if __name__ == "__main__":
+    demo.launch()
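The `bot()` handler still streams a hard-coded placeholder, which matches the commit note: no bot LLM response yet. Below is a minimal sketch of one way the Llama3 step could be filled in, assuming the meta-llama/Meta-Llama-3-8B-Instruct model already linked in DESCRIPTION is reachable through huggingface_hub's InferenceClient. The `llm_client` name, `max_tokens` value, and message-building loop are illustrative assumptions, not part of this commit.

```python
from huggingface_hub import InferenceClient

# Illustrative only: assumes an Inference API / TGI endpoint is serving the
# Llama3 model already linked in DESCRIPTION.
llm_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

def bot(history):
    # Convert the chatbot's [user, assistant] pairs into chat messages.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Stream tokens into the last history entry, keeping the same
    # incremental-update pattern as the placeholder loop.
    history[-1][1] = ""
    for chunk in llm_client.chat_completion(messages, max_tokens=256, stream=True):
        delta = chunk.choices[0].delta.content or ""
        history[-1][1] += delta
        yield history
```

Since `chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")` re-renders the Chatbot on each update, a generator like this would keep the placeholder's typewriter effect without changing the event wiring added in this commit.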