Update app.py
Transcription and chat input are now working. No bot LLM response yet.
app.py
CHANGED
@@ -1,3 +1,5 @@
+
+
 import gradio as gr
 import json
 import librosa
@@ -15,13 +17,13 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTask
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
 SAMPLE_RATE = 16000  # Hz
-
+MAX_AUDIO_SECONDS = 40  # won't try to transcribe if longer than this
 DESCRIPTION = '''
 <div>
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
-<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to
+<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to 40 seconds long. Transcription and responses are limited to the English language.</p>
 <p>This Space uses NVIDIA's canary-1b model to transcribe input audio, Llama3 for the LLM and VITS for TTS <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
-<p
+<p>May not work with audio files longer than 40 seconds if CUDA is not available. Learn more <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
 <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
 </div>
 '''
@@ -32,7 +34,8 @@ PLACEHOLDER = """
 </div>
 """
 
-
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
 model.eval()
 
 # make sure beam size always 1 for consistency
@@ -68,9 +71,9 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 
     duration = librosa.get_duration(y=data, sr=sr)
 
-    if duration
+    if duration > MAX_AUDIO_SECONDS:
         raise gr.Error(
-            f"This demo can transcribe up to {
+            f"This demo can transcribe up to {MAX_AUDIO_SECONDS} seconds of audio. "
             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
             "(click on the scissors icon to start trimming audio)."
         )
@@ -113,33 +116,16 @@ def transcribe(audio_filepath):
            fout.write(line + '\n')
 
    # call transcribe, passing in manifest filepath
-
-        output_text = model.transcribe(manifest_filepath)[0]
-    else:  # do buffered inference
-        with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
-            with torch.no_grad():
-                hyps = get_buffered_pred_feat_multitaskAED(
-                    frame_asr,
-                    model.cfg.preprocessor,
-                    model_stride_in_secs,
-                    model.device,
-                    manifest=manifest_filepath,
-                    filepaths=None,
-                )
-
-        output_text = hyps[0].text
+    output_text = model.transcribe(manifest_filepath)[0]
 
    return output_text
 
 def add_message(history, message):
-
-
-    if message["text"] is not None:
-        history.append((message["text"], None))
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
+    history.append((message, None))
+    return history, gr.Textbox(value="", interactive=False)
 
 def bot(history):
-    response = "**That's cool!**"
+    response = "**That's cool!**"  # TODO: Llama3 response
    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
@@ -186,11 +172,16 @@ with gr.Blocks(
 
    chat_input = gr.Textbox(
        label="Transcribed text:",
+        interactive=True,
+        placeholder="Enter message",
        elem_id="chat_input",
+        visible=False
    )
-
-
-
+    clear = gr.ClearButton([chatbot])
+
+    chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
+    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
+    bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
 
    go_button.click(
        fn=transcribe,
@@ -198,6 +189,6 @@ with gr.Blocks(
        outputs = [chat_input]
    )
 
-    print(torch.cuda.is_available())
    demo.queue()
-
+if __name__ == "__main__":
+    demo.launch()
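The `bot()` handler still streams a hard-coded placeholder, which matches the commit note: no bot LLM response yet. Below is a minimal sketch of one way the Llama3 step could be filled in, assuming the meta-llama/Meta-Llama-3-8B-Instruct model already linked in DESCRIPTION is reachable through huggingface_hub's InferenceClient. The `llm_client` name, `max_tokens` value, and message-building loop are illustrative assumptions, not part of this commit.

```python
from huggingface_hub import InferenceClient

# Illustrative only: assumes an Inference API / TGI endpoint is serving the
# Llama3 model already linked in DESCRIPTION.
llm_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

def bot(history):
    # Convert the chatbot's [user, assistant] pairs into chat messages.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Stream tokens into the last history entry, keeping the same
    # incremental-update pattern as the placeholder loop.
    history[-1][1] = ""
    for chunk in llm_client.chat_completion(messages, max_tokens=256, stream=True):
        delta = chunk.choices[0].delta.content or ""
        history[-1][1] += delta
        yield history
```

Since `chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")` re-renders the Chatbot on each update, a generator like this would keep the placeholder's typewriter effect without changing the event wiring added in this commit.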