Spaces:

VanYsa
/

MyAlexa

Paused

App Files Files Community

VanYsa committed on Apr 28, 2024

Commit

c9b4e47

1 Parent(s): 7143888

Update app.py

Browse files

Edited texts and added a repeat button that has no function yet

Files changed (1) hide show

app.py +52 -25

app.py CHANGED Viewed

@@ -21,10 +21,10 @@ MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
 DESCRIPTION = '''
 <div>
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
-<p style='text-align: center'>This is a demo of a voice chat that accepts an audio input up to 40 seconds long. Transcription and responses are limited to the English language.</p>
-<p>This Space uses nvidia's canaray 1b model to transcribe input audio, LLama3 for LLM and VITS for TTS <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
-<p>May not work with audio files longer than 40 seconds if cuda is not available <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
-<p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
 </div>
 '''
 PLACEHOLDER = """
@@ -63,7 +63,7 @@ amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
-	Do not convert and raise error if audio too long.
 	Returns output filename and duration.
 	"""
@@ -90,6 +90,11 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 def transcribe(audio_filepath):
 	if audio_filepath is None:
 		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
@@ -121,25 +126,28 @@ def transcribe(audio_filepath):
 	return output_text
 def add_message(history, message):
-    history.append((message, None))
-    return history, gr.Textbox(value="", interactive=False)
 def bot(history):
-    response = "**That's cool!**" #TODO Llama3 response
-    history[-1][1] = ""
-    for character in response:
-        history[-1][1] += character
-        time.sleep(0.05)
-        yield history
 with gr.Blocks(
-	title="NeMo Canary Model",
 	css="""
 		textarea { font-size: 18px;}
-		#chat_input span {
-			font-size: 18px;
-			font-weight: bold;
-		}
 	""",
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
@@ -163,30 +171,49 @@ with gr.Blocks(
 		with gr.Column():
-			gr.HTML("<p><b>Step 3:</b> Transcribe audio</p>")
-			go_button = gr.Button(
-				value="Transcribe audio",
-				variant="primary", # make "primary" so it stands out (default is "secondary")
 			)
 			chat_input = gr.Textbox(
 				label="Transcribed text:",
-				interactive=True,
 				placeholder="Enter message",
 				elem_id="chat_input",
 				visible=False
 			)
-			clear = gr.ClearButton([chatbot])
 	chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
 	bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
 	bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
-	go_button.click(
 		fn=transcribe,
 		inputs = [audio_file],
 		outputs = [chat_input]
 	)
 demo.queue()

 DESCRIPTION = '''
 <div>
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
+<p style='text-align: center'>MyAlexa is a demo of a voice chat assistant that accepts audio input and outputs a voice response with chat logs. </p>
+<p>This space uses <a href="https://huggingface.co/nvidia/canary-1b"><b>NVIDIA Canary 1B</b></a> for Automatic Speech Recognition (ASR), <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama 3 8B Instruct</b></a> for the large language model (LLM) and <a href="https://huggingface.co/docs/transformers/en/model_doc/vits"><b>VITS</b></a> for text to speech (TTS).</p>
+<p>This demo accepts audio inputs up to 40 seconds long.</p>
+<p>Transcription and responses are limited to the English language.</p>
 </div>
 '''
 PLACEHOLDER = """
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
+	Do not convert and raise error if audio is too long.
 	Returns output filename and duration.
 	"""
 def transcribe(audio_filepath):
+	"""
+	Transcribes a converted audio file.
+	Set to English language with punctuation.
+	Returns the output text.
+	"""
 	if audio_filepath is None:
 		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
 	return output_text
 def add_message(history, message):
+	"""
+	Adds the input message to the chatbot.
+	Returns the updated chatbot with an empty input textbox.
+	"""
+	history.append((message, None))
+	return history, gr.Textbox(value="", interactive=False)
 def bot(history):
+	"""
+	Prints the LLM's response in the chatbot
+	"""
+	response = "**That's cool!**" #TODO Llama3 response
+	history[-1][1] = ""
+	for character in response:
+		history[-1][1] += character
+		time.sleep(0.05)
+		yield history
 with gr.Blocks(
+	title="MyAlexa",
 	css="""
 		textarea { font-size: 18px;}
 	""",
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
 		with gr.Column():
+			gr.HTML("<p><b>Step 2:</b> Enter audio as input and wait for MyAlexa's response.</p>")
+			submit_button = gr.Button(
+				value="Submit audio",
+				variant="primary"
 			)
 			chat_input = gr.Textbox(
 				label="Transcribed text:",
+				interactive=False,
 				placeholder="Enter message",
 				elem_id="chat_input",
 				visible=False
 			)
+			gr.HTML("<p><b>Optional:</b> Enter audio as input and wait for MyAlexa's response.</p>")
+			repeat_button = gr.Button(
+				value="Repeat audio",
+				variant="secondary"
+			)
+			gr.HTML("<p><b>Optional:</b> Clear the chatbox.</p>")
+			clear = gr.ClearButton(
+				components=[chatbot],
+				value="Clear chat",
+			)
 	chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
 	bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
 	bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])
+	submit_button.click(
 		fn=transcribe,
 		inputs = [audio_file],
 		outputs = [chat_input]
+	)
+	repeat_button.click(
+		fn=None,
+		inputs = None,
+		outputs = None
 	)
 demo.queue()