Update app.py
app.py CHANGED
@@ -10,6 +10,8 @@ client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
 # Initialize the ASR pipeline
 asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
 
+INITIAL_MESSAGE = "Hi! I'm your music buddy—tell me about your mood and the type of tunes you're in the mood for today!"
+
 def speech_to_text(speech):
     """Converts speech to text using the ASR pipeline."""
     return asr(speech)["text"]
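For context, the wav2vec2 pipeline used here returns a dict with a "text" field, and the voice input component passes a file path (type="filepath"), so speech_to_text can be exercised on its own. A minimal standalone sketch, assuming a hypothetical local recording:

    from transformers import pipeline

    # Same checkpoint as in app.py; "sample.wav" is a hypothetical clip path.
    asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
    print(asr("sample.wav")["text"])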
@@ -51,19 +53,18 @@ def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, r
         return playlist_message
     return output
 
-
 def format_prompt(message, history):
     """Formats the prompt including fixed instructions and conversation history."""
-    fixed_prompt
-
+    fixed_prompt= """
+    You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
 
-
+    Note: Do not write anything else other than the classified mood if classified.
 
-
+    Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.
 
-
+    Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.
 
-
+    Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.
 
     Examples
     User: What is C programming?

@@ -223,14 +224,13 @@ def format_prompt(message, history):
 
     User: Lets turn up the music and have some fun!
     LLM Response: Party
-"""
-
+    """
     prompt = f"{fixed_prompt}\n"
     for user_prompt, bot_response in history:
         prompt += f"User: {user_prompt}\nLLM Response: {bot_response}\n"
     prompt += f"User: {message}\nLLM Response:"
     return prompt
-
+
 async def text_to_speech(text):
     communicate = edge_tts.Communicate(text)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
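To make the prompt structure concrete, here is a rough illustration of the string format_prompt assembles, using an invented one-turn history; the fixed_prompt text is abbreviated, and the full wording is in the hunks above:

    # Hypothetical walk-through of format_prompt's output shape (not part of app.py).
    fixed_prompt = "You are a smart mood analyser ... {Happy, Sad, Instrumental, Party}"
    history = [("I need a coffee", "Sounds like a long day! What kind of music usually lifts you up?")]
    message = "Lets turn up the music and have some fun!"

    prompt = f"{fixed_prompt}\n"
    for user_prompt, bot_response in history:
        prompt += f"User: {user_prompt}\nLLM Response: {bot_response}\n"
    prompt += f"User: {message}\nLLM Response:"
    # The model is expected to complete this with a single word, e.g. "Party".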
@@ -252,6 +252,9 @@ async def generate_audio(history):
         return audio_path
     return None
 
+def init_chat():
+    return [("", INITIAL_MESSAGE)], [("", INITIAL_MESSAGE)], None
+
 # Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
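The three values returned by init_chat line up positionally with the outputs it is wired to in the next hunk: the conversation state, the visible chatbot history, and the audio player. A small sketch of that mapping, with names mirroring app.py:

    # ("", INITIAL_MESSAGE) is a (user, bot) tuple with an empty user side,
    # so the chatbot shows only the greeting when the page loads.
    state_value, chatbot_value, audio_value = init_chat()
    # state_value   -> [("", INITIAL_MESSAGE)]
    # chatbot_value -> [("", INITIAL_MESSAGE)]
    # audio_value   -> None (no audio is generated for the greeting)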
@@ -266,6 +269,9 @@
     submit = gr.Button("Send")
     voice_input = gr.Audio(sources="microphone", type="filepath", label="Voice Input")
 
+    # Initialize chat with greeting
+    demo.load(init_chat, outputs=[state, chatbot, audio_output])
+
     # Handle text input
     msg.submit(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
         generate_audio, inputs=[state], outputs=[audio_output]