Update app.py
app.py CHANGED
@@ -18,46 +18,24 @@ generation_config_multi = model_multi.default_generation_config
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-
-# try:
-#     if audio_file is not None:
-#         current_audio = audio_file  # Update state if a new file is uploaded
-
-#     if current_audio is None:
-#         return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
-
-#     sound = llava.Sound(current_audio)
-#     prompt = f"<sound>\n{user_input}"
-
-#     response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
-
-#     history.append((user_input, response))
-#     return history, history, current_audio
-# except Exception as e:
-#     history.append((user_input, f"❌ Error: {str(e)}"))
-#     return history, history, current_audio
-
-def multi_turn_chat(user_input, audio_file, history, audio_history):
+def multi_turn_chat(user_input, audio_file, history, current_audio):
     try:
         if audio_file is not None:
-
-
-        if not audio_history:
-            return history + [("System", "❌ Please upload an audio file before chatting.")], history, audio_history
+            current_audio = audio_file  # Update state if a new file is uploaded
 
-
-
+        if current_audio is None:
+            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-
+        sound = llava.Sound(current_audio)
         prompt = f"<sound>\n{user_input}"
-        response = model_multi.generate_content(audio_sounds + [prompt], generation_config=generation_config_multi)
 
-
-        return history, history, audio_history
+        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
+        history.append((user_input, response))
+        return history, history, current_audio
     except Exception as e:
         history.append((user_input, f"❌ Error: {str(e)}"))
-        return history, history,
+        return history, history, current_audio
 
 
 def speech_prompt_infer(audio_prompt_file):
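The net effect of this hunk: the chat function now tracks a single cached audio path instead of a list of past uploads. A new upload overwrites current_audio, and follow-up turns without an upload reuse the cached path. Below is a minimal usage sketch of the new signature; the question strings and the .wav path are hypothetical, and it assumes llava and model_multi are already initialised as in app.py.

# Hypothetical smoke test of the rewritten multi_turn_chat (the file path and
# questions are made up; llava and model_multi come from app.py's setup).
history, chat_state, current_audio = multi_turn_chat(
    "What instruments do you hear?",  # user_input
    "samples/jazz.wav",               # freshly uploaded audio (hypothetical path)
    [],                               # empty chat history
    None,                             # no cached audio yet
)
# Follow-up turn: no new upload, so the cached current_audio path is reused.
history, chat_state, current_audio = multi_turn_chat(
    "Is the tempo fast or slow?", None, history, current_audio
)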
@@ -142,16 +120,13 @@ with gr.Blocks(css="""
         user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
         btn_multi = gr.Button("Send")
         history_state = gr.State([])  # Chat history
-
-        audio_history_state = gr.State([])  # List of audio file paths
+        current_audio_state = gr.State(None)  # Most recent audio file path
 
 
         btn_multi.click(
             fn=multi_turn_chat,
-            inputs=[user_input_multi, audio_input_multi, history_state,
-            outputs=[chatbot, history_state,
-            # inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
-            # outputs=[chatbot, history_state, current_audio_state]
+            inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+            outputs=[chatbot, history_state, current_audio_state]
         )
         gr.Examples(
             examples=[
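This wiring follows Gradio's standard gr.State pattern: each state component appears in both inputs and outputs of the .click() call, so its value round-trips through the handler on every click. A self-contained sketch of that pattern is shown below (a toy echo handler, not the actual app code).

# Minimal, standalone illustration of the gr.State round-trip used above.
import gradio as gr

def echo_with_memory(message, history):
    # Append the new exchange and return it twice: once for the Chatbot
    # display, once to persist it in the gr.State slot.
    history = history + [(message, f"echo: {message}")]
    return history, history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    box = gr.Textbox(label="Your message")
    state = gr.State([])  # per-session value that survives across clicks
    btn = gr.Button("Send")
    btn.click(fn=echo_with_memory, inputs=[box, state], outputs=[chatbot, state])

# demo.launch()

Switching the audio slot from gr.State([]) to gr.State(None) makes the "no audio yet" case explicit, which the handler checks before building the prompt.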
@@ -207,7 +182,7 @@ To enable these capabilities, we propose several large-scale training datasets c
 
 💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
 
-💡 Audio Flamingo 3 supports on-demand thinking for chain-of-
+💡 Audio Flamingo 3 supports on-demand thinking for chain-of-thought reasoning.
 
 💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
 