Spaces (status: Runtime error)

Commit: Update app.py

app.py CHANGED
@@ -177,10 +177,12 @@ def synthesize_speech(text):
         print(f"Speech synthesis error: {str(e)}")
         return None
 
-#
+# ... (keep all previous imports and class definitions)
+
+# Updated predict_speaker function to return consistent values
 def predict_speaker(audio, model, processor):
     if audio is None:
-        return "Aucun audio détecté.",
+        return "Aucun audio détecté.", {}, "Aucun texte reconnu", "Inconnu"  # Now returns 4 values
 
     try:
         audio_data, sr = sf.read(audio)
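Note on this first hunk: the old early return carried a trailing comma, so it evaluated to a 1-tuple rather than a bare string, and any caller unpacking four values (as recognize() does below) raised a ValueError. A minimal illustration of the arity mismatch (variable names here are ours, not from app.py):

    old = "Aucun audio détecté.",                                        # ('Aucun audio détecté.',) : a 1-tuple
    new = "Aucun audio détecté.", {}, "Aucun texte reconnu", "Inconnu"   # four values
    res, probs, text, speaker = new                                      # unpacking now matches the call site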
@@ -191,7 +193,7 @@ def predict_speaker(audio, model, processor):
 
         with torch.no_grad():
             output = model(input_tensor)
-            print(output)
+            print(output)  # Debug output
             probabilities = F.softmax(output, dim=1)
             confidence, predicted_class = torch.max(probabilities, 1)
 
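This hunk only annotates the existing print(output) as debug output; left in place, it dumps raw logits to the Space logs on every request. If the trace is still wanted, a sketch using the standard logging module (our suggestion, not part of this commit) keeps it switchable:

    import logging

    logger = logging.getLogger(__name__)

    with torch.no_grad():
        output = model(input_tensor)
        logger.debug("raw logits: %s", output)  # emitted only when DEBUG logging is enabled
        probabilities = F.softmax(output, dim=1)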
@@ -203,28 +205,45 @@ def predict_speaker(audio, model, processor):
         probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}
 
         # Recognize speech
-        recognized_text = recognize_speech(audio)
+        recognized_text = recognize_speech(audio) if speech_recognizer else "Modèle de reconnaissance vocale non disponible"
 
-        return result, probs_dict, recognized_text,predicted_speaker
+        return result, probs_dict, recognized_text, predicted_speaker  # Now returns 4 values
 
     except Exception as e:
-        return f"Erreur : {str(e)}",
+        return f"Erreur : {str(e)}", {}, "Erreur de reconnaissance", "Inconnu"
 
-#
-def
-
-
-
-
-
-
-
-
-
-
+# Updated recognize function
+def recognize(audio, selected_model):
+    model = load_model(model_filename=selected_model)
+    if model is None:
+        return "Erreur: Modèle non chargé", None, "Erreur", None
+
+    res, probs, text, speaker = predict_speaker(audio, model, processor)  # Now expects 4 values
+
+    # Generate plot
+    fig = None
+    if probs:
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.bar(probs.keys(), probs.values(), color='skyblue')
+        ax.set_ylim([0, 1])
+        ax.set_ylabel("Confiance")
+        ax.set_xlabel("Locuteurs")
+        ax.set_title("Probabilités de reconnaissance")
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+
+    # Generate speech synthesis if text was recognized
+    synth_audio = None
+    if synthesizer is not None and text and "erreur" not in text.lower():
+        try:
+            synth_text = f"Le locuteur {speaker} a dit : {text}" if speaker else f"Le locuteur a dit : {text}"
+            synth_audio = synthesize_speech(synth_text)
+        except Exception as e:
+            print(f"Erreur de synthèse vocale: {e}")
+
+    return res, fig, text, synth_audio if synth_audio else None
 
-#
+# Updated interface creation
 def create_interface():
     processor = AudioProcessor()
 
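Two things to watch in this hunk. First, synth_audio if synth_audio else None is equivalent to plain synth_audio here, since synth_audio is already None whenever synthesis did not run. Second, recognize() is now defined at module level but still reads processor, speech_recognizer, and synthesizer as globals, while processor is only created inside create_interface() (line 248 of the new file); unless app.py defines a module-level processor elsewhere, calling recognize() raises NameError. One way to thread it through is a small factory, sketched below with the names from this diff (make_recognize itself is hypothetical, not in the commit):

    # Hypothetical factory: closes over the processor built in create_interface()
    # instead of relying on a module-level global.
    def make_recognize(processor):
        def recognize(audio, selected_model):
            model = load_model(model_filename=selected_model)
            if model is None:
                return "Erreur: Modèle non chargé", None, "Erreur", None
            res, probs, text, speaker = predict_speaker(audio, model, processor)
            # ... plotting and synthesis exactly as in the committed recognize() ...
            return res, None, text, None
        return recognize

    # inside create_interface():
    #     recognize = make_recognize(processor)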
@@ -239,49 +258,29 @@ def create_interface():
             value="model_3.pth",
             label="Choisissez le modèle"
         )
-
+
+        with gr.Tab("Microphone"):
+            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙️ Enregistrer depuis le microphone")
+
+        with gr.Tab("Upload Audio"):
+            file_input = gr.Audio(sources=["upload"], type="filepath", label="📁 Télécharger un fichier audio")
+
         record_btn = gr.Button("Reconnaître")
+
         with gr.Column():
             result_text = gr.Textbox(label="Résultat")
             plot_output = gr.Plot(label="Confiance par locuteur")
             recognized_text = gr.Textbox(label="Texte reconnu")
-            audio_output = gr.Audio(label="Synthèse vocale",
-
-        def recognize(audio, selected_model):
-            model = load_model(model_filename=selected_model)
-            res, probs, text,locuteur = predict_speaker(audio, model, processor)
-
-            # Generate plot
-            fig = None
-            if probs:
-                fig, ax = plt.subplots()
-                ax.bar(probs.keys(), probs.values(), color='skyblue')
-                ax.set_ylim([0, 1])
-                ax.set_ylabel("Confiance")
-                ax.set_xlabel("Locuteurs")
-                plt.xticks(rotation=45)
-
-            # Generate speech synthesis if text was recognized
-            synth_audio = None
-            if text and "error" not in text.lower():
-                synth_text = f"{locuteur} said : {text}"
-                synth_audio = synthesize_speech(synth_text)
-
-            return res, fig, text, synth_audio
-
-        record_btn.click(fn=recognize,
-                         inputs=[audio_input, model_selector],
-                         outputs=[result_text, plot_output, recognized_text, audio_output])
+            audio_output = gr.Audio(label="Synthèse vocale", visible=False)
 
-
-
-
-
-
+        record_btn.click(
+            fn=recognize,
+            inputs=[gr.inputs.Union([mic_input, file_input]), model_selector],
+            outputs=[result_text, plot_output, recognized_text, audio_output]
+        )
 
     return interface
 
-# Lancer
 if __name__ == "__main__":
     app = create_interface()
-    app.launch()
+    app.launch(share=True)
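A caveat on the new wiring: gr.inputs.Union does not exist in Gradio. The legacy gr.inputs namespace (deprecated since Gradio 3) never provided a Union helper, so create_interface() raises an AttributeError as soon as the Space starts, which is consistent with the Runtime error status above. A working alternative, assuming Gradio 4.x and the component names from this diff, is to pass both audio components and let the handler prefer whichever one is set:

    # Sketch of a working wiring (our suggestion, not the committed code)
    def recognize(mic_audio, file_audio, selected_model):
        audio = mic_audio if mic_audio is not None else file_audio  # prefer the mic recording
        model = load_model(model_filename=selected_model)
        if model is None:
            return "Erreur: Modèle non chargé", None, "Erreur", None
        res, probs, text, speaker = predict_speaker(audio, model, processor)
        # ... plotting and synthesis as in the committed recognize() ...
        return res, None, text, None

    record_btn.click(
        fn=recognize,
        inputs=[mic_input, file_input, model_selector],
        outputs=[result_text, plot_output, recognized_text, audio_output],
    )

A simpler option is a single gr.Audio(sources=["microphone", "upload"], type="filepath") component in place of the two tabs, which removes the need to merge inputs at all.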
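Two smaller observations on the final state. audio_output is created with visible=False and nothing in the diff ever toggles it back, so the synthesized audio stays hidden even when synthesis succeeds; if hide-until-ready is the intent, the handler can return an update instead (assuming Gradio 4.x, where gr.update is available):

    # In recognize(): reveal the player only when synthesis produced audio
    return res, fig, text, gr.update(value=synth_audio, visible=synth_audio is not None)

And share=True in app.launch() should be unnecessary on Hugging Face Spaces, which already serves the app publicly; Gradio ignores the flag there.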