File size: 3,738 Bytes
5959802
 
 
 
 
 
 
 
d71aaa4
 
 
 
 
 
 
 
 
 
 
 
 
5959802
 
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
 
d71aaa4
5959802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d71aaa4
5959802
d71aaa4
5959802
 
 
 
 
 
 
 
d71aaa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5959802
 
 
 
 
d71aaa4
 
5959802
 
d71aaa4
 
 
5959802
d71aaa4
 
 
 
 
 
5959802
 
 
d71aaa4
5959802
 
d71aaa4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# interfaceV2.py

import gradio as gr
import shutil
import os
import subprocess
import sys
from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
from call_openai_api import moni as rtff

# Rutas
AUDIO_RECORD_PATH = os.path.abspath("./assets/audio/grabacion_gradio.wav")
VIDEO_PATH = os.path.abspath("./assets/video/data_video_sun.mp4")
TRANSCRIPTION_TEXT_PATH = os.path.abspath("./results/transcripcion.txt")
RESULT_AUDIO_TEMP_PATH = os.path.abspath("./results/audiov2.wav")
RESULT_AUDIO_FINAL_PATH = os.path.abspath("./assets/audio/audio.wav")
RESULT_VIDEO_PATH = os.path.abspath("./results/result_voice.mp4")
TEXT_TO_SPEECH_PATH = os.path.abspath("./src/text_to_speech.py")
RUN_INFERENCE_PATH = os.path.abspath("./src/run_inference.py")


def transcribir_con_progreso(audio_path):
    """Transcribe an audio file with Whisper while reporting UI progress.

    Writes the transcription to ``TRANSCRIPTION_TEXT_PATH`` via
    ``guardar_transcripcion`` and returns the transcribed text.

    :param audio_path: path to the audio file to transcribe.
    :return: the transcription produced by ``transcribe_audio``.
    """
    progress = gr.Progress()
    progress(0, "Iniciando transcripción...")
    # NOTE(review): model name is hard-coded; presumably whisper-large was
    # chosen for accuracy over speed — confirm before changing.
    whisper_model = "openai/whisper-large"
    progress(25, "Cargando modelo Whisper...")
    texto = transcribe_audio(audio_path, whisper_model)
    progress(75, "Guardando transcripción...")
    guardar_transcripcion(texto, filename=TRANSCRIPTION_TEXT_PATH)
    progress(100, "Transcripción completada.")
    return texto


def generar_audio_desde_texto():
    """Run the external text-to-speech script and publish its output audio.

    Executes ``text_to_speech.py`` in a subprocess; on success, copies the
    temporary audio it produced to the final location used by the video step.

    :return: ``RESULT_AUDIO_FINAL_PATH`` if the script produced an audio
        file, otherwise ``None``.
    :raises RuntimeError: if the subprocess exits with a non-zero status.
    """
    proc = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {proc.stderr}")

    # Guard clause: no output file means the script ran but produced nothing.
    if not os.path.exists(RESULT_AUDIO_TEMP_PATH):
        return None

    os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
    shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
    return RESULT_AUDIO_FINAL_PATH


def procesar_video_audio():
    """Run the lip-sync inference script on the prepared audio and video.

    Executes ``run_inference.py`` in a subprocess with the generated audio
    and the base video, then returns the path of the rendered result.

    :return: ``RESULT_VIDEO_PATH`` if the script produced the output video,
        otherwise ``None``.
    :raises RuntimeError: if the subprocess exits with a non-zero status.
    """
    result = subprocess.run(
        [sys.executable, RUN_INFERENCE_PATH, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True
    )
    # Bug fix: the return code and stderr were captured but never inspected,
    # so a failed inference was silently reported as "no video". Raise with
    # the captured stderr so the caller's error path can surface it.
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando run_inference.py: {result.stderr}")

    if os.path.exists(RESULT_VIDEO_PATH):
        return RESULT_VIDEO_PATH
    else:
        return None


def flujo_completo(audio_file_path):
    """End-to-end pipeline triggered by a new recording.

    Copies the recording into place, transcribes it, sends the transcript to
    the OpenAI helper, synthesizes speech, and renders the lip-synced video.

    :param audio_file_path: path of the recorded audio from the UI widget.
    :return: 5-tuple ``(estado, audio_grabado, transcripcion, audio_tts,
        video)``; on failure the media slots are ``None`` and the status and
        transcription fields carry the error message.
    """
    try:
        shutil.copy(audio_file_path, AUDIO_RECORD_PATH)
        transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
        # Return value intentionally discarded — presumably rtff writes its
        # response where the TTS script can read it; confirm against helper.
        rtff(TRANSCRIPTION_TEXT_PATH)
        audio_generado = generar_audio_desde_texto()
        video_path = procesar_video_audio()
        return (
            "Grabación recibida",
            AUDIO_RECORD_PATH,
            transcripcion,
            audio_generado,
            video_path,
        )
    except Exception as exc:
        # Deliberate broad catch: this is the UI error boundary, so every
        # failure becomes a message instead of crashing the app.
        detalle = str(exc)
        return (
            f"Error durante el flujo completo: {detalle}",
            None,
            f"Error: {detalle}",
            None,
            None,
        )


def interfaz():
    """Build the Gradio Blocks UI for the record-transcribe-render pipeline.

    Left column: looping base video, microphone recorder, status box.
    Right column: recorded audio, TTS audio, rendered video, transcript.

    :return: the constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                # NOTE(review): `source=` is the Gradio 3.x keyword; 4.x
                # renamed it to `sources=[...]` — confirm installed version.
                entrada_audio = gr.Audio(source="microphone", type="filepath", label="Graba tu voz")
                estado = gr.Textbox(label="Estado", interactive=False)

            with gr.Column():
                audio_grabado = gr.Audio(label="Audio grabado", interactive=False)
                audio_tts = gr.Audio(label="Audio TTS", interactive=False)
                video_salida = gr.Video(label="Video procesado", interactive=False)
                transcripcion = gr.Textbox(label="Texto transcrito")

        # Each new recording triggers the full pipeline and fans the results
        # out to the output widgets (order must match flujo_completo's tuple).
        entrada_audio.change(
            flujo_completo,
            inputs=entrada_audio,
            outputs=[estado, audio_grabado, transcripcion, audio_tts, video_salida],
        )

    return demo


if __name__ == "__main__":
    # Build and serve the Gradio app when this module is run directly.
    interfaz().launch()