File size: 3,783 Bytes
5959802
 
 
 
 
 
 
 
d71aaa4
 
3c49341
 
 
d71aaa4
3c49341
d71aaa4
 
 
 
 
 
 
 
 
 
5959802
 
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
 
d71aaa4
5959802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d71aaa4
5959802
d71aaa4
5959802
 
 
 
 
 
 
 
d71aaa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5959802
 
 
 
 
d71aaa4
 
5959802
 
d71aaa4
 
 
5959802
d71aaa4
 
 
 
 
 
5959802
 
 
d71aaa4
5959802
 
d71aaa4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# interfaceV2.py

import gradio as gr
import shutil
import os
import subprocess
import sys
from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
from call_openai_api import moni as rtff




# Project paths (all resolved to absolute paths at import time).
# NOTE(review): this sys.path extension runs AFTER the project imports at the
# top of the file — if whisper_audio_transcriber / call_openai_api live in
# ./src, it has no effect on them; confirm and move it above the imports.
sys.path.append(os.path.abspath("./src"))
AUDIO_RECORD_PATH = os.path.abspath("./assets/audio/grabacion_gradio.wav")
VIDEO_PATH = os.path.abspath("./assets/video/data_video_sun.mp4")
TRANSCRIPTION_TEXT_PATH = os.path.abspath("./results/transcripcion.txt")
RESULT_AUDIO_TEMP_PATH = os.path.abspath("./results/audiov2.wav")
RESULT_AUDIO_FINAL_PATH = os.path.abspath("./assets/audio/audio.wav")
RESULT_VIDEO_PATH = os.path.abspath("./results/result_voice.mp4")
TEXT_TO_SPEECH_PATH = os.path.abspath("./src/text_to_speech.py")
RUN_INFERENCE_PATH = os.path.abspath("./src/run_inference.py")

def transcribir_con_progreso(audio_path):
    """Transcribe *audio_path* with Whisper while reporting progress to the UI.

    Saves the transcript to TRANSCRIPTION_TEXT_PATH and returns the text.
    """
    barra = gr.Progress()
    barra(0, "Iniciando transcripción...")
    barra(25, "Cargando modelo Whisper...")
    texto = transcribe_audio(audio_path, "openai/whisper-large")
    barra(75, "Guardando transcripción...")
    guardar_transcripcion(texto, filename=TRANSCRIPTION_TEXT_PATH)
    barra(100, "Transcripción completada.")
    return texto


def generar_audio_desde_texto():
    """Run the text-to-speech script and publish its output audio.

    Executes src/text_to_speech.py in a subprocess; on success copies the
    generated wav from the results folder to the assets folder.

    Returns:
        The final audio path, or None when the script produced no file.

    Raises:
        RuntimeError: when the subprocess exits with a non-zero status.
    """
    proc = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {proc.stderr}")

    # Guard clause: nothing to publish if the script wrote no output.
    if not os.path.exists(RESULT_AUDIO_TEMP_PATH):
        return None

    os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
    shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
    return RESULT_AUDIO_FINAL_PATH


def procesar_video_audio():
    """Run the inference script over the generated audio and the base video.

    Executes src/run_inference.py in a subprocess with the final audio and
    the base video as arguments.

    Returns:
        The result video path, or None when the script ran but produced no
        output file.

    Raises:
        RuntimeError: when the subprocess exits with a non-zero status.
    """
    result = subprocess.run(
        [sys.executable, RUN_INFERENCE_PATH, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True
    )
    # Bug fix: the exit status was captured but never checked, so inference
    # failures were silently reported as "no video". Mirror the error
    # handling used by generar_audio_desde_texto.
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando run_inference.py: {result.stderr}")

    if os.path.exists(RESULT_VIDEO_PATH):
        return RESULT_VIDEO_PATH
    return None


def flujo_completo(audio_file_path):
    """Full pipeline: copy recording, transcribe, query OpenAI, TTS, video.

    Returns a 5-tuple (status message, recorded audio path, transcript,
    generated audio path, result video path); on any failure the paths are
    None and the status/transcript fields carry the error message.
    """
    try:
        shutil.copy(audio_file_path, AUDIO_RECORD_PATH)
        texto = transcribir_con_progreso(AUDIO_RECORD_PATH)
        # Called for its side effects; the return value is not used here.
        rtff(TRANSCRIPTION_TEXT_PATH)
        audio_tts = generar_audio_desde_texto()
        video = procesar_video_audio()
    except Exception as exc:
        mensaje = str(exc)
        return (
            f"Error durante el flujo completo: {mensaje}",
            None,
            f"Error: {mensaje}",
            None,
            None
        )
    return "Grabación recibida", AUDIO_RECORD_PATH, texto, audio_tts, video


def interfaz():
    """Build the Gradio Blocks UI: recorder on the left, results on the right."""
    with gr.Blocks() as demo:
        with gr.Row():
            # Left column: looping base video plus the microphone recorder.
            with gr.Column():
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                grabadora = gr.Audio(source="microphone", type="filepath", label="Graba tu voz")
                estado = gr.Textbox(label="Estado", interactive=False)

            # Right column: read-only outputs of the pipeline.
            with gr.Column():
                audio_grabado = gr.Audio(label="Audio grabado", interactive=False)
                audio_tts = gr.Audio(label="Audio TTS", interactive=False)
                video_final = gr.Video(label="Video procesado", interactive=False)
                transcripcion = gr.Textbox(label="Texto transcrito")

        # Any new recording triggers the whole processing pipeline.
        grabadora.change(
            flujo_completo,
            inputs=grabadora,
            outputs=[estado, audio_grabado, transcripcion, audio_tts, video_final]
        )

    return demo


if __name__ == "__main__":
    # Build and serve the Gradio app when run as a script.
    interfaz().launch()