File size: 3,816 Bytes
aeb448f
5959802
 
 
 
 
 
3c49341
aeb448f
 
3c49341
aeb448f
 
3c49341
d71aaa4
 
 
 
 
 
 
 
 
 
 
5959802
 
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
d71aaa4
5959802
 
d71aaa4
5959802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d71aaa4
5959802
d71aaa4
5959802
 
 
 
 
 
 
 
d71aaa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5959802
 
 
 
 
d71aaa4
 
5959802
 
d71aaa4
 
 
5959802
d71aaa4
 
 
 
 
 
5959802
 
 
d71aaa4
5959802
 
d71aaa4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# interfaceV3.py
#
# Gradio front-end for a record -> transcribe -> LLM -> TTS -> lip-sync pipeline.
# Each stage reads/writes the fixed paths declared below.

import gradio as gr
import shutil
import os
import subprocess
import sys

# ADD THE CUSTOM MODULE PATH so the local ./src helpers can be imported.
sys.path.append(os.path.abspath("./src"))

from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
from call_openai_api import moni as rtff

# Paths (all absolute, anchored at the current working directory)
AUDIO_RECORD_PATH = os.path.abspath("./assets/audio/grabacion_gradio.wav")      # where the mic recording is copied
VIDEO_PATH = os.path.abspath("./assets/video/data_video_sun.mp4")               # base video fed to inference
TRANSCRIPTION_TEXT_PATH = os.path.abspath("./results/transcripcion.txt")        # Whisper transcript output
RESULT_AUDIO_TEMP_PATH = os.path.abspath("./results/audiov2.wav")               # raw TTS output (script-side)
RESULT_AUDIO_FINAL_PATH = os.path.abspath("./assets/audio/audio.wav")           # TTS audio staged for inference
RESULT_VIDEO_PATH = os.path.abspath("./results/result_voice.mp4")               # lip-synced video produced by inference
TEXT_TO_SPEECH_PATH = os.path.abspath("./src/text_to_speech.py")                # TTS script run via subprocess
RUN_INFERENCE_PATH = os.path.abspath("./src/run_inference.py")                  # lip-sync script run via subprocess

def transcribir_con_progreso(audio_path):
    """Transcribe *audio_path* with Whisper, emitting coarse progress updates.

    Writes the transcript to TRANSCRIPTION_TEXT_PATH and returns the text.

    NOTE(review): gr.Progress() is instantiated by hand here instead of being
    injected as an event-handler parameter; whether the bar actually renders
    depends on the Gradio version — confirm in the UI.
    """
    avance = gr.Progress()
    avance(0, "Iniciando transcripción...")
    modelo = "openai/whisper-large"
    avance(25, "Cargando modelo Whisper...")
    texto = transcribe_audio(audio_path, modelo)
    avance(75, "Guardando transcripción...")
    guardar_transcripcion(texto, filename=TRANSCRIPTION_TEXT_PATH)
    avance(100, "Transcripción completada.")
    return texto


def generar_audio_desde_texto():
    """Run the text-to-speech script and stage its output for inference.

    Returns RESULT_AUDIO_FINAL_PATH on success, or None when the script
    finished but produced no audio file. Raises RuntimeError when the
    subprocess exits with a non-zero status.
    """
    proceso = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True,
    )
    if proceso.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {proceso.stderr}")

    # The script writes to a temp location; bail out if nothing appeared there.
    if not os.path.exists(RESULT_AUDIO_TEMP_PATH):
        return None

    # Ensure the destination directory exists, then publish the audio where
    # the inference step expects it.
    os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
    shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
    return RESULT_AUDIO_FINAL_PATH


def procesar_video_audio():
    """Run the lip-sync inference script over the staged audio and base video.

    Returns RESULT_VIDEO_PATH when the output video exists, otherwise None.
    Raises RuntimeError when the subprocess exits non-zero, mirroring the
    error handling in generar_audio_desde_texto.
    """
    result = subprocess.run(
        [sys.executable, RUN_INFERENCE_PATH, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True
    )
    # Bug fix: the original captured the exit status but never checked it, so
    # a failing inference run was silently reported as "no video" with no
    # diagnostic. Surface stderr the same way the TTS step does.
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando run_inference.py: {result.stderr}")

    if os.path.exists(RESULT_VIDEO_PATH):
        return RESULT_VIDEO_PATH
    else:
        return None


def flujo_completo(audio_file_path):
    """End-to-end pipeline driven by the Gradio audio widget.

    Copies the recording, transcribes it, sends the transcript to the OpenAI
    helper, synthesizes speech, and lip-syncs the base video.

    Returns a 5-tuple matching the UI outputs:
    (status message, recorded audio path, transcription, TTS audio path, video path).
    On any failure the status and transcription slots carry the error text and
    the media slots are None.
    """
    try:
        shutil.copy(audio_file_path, AUDIO_RECORD_PATH)
        transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
        # Called for its side effects (produces the answer the TTS script
        # consumes); the original bound the return value to an unused local.
        rtff(TRANSCRIPTION_TEXT_PATH)
        audio_generado = generar_audio_desde_texto()
        video_path = procesar_video_audio()

        return "Grabación recibida", AUDIO_RECORD_PATH, transcripcion, audio_generado, video_path

    except Exception as e:
        # Top-level UI boundary: report the failure in the output widgets
        # instead of letting the Gradio callback crash.
        return (
            f"Error durante el flujo completo: {str(e)}",
            None,
            f"Error: {str(e)}",
            None,
            None
        )


def interfaz():
    """Build and return the Gradio Blocks UI for the full pipeline.

    Left column: looping base video, microphone recorder, status box.
    Right column: the pipeline outputs (recorded audio, TTS audio, processed
    video, transcript). Recording a clip triggers flujo_completo.

    NOTE(review): `gr.Audio(source=...)` is the Gradio 3.x keyword; 4.x renamed
    it to `sources=[...]` — confirm against the installed Gradio version.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                # Base avatar video, shown looping while the user records.
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                audio_input = gr.Audio(source="microphone", type="filepath", label="Graba tu voz")
                estado_grabacion = gr.Textbox(label="Estado", interactive=False)

            with gr.Column():
                # Read-only output widgets filled by flujo_completo.
                output_audio = gr.Audio(label="Audio grabado", interactive=False)
                output_audio_speech = gr.Audio(label="Audio TTS", interactive=False)
                video_resultado = gr.Video(label="Video procesado", interactive=False)
                texto_transcripcion = gr.Textbox(label="Texto transcrito")

        # Fire the whole pipeline whenever the recorded audio changes; the
        # output order must match flujo_completo's returned 5-tuple.
        audio_input.change(
            flujo_completo,
            inputs=audio_input,
            outputs=[estado_grabacion, output_audio, texto_transcripcion, output_audio_speech, video_resultado]
        )

    return demo


if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    interfaz().launch()