fireedman committed on
Commit d4757ae · 0 Parent(s)

First commit; I think the heavy model files are still missing

.gitignore ADDED
@@ -0,0 +1,24 @@
+ # Ignore the virtual environment
+ env/
+
+ # Ignore compiled files and caches
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.py[cod]
+
+ # Ignore editor and operating-system files
+ .vscode/
+ .DS_Store
+
+ # Ignore logs and test output
+ *.log
+ *.out
+ *.tmp
+
+ # Ignore models, checkpoints, and data
+ models/
+ checkpoints/
+ src/Wav2Lip/
+ assets/
+ data/
estructura_proyecto.txt ADDED
@@ -0,0 +1,34 @@
+ proyecto_root/
+
+ ├── assets/
+ │   ├── video/
+ │   │   └── data_video_sun_5s.mp4
+ │   └── audio/
+ │       └── data_audio_sun_5s.wav
+ │           # Test data: sample audio and video files such as `data_audio_sun_5s.wav`
+
+ ├── checkpoints/
+ │   └── # Pretrained models and checkpoints, such as `wav2lip_gan.pth`
+
+ ├── models/
+ │   └── # Models converted to OpenVINO IR, such as `face_detection.xml` and `wav2lip.xml`
+
+ ├── src/
+ │   ├── utils/
+ │   ├── Wav2Lip/
+ │   ├── convert_models.py
+ │   ├── gradio_helper.py
+ │   ├── ov_inference.py
+ │   ├── ov_wav2lip_helper.py
+ │   └── run_inference.py
+
+ ├── tests/
+ │   └── # Test scripts that check the project's functionality
+
+ ├── results/
+ │   └── result_voice.mp4
+
+ ├── requirements.txt          # Project dependency list
+ ├── setup.py                  # Project setup script
+ ├── estructura_proyecto.txt   # This file: project structure overview
+ └── README.md                 # Project documentation
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ openvino>=2024.4.0
+ huggingface_hub
+ torch>=2.1
+ gradio>=4.19
+ librosa==0.9.2
+ opencv-contrib-python
+ opencv-python
+ IPython
+ tqdm
+ numba
+ numpy
+ requests
+
+ openai-whisper
+ sounddevice
+ scipy
+
+ transformers>=4.35
+ torchvision>=0.18.1
+ onnx>=1.16.1
+ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+ openvino-tokenizers
+ openvino-genai
+ datasets
+ soundfile>=0.12
+ python-ffmpeg<=1.0.16
+ nncf>=2.13.0
+ jiwer
+
+ gtts
+
+ # Dependencies of src/call_openai_api.py
+ python-dotenv
+ langchain
+ openai
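Note: a quick way to confirm that the environment built from this list resolves the core packages is a small import check; the helper below is hypothetical and not part of the repository.

    # check_env.py (hypothetical): verify that the main dependencies import inside env/
    import importlib

    for package in ["openvino", "torch", "gradio", "librosa", "cv2", "whisper", "sounddevice"]:
        try:
            module = importlib.import_module(package)
            print(f"{package}: OK ({getattr(module, '__version__', 'unknown version')})")
        except ImportError as exc:
            print(f"{package}: MISSING ({exc})")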
results/OpenAI_response.txt ADDED
@@ -0,0 +1,5 @@
+ Hola, prueba en marcha,
+ María con IA se realza,
+ Nuevo modelo se lanza,
+ Incorporación, esperanza,
+ Ser mejor, nuestra balanza.
results/transcripcion.txt ADDED
@@ -0,0 +1 @@
+ Hola, esta es una prueba para ver si podemos incorporar este modelo a María, María RB.
setup.py ADDED
@@ -0,0 +1,287 @@
+ # 2024/03/11 setup.py
+
+ import os
+ import subprocess
+ import sys
+ import requests
+
+ from pathlib import Path
+
+ # Project folder layout
+ PROJECT_DIRECTORIES = [
+     "assets",
+     "assets/audio",
+     "assets/video",
+     "checkpoints",
+     "models",
+     "src",
+     "src/utils",
+     "tests",
+     "results"
+ ]
+
+ # URLs of the OpenVINO Notebooks utilities
+ OPENVINO_UTILS = {
+     "notebook_utils.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
+     "pip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py"
+ }
+
+ # URLs of the Wav2Lip helper files
+ WAV2LIP_HELPERS = {
+     "gradio_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/gradio_helper.py",
+     "ov_inference.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_inference.py",
+     "ov_wav2lip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_wav2lip_helper.py"
+ }
+
+ WAV2LIP_HELPERS_DIR = Path("src")
+ OPENVINO_UTILS_DIR = Path("src/utils")
+
+ # URLs of the example input files
+ EXAMPLE_FILES = {
+     "audio_example": {
+         "filename": "data_audio_sun_5s.wav",
+         "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_audio_sun_5s.wav?raw=true",
+         "folder": "assets/audio"
+     },
+     "video_example": {
+         "filename": "data_video_sun_5s.mp4",
+         "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_video_sun_5s.mp4?raw=true",
+         "folder": "assets/video"
+     }
+ }
+
+ # Creates the overall project structure
+ def create_project_structure():
+     """
+     Creates the project folder structure.
+     """
+     for directory in PROJECT_DIRECTORIES:
+         path = Path(directory)
+         if not path.exists():
+             path.mkdir(parents=True, exist_ok=True)
+             print(f"Carpeta '{directory}' creada.")
+         else:
+             print(f"Carpeta '{directory}' ya existe.")
+
+ # Creates the virtual environment
+ def create_virtual_environment():
+     """
+     Creates the virtual environment if it does not exist.
+     """
+     env_path = Path("env")
+     if not env_path.exists():
+         print("Creando el entorno virtual...")
+         subprocess.check_call([sys.executable, "-m", "venv", "env"])
+         print(f"Entorno virtual creado en '{env_path}'.")
+     else:
+         print(f"El entorno virtual '{env_path}' ya existe.")
+
+ # Resolves the environment's python and pip executables
+ def activate_virtual_environment():
+     """
+     "Activates" the virtual environment by returning the paths of its python and pip.
+     """
+     if os.name == 'nt':  # Windows
+         python_path = str(Path("env") / "Scripts" / "python.exe")
+         pip_path = str(Path("env") / "Scripts" / "pip.exe")
+     else:  # Unix/macOS
+         python_path = str(Path("env") / "bin" / "python")
+         pip_path = str(Path("env") / "bin" / "pip")
+
+     # Upgrade pip to the latest version inside the virtual environment using python -m pip
+     try:
+         subprocess.check_call([python_path, "-m", "pip", "install", "--upgrade", "pip"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+         print("pip actualizado a la última versión.")
+     except subprocess.CalledProcessError:
+         print("Error al actualizar pip.")
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     return python_path, pip_path
+
+ # Installs the dependencies from requirements.txt with a progress bar
+ def install_requirements(pip_path):
+     """
+     Installs the requirements.txt dependencies with a progress bar.
+     """
+     print("Instalando dependencias...")
+     # Install tqdm in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     # Read requirements.txt and show a progress bar
+     requirements_path = Path("requirements.txt")
+     if not requirements_path.exists():
+         print("Archivo requirements.txt no encontrado.")
+         return
+
+     with open(requirements_path, "r") as f:
+         dependencies = f.read().splitlines()
+
+     # Install each dependency, updating the progress bar
+     for dependency in tqdm(dependencies, desc="Instalando dependencias", unit="paquete"):
+         try:
+             subprocess.check_call([pip_path, "install", dependency], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+         except subprocess.CalledProcessError:
+             print(f"\nError al instalar {dependency}.")
+
+     print("Todas las dependencias fueron instaladas correctamente.")
+
+ # Downloads the OpenVINO Notebooks utility files
+ def download_openvino_utils(pip_path):
+     """
+     Downloads the OpenVINO Notebooks utilities into src/utils if they do not exist.
+     """
+     # Create the utils folder if it does not exist
+     OPENVINO_UTILS_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     # Install tqdm in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     for filename, url in tqdm(OPENVINO_UTILS.items(), desc="Descargando utilidades de OpenVINO", unit="archivo"):
+         file_path = OPENVINO_UTILS_DIR / filename
+         if not file_path.exists():
+             response = requests.get(url)
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+             else:
+                 print(f"Error al descargar {filename} desde {url}")
+
+ # Downloads the Wav2Lip-specific helper files
+ def download_wav2lip_helpers(pip_path):
+     """
+     Downloads the Wav2Lip-specific helper files if they do not exist.
+     """
+     WAV2LIP_HELPERS_DIR.mkdir(parents=True, exist_ok=True)  # Creates `src` if it does not exist
+
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+     for filename, url in tqdm(WAV2LIP_HELPERS.items(), desc="Descargando ayudas de Wav2Lip", unit="archivo"):
+         file_path = WAV2LIP_HELPERS_DIR / filename
+         if not file_path.exists():
+             response = requests.get(url)
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+
+ # Downloads the example input files (audio and video)
+ def download_example_files(pip_path):
+     """
+     Downloads the example input files (audio and video) into their corresponding folders.
+     """
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     for example_name, example_info in tqdm(EXAMPLE_FILES.items(), desc="Descargando archivos de ejemplo", unit="archivo"):
+         folder_path = Path(example_info["folder"])
+         file_path = folder_path / example_info["filename"]
+
+         # Create the folder if it does not exist
+         folder_path.mkdir(parents=True, exist_ok=True)
+
+         # Download the file if it does not exist
+         if not file_path.exists():
+             response = requests.get(example_info["url"])
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+
+ # Clones the official Wav2Lip repository
+ def clone_wav2lip_repo(pip_path):
+     """
+     Clones the official Wav2Lip repository, hiding git's output behind a tqdm bar.
+     """
+     repo_url = "https://github.com/Rudrabha/Wav2Lip"
+     clone_path = "src/Wav2Lip"
+
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     # Check whether the repository already exists to avoid cloning it again
+     if os.path.exists(clone_path):
+         print(f"El repositorio '{clone_path}' ya existe.")
+         return
+
+     # Start the clone, wrapping it in a tqdm bar so git's own output stays hidden
+     print("Clonando el repositorio de Wav2Lip...")
+     with tqdm(total=100, desc="Clonación en progreso", ncols=100, bar_format="{l_bar}{bar}") as pbar:
+         # Run the clone command
+         exit_code = subprocess.call(["git", "clone", repo_url, clone_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+         if exit_code != 0:
+             raise Exception("Error: La clonación del repositorio ha fallado.")
+         else:
+             pbar.update(100)
+             print("Repositorio clonado exitosamente en 'Wav2Lip'.")
+
+
+ if __name__ == "__main__":
+     create_project_structure()
+     create_virtual_environment()
+     python_path, pip_path = activate_virtual_environment()
+
+     download_openvino_utils(pip_path)
+     download_wav2lip_helpers(pip_path)
+     download_example_files(pip_path)
+     install_requirements(pip_path)
+     clone_wav2lip_repo(pip_path)
+
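The script is intended to be run once from the project root with the system interpreter (`python setup.py`); the same sequence can also be driven from another script. A minimal sketch, assuming the function signatures above:

    # Hypothetical programmatic use of setup.py, mirroring its __main__ block
    from setup import (
        create_project_structure,
        create_virtual_environment,
        activate_virtual_environment,
        download_openvino_utils,
        download_wav2lip_helpers,
        download_example_files,
        install_requirements,
        clone_wav2lip_repo,
    )

    create_project_structure()
    create_virtual_environment()
    python_path, pip_path = activate_virtual_environment()
    download_openvino_utils(pip_path)
    download_wav2lip_helpers(pip_path)
    download_example_files(pip_path)
    install_requirements(pip_path)
    clone_wav2lip_repo(pip_path)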
src/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
src/audio_recorder.py ADDED
@@ -0,0 +1,48 @@
+ # audio_recorder.py
+
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import os
+
+ # Path used to store the recording inside the `assets/audio/` directory
+ AUDIO_PATH = os.path.join("..", "assets", "audio", "grabacion_8s.wav")
+
+ def listar_dispositivos():
+     """
+     Lists every audio device available on the system.
+     """
+     print("Dispositivos de audio disponibles:")
+     dispositivos = sd.query_devices()
+     for idx, dispositivo in enumerate(dispositivos):
+         print(f"{idx}: {dispositivo['name']} - {'Entrada' if dispositivo['max_input_channels'] > 0 else 'Salida'}")
+     print("\nSelecciona el índice del dispositivo de entrada que prefieras para grabar audio.")
+
+ def record_audio(duration=8, sample_rate=44100, device_index=None):
+     """
+     Records audio from the microphone for a given duration and saves it as a WAV file.
+
+     Args:
+         duration (int): Recording length in seconds.
+         sample_rate (int): Audio sample rate.
+         device_index (int): Index of the audio device to use.
+     """
+     print("Grabando...")
+
+     # Start a single-channel recording
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, device=device_index)
+     sd.wait()  # Wait until the recording finishes
+
+     # Save the audio file
+     write(AUDIO_PATH, sample_rate, audio_data)
+     print(f"Grabación completada. Archivo guardado en: {AUDIO_PATH}")
+
+ if __name__ == "__main__":
+     # Step 1: list the available audio devices
+     listar_dispositivos()
+
+     # Wait for the user to choose a device index
+     device_index = int(input("Introduce el índice del dispositivo de entrada que deseas utilizar: "))
+
+     # Step 2: record audio with the selected device
+     record_audio(device_index=device_index)
+
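For non-interactive use (for example from another script in `src/`), the recorder can be driven directly; the device index below is an assumption, so use whichever index `listar_dispositivos()` reports for your input device.

    # Hypothetical usage of audio_recorder.py from another script in src/
    from audio_recorder import listar_dispositivos, record_audio

    listar_dispositivos()                 # print the available devices
    record_audio(duration=5,              # seconds
                 sample_rate=16000,       # the Wav2Lip pipeline loads audio at 16 kHz anyway
                 device_index=1)          # assumed index; pick one with input channels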
src/call_openai_api.py ADDED
@@ -0,0 +1,80 @@
+ import os
+
+ from dotenv import load_dotenv
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from pathlib import Path
+
+ # Load environment variables from the .env file
+ # Relative path to the .env file stored in models/
+ project_root = Path(__file__).resolve().parent.parent  # Go up to the project root
+ env_path = project_root / "models" / ".env"  # Full path to the .env file
+ load_dotenv(dotenv_path=env_path)
+
+ # API key configuration
+ api_key = os.getenv("OPENAI_API_KEY")
+ if not api_key:
+     raise ValueError("No se encontro la clave de API")
+
+ OPENAI_KEY_VAL = api_key
+
+ llm = ChatOpenAI(
+     openai_api_key=OPENAI_KEY_VAL,
+     temperature=0.7,
+     model="gpt-4"
+ )
+
+ # Prompt template filled with the text read from the file
+ template = """
+ Eres un asistente de IA que orienta a los alumnos a ser mejores personas. Haz una haiku de 5 lineas sobre lo que te estan comentando. Da siempre la respuesta en Español
+ Texto:{texto}
+ Respuesta:
+ """
+ prompt = PromptTemplate(
+     input_variables=["texto"],
+     template=template
+ )
+
+ chain = LLMChain(
+     llm=llm,
+     prompt=prompt
+ )
+
+ #def save_summary_to_file(summary_text, filename='response.txt'):
+ def save_summary_to_file(summary_text, filename='C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt'):
+     try:
+         with open(filename, 'w', encoding='utf-8') as file:
+             file.write(summary_text)
+         print(f"El resumen se ha guardado exitosamente en {filename}")
+     except Exception as e:
+         print(f"Ocurrio un error al guardar el resumen {e}")
+
+ def read_text_from_file(filename):
+     try:
+         with open(filename, 'r') as file:
+             return file.read()
+     except Exception as e:
+         print(f"Error al leer el archivo {filename}: {e}")
+         return ""
+
+
+ #def main():
+ def moni(archivo):
+     #texto_usuario = input("Ingresa un texto para resumir:")
+     #texto_usuario = read_text_from_file("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+     texto_usuario = read_text_from_file(archivo)
+     resultado = chain.run(texto=texto_usuario)
+
+     # Print the generated summary
+     print("\nResumen generado:")
+     print(resultado)
+     save_summary_to_file(resultado)
+
+     return resultado
+
+
+ if __name__ == "__main__":
+     # `moni` needs the path of the text file to summarize; use the saved transcription
+     moni("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+
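The module expects an `OPENAI_API_KEY` entry in `models/.env` and builds the chain at import time. Below is a minimal sketch of that file and of calling `moni` on the saved transcription; the relative path is an assumption, since the committed code uses absolute Windows paths.

    # models/.env (one line, no quotes):
    # OPENAI_API_KEY=sk-...

    # Hypothetical call with a relative path to the transcription file
    from call_openai_api import moni

    respuesta = moni("results/transcripcion.txt")
    print(respuesta)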
src/convert_models.py ADDED
@@ -0,0 +1,16 @@
+ import sys
+ from pathlib import Path
+
+ # Add `src` to `sys.path` so Python can find the `utils` module
+ sys.path.append(str(Path(__file__).resolve().parent))
+
+ # Import the helper from utils/notebook_utils.py and the Wav2Lip conversion helper
+ from utils.notebook_utils import download_file
+ from ov_wav2lip_helper import download_and_convert_models
+
+
+ OV_FACE_DETECTION_MODEL_PATH = Path("../miwav2lipv6/models/face_detection.xml")
+ OV_WAV2LIP_MODEL_PATH = Path("../miwav2lipv6/models/wav2lip.xml")
+
+
+ download_and_convert_models(OV_FACE_DETECTION_MODEL_PATH, OV_WAV2LIP_MODEL_PATH)
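After the conversion has run, a small check like the one below can confirm that the IR files are readable and list their inputs; the paths assume the `models/` folder described in `estructura_proyecto.txt`.

    # Hypothetical verification of the converted OpenVINO IR models
    from pathlib import Path
    import openvino as ov

    core = ov.Core()
    for xml_path in [Path("models/face_detection.xml"), Path("models/wav2lip.xml")]:
        model = core.read_model(xml_path)
        print(xml_path.name, "->", [inp.get_any_name() for inp in model.inputs])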
src/gradio_helper.py ADDED
@@ -0,0 +1,26 @@
+ from typing import Callable
+ import gradio as gr
+ import numpy as np
+
+
+ examples = [
+     [
+         #"data_video_sun_5s.mp4",
+         "data_video_sun.mp4",
+         "data_audio_sun_5s.wav",
+     ],
+ ]
+
+
+ def make_demo(fn: Callable):
+     demo = gr.Interface(
+         fn=fn,
+         inputs=[
+             gr.Video(label="Face video"),
+             gr.Audio(label="Audio", type="filepath"),
+         ],
+         outputs="video",
+         examples=examples,
+         allow_flagging="never",
+     )
+     return demo
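`make_demo` only wraps a callable that receives a video path and an audio path; below is a minimal sketch of wiring it to the OpenVINO inference entry point, with model paths assumed to match the structure above (working directory at the project root, `src/` on `sys.path`, Wav2Lip repo cloned).

    # Hypothetical wiring of the Gradio demo to ov_inference
    from functools import partial

    from gradio_helper import make_demo
    from ov_inference import ov_inference

    fn = partial(
        ov_inference,
        face_detection_path="models/face_detection.xml",
        wav2lip_path="models/wav2lip.xml",
        inference_device="CPU",
    )
    demo = make_demo(fn)   # gr.Interface passes (video, audio) as face_path, audio_path
    demo.launch()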
src/interface.py ADDED
@@ -0,0 +1,60 @@
+ # interface.py
+
+ import gradio as gr
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import tempfile
+ import shutil
+ import os
+
+ # Absolute video and audio paths to avoid access errors
+ AUDIO_COPY_PATH = os.path.abspath(os.path.join("..", "miwav2lipv6", "assets", "audio", "grabacion_gradio.wav"))
+ #VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
+ VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")
+
+ # Check that the video exists
+ if not os.path.exists(VIDEO_PATH):
+     print(f"Advertencia: El archivo de video no se encontró en la ruta {VIDEO_PATH}")
+
+ # Audio recording function
+ def grabar_audio(duration=8, sample_rate=44100):
+     print("Grabando...")
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
+     sd.wait()  # Wait until the recording finishes
+
+     # Save a temporary audio file
+     temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+     write(temp_audio.name, sample_rate, audio_data)
+     print("Grabación completada. Archivo temporal guardado en:", temp_audio.name)
+
+     # Create `assets/audio` if it does not exist
+     os.makedirs(os.path.dirname(AUDIO_COPY_PATH), exist_ok=True)
+
+     # Copy the recording into `assets/audio`
+     shutil.copy(temp_audio.name, AUDIO_COPY_PATH)
+     print(f"Copia de la grabación guardada en: {AUDIO_COPY_PATH}")
+
+     return AUDIO_COPY_PATH
+
+ # Main function that builds the Gradio interface
+ def interfaz():
+     with gr.Blocks() as demo:
+         gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
+
+         # Recording button
+         with gr.Row():
+             grabar_button = gr.Button("Iniciar Grabación")
+
+         # Show the recorded audio on the right
+         output_audio = gr.Audio(label="Grabación de Audio", type="filepath")
+
+         # Wire the recording function to the button
+         grabar_button.click(grabar_audio, outputs=output_audio)
+
+     return demo
+
+ # Launch the interface with the absolute audio path added to allowed_paths
+ if __name__ == "__main__":
+     demo = interfaz()
+     demo.launch(allowed_paths=[os.path.dirname(AUDIO_COPY_PATH)])
+
src/interfaceV2.py ADDED
@@ -0,0 +1,183 @@
+ # interfaceV2.py
+
+ import gradio as gr
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import tempfile
+ import shutil
+ import os
+ import subprocess
+ import sys
+ from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
+ from call_openai_api import moni as rtff  # call_openai_api.py must be in the same directory
+
+
+ # Paths to files (adjusted as per your specified structure)
+ AUDIO_RECORD_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/grabacion_gradio.wav")
+ #VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun_5s.mp4")
+ VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun.mp4")
+ TRANSCRIPTION_TEXT_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+ RESULT_AUDIO_TEMP_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/audiov2.wav")
+ RESULT_AUDIO_FINAL_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav")
+ RESULT_VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/result_voice.mp4")
+ TEXT_TO_SPEECH_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/text_to_speech.py")
+
+ # Function to record 8-second audio
+ def grabar_audio(duration=8, sample_rate=44100):
+     print("Starting recording...")
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
+     print(f"Recording in progress for {duration} seconds...")
+     sd.wait()
+     print("Recording completed.")
+
+     temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+     write(temp_audio.name, sample_rate, audio_data)
+     print("Audio temporarily saved at:", temp_audio.name)
+     temp_audio.close()  # Close the handle before copying the file
+     os.makedirs(os.path.dirname(AUDIO_RECORD_PATH), exist_ok=True)
+     shutil.copy(temp_audio.name, AUDIO_RECORD_PATH)
+     print(f"Recording copied to: {AUDIO_RECORD_PATH}")
+
+     return AUDIO_RECORD_PATH, "Recording completed."
+
+ # Function to transcribe audio with Whisper
+ def transcribir_con_progreso(audio_path):
+     progreso = gr.Progress()
+     progreso(0, "Starting transcription...")
+     model_name = "openai/whisper-large"
+     progreso(25, "Loading Whisper model...")
+
+     transcripcion = transcribe_audio(audio_path, model_name)
+     progreso(75, "Saving transcription...")
+     guardar_transcripcion(transcripcion, filename=TRANSCRIPTION_TEXT_PATH)
+     progreso(100, "Transcription completed.")
+     if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
+         raise FileNotFoundError(f"El archivo {TRANSCRIPTION_TEXT_PATH} no se generó.")
+
+     return transcripcion
+
+ # Function to convert text to audio using text_to_speech.py
+ def generar_audio_desde_texto():
+     print("Generating audio from text...")
+     result = subprocess.run(
+         [sys.executable, TEXT_TO_SPEECH_PATH],
+         capture_output=True,
+         text=True
+     )
+     if result.returncode != 0:
+         raise RuntimeError(f"Error ejecutando text_to_speech.py: {result.stderr}")
+     if result.stdout:
+         print("Output:", result.stdout)
+     if result.stderr:
+         print("Errors:", result.stderr)
+
+     if os.path.exists(RESULT_AUDIO_TEMP_PATH):
+         print(f"Temporary audio generated at: {RESULT_AUDIO_TEMP_PATH}")
+
+         os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
+         shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
+         print(f"Final audio copied to: {RESULT_AUDIO_FINAL_PATH}")
+
+         return RESULT_AUDIO_FINAL_PATH
+     else:
+         print(f"Error: Audio file was not generated in {RESULT_AUDIO_FINAL_PATH}")
+         return None
+
+ # Function to process video and audio using run_inference.py with the generated audio file
+ def procesar_video_audio():
+     print("Starting video and audio processing...")
+     run_inference_path = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/run_inference.py")
+
+     result = subprocess.run(
+         [sys.executable, run_inference_path, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
+         capture_output=True,
+         text=True
+     )
+
+     if result.stdout:
+         print("Output:", result.stdout)
+     if result.stderr:
+         print("Errors:", result.stderr)
+
+     if os.path.exists(RESULT_VIDEO_PATH):
+         print(f"Processed video saved at: {RESULT_VIDEO_PATH}")
+         return RESULT_VIDEO_PATH
+     else:
+         print("Error: Video file was not generated at 'results/result_voice.mp4'")
+         return None
+
+ # Gradio interface configuration
+ def interfaz():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             with gr.Column():
+                 gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
+                 grabar_button = gr.Button("Comenzando la grabacion de audio")
+                 estado_grabacion = gr.Textbox(label="Recording Status", interactive=False)
+
+             with gr.Column():
+                 output_audio = gr.Audio(AUDIO_RECORD_PATH, label="Audio Grabado", interactive=False)
+                 output_audio_speech = gr.Audio(RESULT_AUDIO_FINAL_PATH, label="Audio TTS", interactive=False)
+                 video_resultado = gr.Video(RESULT_VIDEO_PATH, label="Video procesado", interactive=False)
+                 texto_transcripcion = gr.Textbox(label="Texto transcrito")
+                 progreso_transcripcion = gr.Textbox(label="Transcription Status", interactive=False)
+
+         # Full flow: recording, transcription, text-to-speech, and video processing
+         """
+         def flujo_completo():
+             _, mensaje_grabacion = grabar_audio()
+             transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
+             audio_generado = generar_audio_desde_texto()
+             video_path = procesar_video_audio()
+
+             # Ensure function always returns 5 outputs for Gradio, even in error cases
+             if video_path and audio_generado:
+                 return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado, video_path
+             else:
+                 return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado or "Audio generation failed", video_path or "Video generation failed"
+         """
+         def flujo_completo():
+             try:
+                 print("Inicio del flujo completo...")
+                 # Record the audio
+                 audio_path, mensaje_grabacion = grabar_audio()
+                 print("Audio grabado en:", audio_path)
+                 # Transcribe the audio
+                 transcripcion = transcribir_con_progreso(audio_path)
+                 print("Transcripción completada:", transcripcion)
+
+                 #respuesta_openai = rtff(transcripcion)
+                 respuesta_openai = rtff(TRANSCRIPTION_TEXT_PATH)
+                 print("Respuesta generada por OpenAI")
+
+                 # Generate audio from the text
+                 audio_generado = generar_audio_desde_texto()
+                 print("Audio generado:", audio_generado)
+                 # Process video and audio
+                 video_path = procesar_video_audio()
+                 print("Video procesado en:", video_path)
+                 # Return the results if everything succeeded
+                 return mensaje_grabacion, audio_path, transcripcion, audio_generado, video_path
+
+             except Exception as e:
+                 # Print the error to the terminal and return error messages to the interface
+                 print("Error detectado en flujo completo:", str(e))
+                 return (
+                     "Error durante el flujo completo",
+                     None,  # Recorded audio
+                     f"Error: {str(e)}",  # Transcription
+                     None,  # Generated audio
+                     None  # Processed video
+                 )
+
+         grabar_button.click(
+             flujo_completo,
+             outputs=[estado_grabacion, output_audio, texto_transcripcion, output_audio_speech, video_resultado]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = interfaz()
+     demo.launch(allowed_paths=["C:/programacionEjercicios/miwav2lipv6/assets", "C:/programacionEjercicios/miwav2lipv6/results"])
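`interfaceV2.py` imports `whisper_audio_transcriber`, which is not part of this commit. From the call sites above it must expose `transcribe_audio(audio_path, model_name)` returning the text and `guardar_transcripcion(texto, filename=...)`; the stub below is a hedged stand-in consistent with those calls, not the author's missing file.

    # whisper_audio_transcriber.py (hypothetical stand-in)
    from transformers import pipeline

    def transcribe_audio(audio_path, model_name="openai/whisper-large"):
        # Build an ASR pipeline and transcribe the WAV file
        asr = pipeline("automatic-speech-recognition", model=model_name)
        return asr(audio_path)["text"]

    def guardar_transcripcion(texto, filename="results/transcripcion.txt"):
        # Persist the transcription where the interface expects it
        with open(filename, "w", encoding="utf-8") as f:
            f.write(texto)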
src/ov_inference.py ADDED
@@ -0,0 +1,637 @@
1
+ import glob
+ import logging
2
+ from enum import Enum
3
+ import math
4
+ import subprocess
5
+
6
+ import cv2
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ from Wav2Lip import audio
13
+ import openvino as ov
14
+
15
+
16
+ device = "cpu"
17
+
18
+
19
+ def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
20
+ xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
21
+ dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
22
+ dw, dh = math.log(ww / aww), math.log(hh / ahh)
23
+ return dx, dy, dw, dh
24
+
25
+
26
+ def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
27
+ xc, yc = dx * aww + axc, dy * ahh + ayc
28
+ ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
29
+ x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
30
+ return x1, y1, x2, y2
31
+
32
+
33
+ def nms(dets, thresh):
34
+ if 0 == len(dets):
35
+ return []
36
+ x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
37
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
38
+ order = scores.argsort()[::-1]
39
+
40
+ keep = []
41
+ while order.size > 0:
42
+ i = order[0]
43
+ keep.append(i)
44
+ xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
45
+ xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
46
+
47
+ w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
48
+ ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
49
+
50
+ inds = np.where(ovr <= thresh)[0]
51
+ order = order[inds + 1]
52
+
53
+ return keep
54
+
55
+
56
+ def encode(matched, priors, variances):
57
+ """Encode the variances from the priorbox layers into the ground truth boxes
58
+ we have matched (based on jaccard overlap) with the prior boxes.
59
+ Args:
60
+ matched: (tensor) Coords of ground truth for each prior in point-form
61
+ Shape: [num_priors, 4].
62
+ priors: (tensor) Prior boxes in center-offset form
63
+ Shape: [num_priors,4].
64
+ variances: (list[float]) Variances of priorboxes
65
+ Return:
66
+ encoded boxes (tensor), Shape: [num_priors, 4]
67
+ """
68
+
69
+ # dist b/t match center and prior's center
70
+ g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
71
+ # encode variance
72
+ g_cxcy /= variances[0] * priors[:, 2:]
73
+ # match wh / prior wh
74
+ g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
75
+ g_wh = torch.log(g_wh) / variances[1]
76
+ # return target for smooth_l1_loss
77
+ return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
78
+
79
+
80
+ def decode(loc, priors, variances):
81
+ """Decode locations from predictions using priors to undo
82
+ the encoding we did for offset regression at train time.
83
+ Args:
84
+ loc (tensor): location predictions for loc layers,
85
+ Shape: [num_priors,4]
86
+ priors (tensor): Prior boxes in center-offset form.
87
+ Shape: [num_priors,4].
88
+ variances: (list[float]) Variances of priorboxes
89
+ Return:
90
+ decoded bounding box predictions
91
+ """
92
+
93
+ boxes = torch.cat((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
94
+ boxes[:, :2] -= boxes[:, 2:] / 2
95
+ boxes[:, 2:] += boxes[:, :2]
96
+ return boxes
97
+
98
+
99
+ def batch_decode(loc, priors, variances):
100
+ """Decode locations from predictions using priors to undo
101
+ the encoding we did for offset regression at train time.
102
+ Args:
103
+ loc (tensor): location predictions for loc layers,
104
+ Shape: [num_priors,4]
105
+ priors (tensor): Prior boxes in center-offset form.
106
+ Shape: [num_priors,4].
107
+ variances: (list[float]) Variances of priorboxes
108
+ Return:
109
+ decoded bounding box predictions
110
+ """
111
+
112
+ boxes = torch.cat((priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:], priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
113
+ boxes[:, :, :2] -= boxes[:, :, 2:] / 2
114
+ boxes[:, :, 2:] += boxes[:, :, :2]
115
+ return boxes
116
+
117
+
118
+ def get_smoothened_boxes(boxes, T):
119
+ for i in range(len(boxes)):
120
+ if i + T > len(boxes):
121
+ window = boxes[len(boxes) - T :]
122
+ else:
123
+ window = boxes[i : i + T]
124
+ boxes[i] = np.mean(window, axis=0)
125
+ return boxes
126
+
127
+
128
+ def detect(net, img, device):
129
+ img = img - np.array([104, 117, 123])
130
+ img = img.transpose(2, 0, 1)
131
+ img = img.reshape((1,) + img.shape)
132
+
133
+ img = torch.from_numpy(img).float().to(device)
134
+ BB, CC, HH, WW = img.size()
135
+
136
+ results = net({"x": img})
137
+ olist = [torch.Tensor(results[i]) for i in range(12)]
138
+
139
+ bboxlist = []
140
+ for i in range(len(olist) // 2):
141
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
142
+ olist = [oelem.data.cpu() for oelem in olist]
143
+ for i in range(len(olist) // 2):
144
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
145
+ FB, FC, FH, FW = ocls.size() # feature map size
146
+ stride = 2 ** (i + 2) # 4,8,16,32,64,128
147
+ anchor = stride * 4
148
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
149
+ for Iindex, hindex, windex in poss:
150
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
151
+ score = ocls[0, 1, hindex, windex]
152
+ loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
153
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
154
+ variances = [0.1, 0.2]
155
+ box = decode(loc, priors, variances)
156
+ x1, y1, x2, y2 = box[0] * 1.0
157
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
158
+ bboxlist.append([x1, y1, x2, y2, score])
159
+ bboxlist = np.array(bboxlist)
160
+ if 0 == len(bboxlist):
161
+ bboxlist = np.zeros((1, 5))
162
+
163
+ return bboxlist
164
+
165
+
166
+ def batch_detect(net, imgs, device):
167
+ imgs = imgs - np.array([104, 117, 123])
168
+ imgs = imgs.transpose(0, 3, 1, 2)
169
+
170
+ imgs = torch.from_numpy(imgs).float().to(device)
171
+ BB, CC, HH, WW = imgs.size()
172
+
173
+ results = net({"x": imgs.numpy()})
174
+ olist = [torch.Tensor(results[i]) for i in range(12)]
175
+
176
+ bboxlist = []
177
+ for i in range(len(olist) // 2):
178
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
179
+ # olist[i * 2] = (olist[i * 2], dim=1)
180
+ olist = [oelem.data.cpu() for oelem in olist]
181
+ for i in range(len(olist) // 2):
182
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
183
+ FB, FC, FH, FW = ocls.size() # feature map size
184
+ stride = 2 ** (i + 2) # 4,8,16,32,64,128
185
+ anchor = stride * 4
186
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
187
+ for Iindex, hindex, windex in poss:
188
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
189
+ score = ocls[:, 1, hindex, windex]
190
+ loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
191
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
192
+ variances = [0.1, 0.2]
193
+ box = batch_decode(loc, priors, variances)
194
+ box = box[:, 0] * 1.0
195
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
196
+ bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
197
+ bboxlist = np.array(bboxlist)
198
+ if 0 == len(bboxlist):
199
+ bboxlist = np.zeros((1, BB, 5))
200
+
201
+ return bboxlist
202
+
203
+
204
+ def flip_detect(net, img, device):
205
+ img = cv2.flip(img, 1)
206
+ b = detect(net, img, device)
207
+
208
+ bboxlist = np.zeros(b.shape)
209
+ bboxlist[:, 0] = img.shape[1] - b[:, 2]
210
+ bboxlist[:, 1] = b[:, 1]
211
+ bboxlist[:, 2] = img.shape[1] - b[:, 0]
212
+ bboxlist[:, 3] = b[:, 3]
213
+ bboxlist[:, 4] = b[:, 4]
214
+ return bboxlist
215
+
216
+
217
+ def pts_to_bb(pts):
218
+ min_x, min_y = np.min(pts, axis=0)
219
+ max_x, max_y = np.max(pts, axis=0)
220
+ return np.array([min_x, min_y, max_x, max_y])
221
+
222
+
223
+ class OVFaceDetector(object):
224
+ """An abstract class representing a face detector.
225
+
226
+ Any other face detection implementation must subclass it. All subclasses
227
+ must implement ``detect_from_image``, that return a list of detected
228
+ bounding boxes. Optionally, for speed considerations detect from path is
229
+ recommended.
230
+ """
231
+
232
+ def __init__(self, device, verbose):
233
+ self.device = device
234
+ self.verbose = verbose
235
+
236
+ def detect_from_image(self, tensor_or_path):
237
+ """Detects faces in a given image.
238
+
239
+ This function detects the faces present in a provided BGR(usually)
240
+ image. The input can be either the image itself or the path to it.
241
+
242
+ Arguments:
243
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
244
+ to an image or the image itself.
245
+
246
+ Example::
247
+
248
+ >>> path_to_image = 'data/image_01.jpg'
249
+ ... detected_faces = detect_from_image(path_to_image)
250
+ [A list of bounding boxes (x1, y1, x2, y2)]
251
+ >>> image = cv2.imread(path_to_image)
252
+ ... detected_faces = detect_from_image(image)
253
+ [A list of bounding boxes (x1, y1, x2, y2)]
254
+
255
+ """
256
+ raise NotImplementedError
257
+
258
+ def detect_from_directory(self, path, extensions=[".jpg", ".png"], recursive=False, show_progress_bar=True):
259
+ """Detects faces from all the images present in a given directory.
260
+
261
+ Arguments:
262
+ path {string} -- a string containing a path that points to the folder containing the images
263
+
264
+ Keyword Arguments:
265
+ extensions {list} -- list of string containing the extensions to be
266
+ consider in the following format: ``.extension_name`` (default:
267
+ {['.jpg', '.png']}) recursive {bool} -- option wherever to scan the
268
+ folder recursively (default: {False}) show_progress_bar {bool} --
269
+ display a progressbar (default: {True})
270
+
271
+ Example:
272
+ >>> directory = 'data'
273
+ ... detected_faces = detect_from_directory(directory)
274
+ {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}
275
+
276
+ """
277
+ if self.verbose:
278
+ logger = logging.getLogger(__name__)
279
+
280
+ if len(extensions) == 0:
281
+ if self.verbose:
282
+ logger.error("Expected at list one extension, but none was received.")
283
+ raise ValueError
284
+
285
+ if self.verbose:
286
+ logger.info("Constructing the list of images.")
287
+ additional_pattern = "/**/*" if recursive else "/*"
288
+ files = []
289
+ for extension in extensions:
290
+ files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))
291
+
292
+ if self.verbose:
293
+ logger.info("Finished searching for images. %s images found", len(files))
294
+ logger.info("Preparing to run the detection.")
295
+
296
+ predictions = {}
297
+ for image_path in tqdm(files, disable=not show_progress_bar):
298
+ if self.verbose:
299
+ logger.info("Running the face detector on image: %s", image_path)
300
+ predictions[image_path] = self.detect_from_image(image_path)
301
+
302
+ if self.verbose:
303
+ logger.info("The detector was successfully run on all %s images", len(files))
304
+
305
+ return predictions
306
+
307
+ @property
308
+ def reference_scale(self):
309
+ raise NotImplementedError
310
+
311
+ @property
312
+ def reference_x_shift(self):
313
+ raise NotImplementedError
314
+
315
+ @property
316
+ def reference_y_shift(self):
317
+ raise NotImplementedError
318
+
319
+ @staticmethod
320
+ def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
321
+ """Convert path (represented as a string) or torch.tensor to a numpy.ndarray
322
+
323
+ Arguments:
324
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
325
+ """
326
+ if isinstance(tensor_or_path, str):
327
+ return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
328
+ elif torch.is_tensor(tensor_or_path):
329
+ # Call cpu in case its coming from cuda
330
+ return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
331
+ elif isinstance(tensor_or_path, np.ndarray):
332
+ return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
333
+ else:
334
+ raise TypeError
335
+
336
+
337
+ class OVSFDDetector(OVFaceDetector):
338
+ def __init__(self, device, path_to_detector="models/face_detection.xml", verbose=False):
339
+ super(OVSFDDetector, self).__init__(device, verbose)
340
+
341
+ core = ov.Core()
342
+ self.face_detector = core.compile_model(path_to_detector, self.device)
343
+
344
+ def detect_from_image(self, tensor_or_path):
345
+ image = self.tensor_or_path_to_ndarray(tensor_or_path)
346
+
347
+ bboxlist = detect(self.face_detector, image, device="cpu")
348
+ keep = nms(bboxlist, 0.3)
349
+ bboxlist = bboxlist[keep, :]
350
+ bboxlist = [x for x in bboxlist if x[-1] > 0.5]
351
+
352
+ return bboxlist
353
+
354
+ def detect_from_batch(self, images):
355
+ bboxlists = batch_detect(self.face_detector, images, device="cpu")
356
+ keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
357
+ bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
358
+ bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
359
+
360
+ return bboxlists
361
+
362
+ @property
363
+ def reference_scale(self):
364
+ return 195
365
+
366
+ @property
367
+ def reference_x_shift(self):
368
+ return 0
369
+
370
+ @property
371
+ def reference_y_shift(self):
372
+ return 0
373
+
374
+
375
+ class LandmarksType(Enum):
376
+ """Enum class defining the type of landmarks to detect.
377
+
378
+ ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
379
+ ``_2halfD`` - this points represent the projection of the 3D points into 3D
380
+ ``_3D`` - detect the points ``(x,y,z)``` in a 3D space
381
+
382
+ """
383
+
384
+ _2D = 1
385
+ _2halfD = 2
386
+ _3D = 3
387
+
388
+
389
+ class NetworkSize(Enum):
390
+ # TINY = 1
391
+ # SMALL = 2
392
+ # MEDIUM = 3
393
+ LARGE = 4
394
+
395
+ def __new__(cls, value):
396
+ member = object.__new__(cls)
397
+ member._value_ = value
398
+ return member
399
+
400
+ def __int__(self):
401
+ return self.value
402
+
403
+
404
+ class OVFaceAlignment:
405
+ def __init__(
406
+ self, landmarks_type, network_size=NetworkSize.LARGE, device="CPU", flip_input=False, verbose=False, path_to_detector="models/face_detection.xml"
407
+ ):
408
+ self.device = device
409
+ self.flip_input = flip_input
410
+ self.landmarks_type = landmarks_type
411
+ self.verbose = verbose
412
+
413
+ network_size = int(network_size)
414
+
415
+ self.face_detector = OVSFDDetector(device=device, path_to_detector=path_to_detector, verbose=verbose)
416
+
417
+ def get_detections_for_batch(self, images):
418
+ images = images[..., ::-1]
419
+ detected_faces = self.face_detector.detect_from_batch(images.copy())
420
+ results = []
421
+
422
+ for i, d in enumerate(detected_faces):
423
+ if len(d) == 0:
424
+ results.append(None)
425
+ continue
426
+ d = d[0]
427
+ d = np.clip(d, 0, None)
428
+
429
+ x1, y1, x2, y2 = map(int, d[:-1])
430
+ results.append((x1, y1, x2, y2))
431
+
432
+ return results
433
+
434
+
435
+ def face_detect_ov(images, device, face_det_batch_size, pads, nosmooth, path_to_detector):
436
+ detector = OVFaceAlignment(LandmarksType._2D, flip_input=False, device=device, path_to_detector=path_to_detector)
437
+
438
+ batch_size = face_det_batch_size
439
+
440
+ print("face_detect_ov images[0].shape: ", images[0].shape)
441
+ while 1:
442
+ predictions = []
443
+ try:
444
+ for i in tqdm(range(0, len(images), batch_size)):
445
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i : i + batch_size])))
446
+ except RuntimeError:
447
+ if batch_size == 1:
448
+ raise RuntimeError("Image too big to run face detection on GPU. Please use the --resize_factor argument")
449
+ batch_size //= 2
450
+ print("Recovering from OOM error; New batch size: {}".format(batch_size))
451
+ continue
452
+ break
453
+
454
+ results = []
455
+ pady1, pady2, padx1, padx2 = pads
456
+ for rect, image in zip(predictions, images):
457
+ if rect is None:
458
+ # check this frame where the face was not detected.
459
+ cv2.imwrite("temp/faulty_frame.jpg", image)
460
+ raise ValueError("Face not detected! Ensure the video contains a face in all the frames.")
461
+
462
+ y1 = max(0, rect[1] - pady1)
463
+ y2 = min(image.shape[0], rect[3] + pady2)
464
+ x1 = max(0, rect[0] - padx1)
465
+ x2 = min(image.shape[1], rect[2] + padx2)
466
+
467
+ results.append([x1, y1, x2, y2])
468
+
469
+ boxes = np.array(results)
470
+ if not nosmooth:
471
+ boxes = get_smoothened_boxes(boxes, T=5)
472
+ results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
473
+
474
+ del detector
475
+ return results
476
+
477
+
478
+ def datagen(frames, mels, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, path_to_detector):
479
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
480
+
481
+ if box[0] == -1:
482
+ if not static:
483
+ # BGR2RGB for CNN face detection
484
+ face_det_results = face_detect_ov(frames, "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
485
+ else:
486
+ face_det_results = face_detect_ov([frames[0]], "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
487
+ else:
488
+ print("Using the specified bounding box instead of face detection...")
489
+ y1, y2, x1, x2 = box
490
+ face_det_results = [[f[y1:y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
491
+
492
+ for i, m in enumerate(mels):
493
+ idx = 0 if static else i % len(frames)
494
+ frame_to_save = frames[idx].copy()
495
+ face, coords = face_det_results[idx].copy()
496
+
497
+ face = cv2.resize(face, (img_size, img_size))
498
+
499
+ img_batch.append(face)
500
+ mel_batch.append(m)
501
+ frame_batch.append(frame_to_save)
502
+ coords_batch.append(coords)
503
+
504
+ if len(img_batch) >= wav2lip_batch_size:
505
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
506
+
507
+ img_masked = img_batch.copy()
508
+ img_masked[:, img_size // 2 :] = 0
509
+
510
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
511
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
512
+
513
+ yield img_batch, mel_batch, frame_batch, coords_batch
514
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
515
+
516
+ if len(img_batch) > 0:
517
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
518
+
519
+ img_masked = img_batch.copy()
520
+ img_masked[:, img_size // 2 :] = 0
521
+
522
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
523
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
524
+
525
+ yield img_batch, mel_batch, frame_batch, coords_batch
526
+
527
+
528
+ def ov_inference(
529
+ face_path,
530
+ audio_path,
531
+ face_detection_path="models/face_detection.xml",
532
+ wav2lip_path="models/wav2lip.xml",
533
+ inference_device="CPU",
534
+ wav2lip_batch_size=128,
535
+ outfile="results/result_voice.mp4",
536
+ resize_factor=1,
537
+ rotate=False,
538
+ crop=[0, -1, 0, -1],
539
+ mel_step_size=16,
540
+ box=[-1, -1, -1, -1],
541
+ static=False,
542
+ img_size=96,
543
+ face_det_batch_size=16,
544
+ pads=[0, 10, 0, 0],
545
+ nosmooth=False,
546
+ ):
547
+ print("Reading video frames...")
548
+
549
+ video_stream = cv2.VideoCapture(face_path)
550
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
551
+
552
+ full_frames = []
553
+ while 1:
554
+ still_reading, frame = video_stream.read()
555
+ if not still_reading:
556
+ video_stream.release()
557
+ break
558
+ if resize_factor > 1:
559
+ frame = cv2.resize(frame, (frame.shape[1] // resize_factor, frame.shape[0] // resize_factor))
560
+
561
+ if rotate:
562
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
563
+
564
+ y1, y2, x1, x2 = crop
565
+ if x2 == -1:
566
+ x2 = frame.shape[1]
567
+ if y2 == -1:
568
+ y2 = frame.shape[0]
569
+
570
+ frame = frame[y1:y2, x1:x2]
571
+
572
+ full_frames.append(frame)
573
+
574
+ print("Number of frames available for inference: " + str(len(full_frames)))
575
+
576
+ core = ov.Core()
577
+
578
+ if not audio_path.endswith(".wav"):
579
+ print("Extracting raw audio...")
580
+ command = "ffmpeg -y -i {} -strict -2 {}".format(audio_path, "temp/temp.wav")
581
+
582
+ subprocess.call(command, shell=True)
583
+ audio_path = "temp/temp.wav"
584
+
585
+ wav = audio.load_wav(audio_path, 16000)
586
+ mel = audio.melspectrogram(wav)
587
+ print(mel.shape)
588
+
589
+ if np.isnan(mel.reshape(-1)).sum() > 0:
590
+ raise ValueError("Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again")
591
+
592
+ mel_chunks = []
593
+ mel_idx_multiplier = 80.0 / fps
594
+ i = 0
595
+ while 1:
596
+ start_idx = int(i * mel_idx_multiplier)
597
+ if start_idx + mel_step_size > len(mel[0]):
598
+ mel_chunks.append(mel[:, len(mel[0]) - mel_step_size :])
599
+ break
600
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
601
+ i += 1
602
+
603
+ print("Length of mel chunks: {}".format(len(mel_chunks)))
604
+
605
+ full_frames = full_frames[: len(mel_chunks)]
606
+ batch_size = wav2lip_batch_size
607
+ gen = datagen(full_frames.copy(), mel_chunks, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, face_detection_path)
608
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, total=int(np.ceil(float(len(mel_chunks)) / batch_size)))):
609
+ if i == 0:
610
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
611
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
612
+ compiled_wav2lip_model = core.compile_model(wav2lip_path, inference_device)
613
+ print("Model loaded")
614
+
615
+ frame_h, frame_w = full_frames[0].shape[:-1]
616
+ out = cv2.VideoWriter("C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", cv2.VideoWriter_fourcc(*"DIVX"), fps, (frame_w, frame_h))
617
+ pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch.numpy(), "face_sequences": img_batch.numpy()})[0]
618
+ else:
619
+ img_batch = np.transpose(img_batch, (0, 3, 1, 2))
620
+ mel_batch = np.transpose(mel_batch, (0, 3, 1, 2))
621
+ pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch, "face_sequences": img_batch})[0]
622
+
624
+ pred_ov = pred_ov.transpose(0, 2, 3, 1) * 255.0
625
+ for p, f, c in zip(pred_ov, frames, coords):
626
+ y1, y2, x1, x2 = c
627
+ p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
628
+
629
+ f[y1:y2, x1:x2] = p
630
+ out.write(f)
631
+
632
+ out.release()
633
+
634
+ command = "ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}".format(audio_path, "C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", outfile)
635
+ subprocess.call(command, shell=True)
636
+
637
+ return outfile
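Note: the mel chunking above relies on Wav2Lip's melspectrogram producing 80 mel frames per second of audio, so mel_idx_multiplier = 80 / fps maps each video frame to its position in the audio. A minimal sketch of that arithmetic, with illustrative numbers only:

    # Illustration of the chunk indexing used in ov_inference (assumed 25 fps clip)
    fps = 25
    mel_step_size = 16                    # each chunk spans 16 mel frames (~0.2 s of audio)
    mel_idx_multiplier = 80.0 / fps       # 3.2 mel frames per video frame
    start_idx = int(10 * mel_idx_multiplier)   # the chunk for video frame 10 starts at mel frame 32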
src/ov_wav2lip_helper.py ADDED
@@ -0,0 +1,68 @@
1
+ import numpy as np
2
+ import sys
3
+ import os
4
+ import openvino as ov
5
+ import torch
6
+
7
+ from pathlib import Path
8
+ # Añade `src` al `sys.path` para que Python encuentre `utils/notebook_utils.py`
9
+ sys.path.append(str(Path(__file__).resolve().parent))
10
+
11
+ # Importa `download_file` desde `notebook_utils`
12
+ from utils.notebook_utils import download_file
13
+ from huggingface_hub import hf_hub_download
14
+ from Wav2Lip.face_detection.detection.sfd.net_s3fd import s3fd
15
+ from Wav2Lip.models import Wav2Lip
16
+
17
+
18
+
19
+ def _load(checkpoint_path):
20
+ checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
21
+ return checkpoint
22
+
23
+
24
+ def load_model(path):
25
+ model = Wav2Lip()
26
+ print("Load checkpoint from: {}".format(path))
27
+ checkpoint = _load(path)
28
+ s = checkpoint["state_dict"]
29
+ new_s = {}
30
+ for k, v in s.items():
31
+ new_s[k.replace("module.", "")] = v
32
+ model.load_state_dict(new_s)
33
+
34
+ return model.eval()
35
+
36
+
37
+ def download_and_convert_models(ov_face_detection_model_path, ov_wav2lip_model_path):
38
+ models_urls = {"s3fd": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"}
39
+ path_to_detector = "checkpoints/face_detection.pth"
40
+ # Convert Face Detection Model
41
+ print("Convert Face Detection Model ...")
42
+ if not os.path.isfile(path_to_detector):
43
+ download_file(models_urls["s3fd"])
44
+ if not os.path.exists("checkpoints"):
45
+ os.mkdir("checkpoints")
46
+ os.replace("s3fd-619a316812.pth", path_to_detector)
47
+ model_weights = torch.load(path_to_detector)
48
+
49
+ face_detector = s3fd()
50
+ face_detector.load_state_dict(model_weights)
51
+
52
+ if not ov_face_detection_model_path.exists():
53
+ face_detection_dummy_inputs = torch.FloatTensor(np.random.rand(1, 3, 768, 576))
54
+ face_detection_ov_model = ov.convert_model(face_detector, example_input=face_detection_dummy_inputs)
55
+ ov.save_model(face_detection_ov_model, ov_face_detection_model_path)
56
+ print("Converted face detection OpenVINO model: ", ov_face_detection_model_path)
57
+
58
+ print("Convert Wav2Lip Model ...")
59
+ path_to_wav2lip = hf_hub_download(repo_id="numz/wav2lip_studio", filename="Wav2lip/wav2lip.pth", local_dir="checkpoints")
60
+ wav2lip = load_model(path_to_wav2lip)
61
+ img_batch = torch.FloatTensor(np.random.rand(123, 6, 96, 96))
62
+ mel_batch = torch.FloatTensor(np.random.rand(123, 1, 80, 16))
63
+
64
+ if not ov_wav2lip_model_path.exists():
65
+ example_inputs = {"audio_sequences": mel_batch, "face_sequences": img_batch}
66
+ wav2lip_ov_model = ov.convert_model(wav2lip, example_input=example_inputs)
67
+ ov.save_model(wav2lip_ov_model, ov_wav2lip_model_path)
68
+ print("Converted face detection OpenVINO model: ", ov_wav2lip_model_path)
src/run_inference.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ from ov_inference import ov_inference
3
+ import soundfile as sf
4
+ import cv2
5
+
6
+ def verificar_archivos(video_path, audio_path):
7
+ """
8
+ Verifica que los archivos de video y audio existen y son legibles.
9
+
10
+ Args:
11
+ video_path (str): Ruta del archivo de video.
12
+ audio_path (str): Ruta del archivo de audio.
13
+
14
+ Returns:
15
+ bool: True si ambos archivos son legibles, False en caso contrario.
16
+ """
17
+ # Verificar el archivo de video
18
+ if not os.path.exists(video_path):
19
+ print(f"Error: El archivo de video no existe en la ruta {video_path}")
20
+ return False
21
+ else:
22
+ # Intentar abrir el video
23
+ cap = cv2.VideoCapture(video_path)
24
+ if not cap.isOpened():
25
+ print(f"Error: No se puede abrir el archivo de video en {video_path}")
26
+ return False
27
+ else:
28
+ print(f"Archivo de video {video_path} está accesible.")
29
+ cap.release()
30
+
31
+ # Verificar el archivo de audio
32
+ if not os.path.exists(audio_path):
33
+ print(f"Error: El archivo de audio no existe en la ruta {audio_path}")
34
+ return False
35
+ else:
36
+ try:
37
+ # Intentar abrir el archivo de audio
38
+ with sf.SoundFile(audio_path) as audio_file:
39
+ print(f"Archivo de audio {audio_path} está accesible.")
40
+ except Exception as e:
41
+ print(f"Error al leer el archivo de audio: {e}")
42
+ return False
43
+
44
+ return True
45
+
46
+ # Rutas de archivos
47
+ #video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
48
+ video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")
49
+ #audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
50
+ audio_path = os.path.abspath("../miwav2lipv6/assets/audio/audio.wav")
51
+ face_detection_path = os.path.abspath("../miwav2lipv6/models/face_detection.xml")
52
+ wav2lip_path = os.path.abspath("../miwav2lipv6/models/wav2lip.xml")
53
+ outfile = os.path.abspath("../miwav2lipv6/results/result_voice.mp4")
54
+
55
+ # Verificar archivos antes de llamar a ov_inference
56
+ if verificar_archivos(video_path, audio_path):
57
+ ov_inference(
58
+ video_path,
59
+ audio_path,
60
+ face_detection_path=face_detection_path,
61
+ wav2lip_path=wav2lip_path,
62
+ inference_device="CPU",
63
+ outfile=outfile,
64
+ resize_factor=2,
65
+ )
66
+ else:
67
+ print("No se pudo proceder con la inferencia debido a problemas con los archivos.")
src/text_to_speech.py ADDED
@@ -0,0 +1,36 @@
1
+ # text_to_speech.py
2
+
3
+ from gtts import gTTS
4
+ import os
5
+
6
+ # Rutas de los archivos
7
+ #TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt"
8
+ TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt"
9
+ OUTPUT_AUDIO_PATH = "C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav"
10
+
11
+ def generar_audio_desde_texto():
12
+ """
13
+ Convierte el texto de TRANSCRIPTION_TEXT_PATH (actualmente OpenAI_response.txt) a un archivo de audio en español (audio.wav).
14
+ """
15
+ try:
16
+ # Verificar si el archivo de transcripción existe
17
+ if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
18
+ print("Error: No se encontró el archivo de transcripción.")
19
+ return
20
+
21
+ # Leer el contenido de transcripcion.txt
22
+ with open(TRANSCRIPTION_TEXT_PATH, "r", encoding="utf-8") as file:
23
+ texto = file.read()
24
+
25
+ # Generar el audio en español usando gTTS
26
+ tts = gTTS(text=texto, lang='es', slow=False)
27
+ tts.save(OUTPUT_AUDIO_PATH)
28
+
29
+ print(f"Audio generado correctamente en: {OUTPUT_AUDIO_PATH}")
30
+
31
+ except Exception as e:
32
+ print(f"Error al generar el audio: {e}")
33
+
34
+ if __name__ == "__main__":
35
+ generar_audio_desde_texto()
36
+
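One caveat worth flagging: gTTS always writes MP3-encoded data, even when the target filename ends in .wav, and ov_inference skips its own ffmpeg re-encode for paths that already end in .wav. A hedged workaround sketch (assumes ffmpeg is on PATH; the intermediate .mp3 name is illustrative):

    import subprocess
    tts.save("assets/audio/audio.mp3")   # save under the container format gTTS actually produces
    subprocess.call("ffmpeg -y -i assets/audio/audio.mp3 -ar 16000 -ac 1 assets/audio/audio.wav", shell=True)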
src/utils/notebook_utils.py ADDED
@@ -0,0 +1,708 @@
1
+ import os
2
+ import platform
3
+ import sys
4
+ import threading
5
+ import time
6
+ import urllib.parse
7
+
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import List, NamedTuple, Optional, Tuple
11
+ from tqdm import tqdm
12
+
13
+ import numpy as np
14
+ from openvino.runtime import Core, Type, get_version
15
+ from IPython.display import HTML, Image, display
16
+
17
+ import openvino as ov
18
+ from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
19
+ from openvino.runtime import opset10 as ops
20
+
21
+
22
+ # ## Files
23
+ #
24
+ # Load an image, download a file, download an IR model, and create a progress bar to show download progress.
25
+
26
+ def device_widget(default="AUTO", exclude=None, added=None):
27
+ import openvino as ov
28
+ import ipywidgets as widgets
29
+
30
+ core = ov.Core()
31
+
32
+ supported_devices = core.available_devices + ["AUTO"]
33
+ exclude = exclude or []
34
+ if exclude:
35
+ for ex_device in exclude:
36
+ if ex_device in supported_devices:
37
+ supported_devices.remove(ex_device)
38
+
39
+ added = added or []
40
+ if added:
41
+ for add_device in added:
42
+ if add_device not in supported_devices:
43
+ supported_devices.append(add_device)
44
+
45
+ device = widgets.Dropdown(
46
+ options=supported_devices,
47
+ value=default,
48
+ description="Device:",
49
+ disabled=False,
50
+ )
51
+ return device
52
+
53
+
54
+ def quantization_widget(default=True):
55
+ import ipywidgets as widgets
56
+
57
+ to_quantize = widgets.Checkbox(
58
+ value=default,
59
+ description="Quantization",
60
+ disabled=False,
61
+ )
62
+
63
+ return to_quantize
64
+
65
+
66
+ def pip_install(*args):
67
+ import subprocess # nosec - disable B404:import-subprocess check
68
+
69
+ cli_args = []
70
+ for arg in args:
71
+ cli_args.extend(str(arg).split(" "))
72
+ subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], shell=(platform.system() == "Windows"), check=True)
73
+
74
+
75
+ def load_image(path: str) -> np.ndarray:
76
+ """
77
+ Loads an image from `path` and returns it as BGR numpy array. `path`
78
+ should point to an image file, either a local filename or a url. The image is
79
+ not stored to the filesystem. Use the `download_file` function to download and
80
+ store an image.
81
+
82
+ :param path: Local path name or URL to image.
83
+ :return: image as BGR numpy array
84
+ """
85
+ import cv2
86
+ import requests
87
+
88
+ if path.startswith("http"):
89
+ # Set User-Agent to Mozilla because some websites block
90
+ # requests with User-Agent Python
91
+ response = requests.get(path, headers={"User-Agent": "Mozilla/5.0"})
92
+ array = np.asarray(bytearray(response.content), dtype="uint8")
93
+ image = cv2.imdecode(array, -1) # Loads the image as BGR
94
+ else:
95
+ image = cv2.imread(path)
96
+ return image
97
+
98
+
99
+ def download_file(
100
+ url: PathLike,
101
+ filename: PathLike = None,
102
+ directory: PathLike = None,
103
+ show_progress: bool = True,
104
+ silent: bool = False,
105
+ timeout: int = 10,
106
+ ) -> PathLike:
107
+ """
108
+ Download a file from a url and save it to the local filesystem. The file is saved to the
109
+ current directory by default, or to `directory` if specified. If a filename is not given,
110
+ the filename of the URL will be used.
111
+
112
+ :param url: URL that points to the file to download
113
+ :param filename: Name of the local file to save. Should point to the name of the file only,
114
+ not the full path. If None the filename from the url will be used
115
+ :param directory: Directory to save the file to. Will be created if it doesn't exist
116
+ If None the file will be saved to the current working directory
117
+ :param show_progress: If True, show a TQDM ProgressBar
118
+ :param silent: If True, do not print a message if the file already exists
119
+ :param timeout: Number of seconds before cancelling the connection attempt
120
+ :return: path to downloaded file
121
+ """
122
+ from tqdm.notebook import tqdm_notebook
123
+ import requests
124
+
125
+ filename = filename or Path(urllib.parse.urlparse(url).path).name
126
+ chunk_size = 16384 # make chunks bigger so that not too many updates are triggered for Jupyter front-end
127
+
128
+ filename = Path(filename)
129
+ if len(filename.parts) > 1:
130
+ raise ValueError(
131
+ "`filename` should refer to the name of the file, excluding the directory. "
132
+ "Use the `directory` parameter to specify a target directory for the downloaded file."
133
+ )
134
+
135
+ # create the directory if it does not exist, and add the directory to the filename
136
+ if directory is not None:
137
+ directory = Path(directory)
138
+ directory.mkdir(parents=True, exist_ok=True)
139
+ filename = directory / Path(filename)
140
+
141
+ try:
142
+ response = requests.get(url=url, headers={"User-agent": "Mozilla/5.0"}, stream=True)
143
+ response.raise_for_status()
144
+ except (
145
+ requests.exceptions.HTTPError
146
+ ) as error: # For error associated with not-200 codes. Will output something like: "404 Client Error: Not Found for url: {url}"
147
+ raise Exception(error) from None
148
+ except requests.exceptions.Timeout:
149
+ raise Exception(
150
+ "Connection timed out. If you access the internet through a proxy server, please "
151
+ "make sure the proxy is set in the shell from where you launched Jupyter."
152
+ ) from None
153
+ except requests.exceptions.RequestException as error:
154
+ raise Exception(f"File downloading failed with error: {error}") from None
155
+
156
+ # download the file if it does not exist, or if it exists with an incorrect file size
157
+ filesize = int(response.headers.get("Content-length", 0))
158
+ if not filename.exists() or (os.stat(filename).st_size != filesize):
159
+ with tqdm(
160
+ total=filesize,
161
+ unit="B",
162
+ unit_scale=True,
163
+ unit_divisor=1024,
164
+ desc=str(filename),
165
+ disable=not show_progress,
166
+ ) as progress_bar:
167
+ with open(filename, "wb") as file_object:
168
+ for chunk in response.iter_content(chunk_size):
169
+ file_object.write(chunk)
170
+ progress_bar.update(len(chunk))
171
+ progress_bar.refresh()
172
+ else:
173
+ if not silent:
174
+ print(f"'{filename}' already exists.")
175
+
176
+ response.close()
177
+
178
+ return filename.resolve()
179
+
180
+
181
+ def download_ir_model(model_xml_url: str, destination_folder: PathLike = None) -> PathLike:
182
+ """
183
+ Download IR model from `model_xml_url`. Downloads model xml and bin file; the weights file is
184
+ assumed to exist at the same location and name as model_xml_url with a ".bin" extension.
185
+
186
+ :param model_xml_url: URL to model xml file to download
187
+ :param destination_folder: Directory where downloaded model xml and bin are saved. If None, model
188
+ files are saved to the current directory
189
+ :return: path to downloaded xml model file
190
+ """
191
+ model_bin_url = model_xml_url[:-4] + ".bin"
192
+ model_xml_path = download_file(model_xml_url, directory=destination_folder, show_progress=False)
193
+ download_file(model_bin_url, directory=destination_folder)
194
+ return model_xml_path
195
+
196
+
197
+ # ## Images
198
+
199
+ # ### Convert Pixel Data
200
+ #
201
+ # Normalize image pixel values between 0 and 1, and convert images to RGB and BGR.
202
+
203
+ # In[ ]:
204
+
205
+
206
+ def normalize_minmax(data):
207
+ """
208
+ Normalizes the values in `data` between 0 and 1
209
+ """
210
+ if data.max() == data.min():
211
+ raise ValueError("Normalization is not possible because all elements of" f"`data` have the same value: {data.max()}.")
212
+ return (data - data.min()) / (data.max() - data.min())
213
+
214
+
215
+ def to_rgb(image_data: np.ndarray) -> np.ndarray:
216
+ """
217
+ Convert image_data from BGR to RGB
218
+ """
219
+ import cv2
220
+
221
+ return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
222
+
223
+
224
+ def to_bgr(image_data: np.ndarray) -> np.ndarray:
225
+ """
226
+ Convert image_data from RGB to BGR
227
+ """
228
+ import cv2
229
+
230
+ return cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
231
+
232
+
233
+ # ## Videos
234
+
235
+ # ### Video Player
236
+ #
237
+ # Custom video player to fulfill FPS requirements. You can set target FPS and output size, flip the video horizontally or skip first N frames.
238
+
239
+ # In[ ]:
240
+
241
+
242
+ class VideoPlayer:
243
+ """
244
+ Custom video player to fulfill FPS requirements. You can set target FPS and output size,
245
+ flip the video horizontally or skip first N frames.
246
+
247
+ :param source: Video source. It could be either camera device or video file.
248
+ :param size: Output frame size.
249
+ :param flip: Flip source horizontally.
250
+ :param fps: Target FPS.
251
+ :param skip_first_frames: Skip first N frames.
252
+ """
253
+
254
+ def __init__(self, source, size=None, flip=False, fps=None, skip_first_frames=0, width=1280, height=720):
255
+ import cv2
256
+
257
+ self.cv2 = cv2 # This is done to access the package in class methods
258
+ self.__cap = cv2.VideoCapture(source)
259
+ # try HD by default to get better video quality
260
+ self.__cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
261
+ self.__cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
262
+
263
+ if not self.__cap.isOpened():
264
+ raise RuntimeError(f"Cannot open {'camera' if isinstance(source, int) else ''} {source}")
265
+ # skip first N frames
266
+ self.__cap.set(cv2.CAP_PROP_POS_FRAMES, skip_first_frames)
267
+ # fps of input file
268
+ self.__input_fps = self.__cap.get(cv2.CAP_PROP_FPS)
269
+ if self.__input_fps <= 0:
270
+ self.__input_fps = 60
271
+ # target fps given by user
272
+ self.__output_fps = fps if fps is not None else self.__input_fps
273
+ self.__flip = flip
274
+ self.__size = None
275
+ self.__interpolation = None
276
+ if size is not None:
277
+ self.__size = size
278
+ # AREA better for shrinking, LINEAR better for enlarging
279
+ self.__interpolation = cv2.INTER_AREA if size[0] < self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH) else cv2.INTER_LINEAR
280
+ # first frame
281
+ _, self.__frame = self.__cap.read()
282
+ self.__lock = threading.Lock()
283
+ self.__thread = None
284
+ self.__stop = False
285
+
286
+ """
287
+ Start playing.
288
+ """
289
+
290
+ def start(self):
291
+ self.__stop = False
292
+ self.__thread = threading.Thread(target=self.__run, daemon=True)
293
+ self.__thread.start()
294
+
295
+ """
296
+ Stop playing and release resources.
297
+ """
298
+
299
+ def stop(self):
300
+ self.__stop = True
301
+ if self.__thread is not None:
302
+ self.__thread.join()
303
+ self.__cap.release()
304
+
305
+ def __run(self):
306
+ prev_time = 0
307
+ while not self.__stop:
308
+ t1 = time.time()
309
+ ret, frame = self.__cap.read()
310
+ if not ret:
311
+ break
312
+
313
+ # fulfill target fps
314
+ if 1 / self.__output_fps < time.time() - prev_time:
315
+ prev_time = time.time()
316
+ # replace by current frame
317
+ with self.__lock:
318
+ self.__frame = frame
319
+
320
+ t2 = time.time()
321
+ # time to wait [s] to fulfill input fps
322
+ wait_time = 1 / self.__input_fps - (t2 - t1)
323
+ # wait until
324
+ time.sleep(max(0, wait_time))
325
+
326
+ self.__frame = None
327
+
328
+ """
329
+ Get current frame.
330
+ """
331
+
332
+ def next(self):
333
+ import cv2
334
+
335
+ with self.__lock:
336
+ if self.__frame is None:
337
+ return None
338
+ # need to copy frame, because can be cached and reused if fps is low
339
+ frame = self.__frame.copy()
340
+ if self.__size is not None:
341
+ frame = self.cv2.resize(frame, self.__size, interpolation=self.__interpolation)
342
+ if self.__flip:
343
+ frame = self.cv2.flip(frame, 1)
344
+ return frame
345
+
346
+
347
+ # ## Visualization
348
+
349
+ # ### Segmentation
350
+ #
351
+ # Define a SegmentationMap NamedTuple that keeps the labels and colormap for a segmentation project/dataset. Create CityScapesSegmentation and BinarySegmentation SegmentationMaps. Create a function to convert a segmentation map to an RGB image with a colormap, and to show the segmentation result as an overlay over the original image.
352
+
353
+ # In[ ]:
354
+
355
+
356
+ class Label(NamedTuple):
357
+ index: int
358
+ color: Tuple
359
+ name: Optional[str] = None
360
+
361
+
362
+ # In[ ]:
363
+
364
+
365
+ class SegmentationMap(NamedTuple):
366
+ labels: List
367
+
368
+ def get_colormap(self):
369
+ return np.array([label.color for label in self.labels])
370
+
371
+ def get_labels(self):
372
+ labelnames = [label.name for label in self.labels]
373
+ if any(labelnames):
374
+ return labelnames
375
+ else:
376
+ return None
377
+
378
+
379
+ # In[ ]:
380
+
381
+
382
+ cityscape_labels = [
383
+ Label(index=0, color=(128, 64, 128), name="road"),
384
+ Label(index=1, color=(244, 35, 232), name="sidewalk"),
385
+ Label(index=2, color=(70, 70, 70), name="building"),
386
+ Label(index=3, color=(102, 102, 156), name="wall"),
387
+ Label(index=4, color=(190, 153, 153), name="fence"),
388
+ Label(index=5, color=(153, 153, 153), name="pole"),
389
+ Label(index=6, color=(250, 170, 30), name="traffic light"),
390
+ Label(index=7, color=(220, 220, 0), name="traffic sign"),
391
+ Label(index=8, color=(107, 142, 35), name="vegetation"),
392
+ Label(index=9, color=(152, 251, 152), name="terrain"),
393
+ Label(index=10, color=(70, 130, 180), name="sky"),
394
+ Label(index=11, color=(220, 20, 60), name="person"),
395
+ Label(index=12, color=(255, 0, 0), name="rider"),
396
+ Label(index=13, color=(0, 0, 142), name="car"),
397
+ Label(index=14, color=(0, 0, 70), name="truck"),
398
+ Label(index=15, color=(0, 60, 100), name="bus"),
399
+ Label(index=16, color=(0, 80, 100), name="train"),
400
+ Label(index=17, color=(0, 0, 230), name="motorcycle"),
401
+ Label(index=18, color=(119, 11, 32), name="bicycle"),
402
+ Label(index=19, color=(255, 255, 255), name="background"),
403
+ ]
404
+
405
+ CityScapesSegmentation = SegmentationMap(cityscape_labels)
406
+
407
+ binary_labels = [
408
+ Label(index=0, color=(255, 255, 255), name="background"),
409
+ Label(index=1, color=(0, 0, 0), name="foreground"),
410
+ ]
411
+
412
+ BinarySegmentation = SegmentationMap(binary_labels)
413
+
414
+
415
+ # In[ ]:
416
+
417
+
418
+ def segmentation_map_to_image(result: np.ndarray, colormap: np.ndarray, remove_holes: bool = False) -> np.ndarray:
419
+ """
420
+ Convert network result of floating point numbers to an RGB image with
421
+ integer values from 0-255 by applying a colormap.
422
+
423
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
424
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
425
+ :param remove_holes: If True, remove holes in the segmentation result.
426
+ :return: An RGB image where each pixel is an int8 value according to colormap.
427
+ """
428
+ import cv2
429
+
430
+ if len(result.shape) != 2 and result.shape[0] != 1:
431
+ raise ValueError(f"Expected result with shape (H,W) or (1,H,W), got result with shape {result.shape}")
432
+
433
+ if len(np.unique(result)) > colormap.shape[0]:
434
+ raise ValueError(
435
+ f"Expected max {colormap[0]} classes in result, got {len(np.unique(result))} "
436
+ "different output values. Please make sure to convert the network output to "
437
+ "pixel values before calling this function."
438
+ )
439
+ elif result.shape[0] == 1:
440
+ result = result.squeeze(0)
441
+
442
+ result = result.astype(np.uint8)
443
+
444
+ contour_mode = cv2.RETR_EXTERNAL if remove_holes else cv2.RETR_TREE
445
+ mask = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8)
446
+ for label_index, color in enumerate(colormap):
447
+ label_index_map = result == label_index
448
+ label_index_map = label_index_map.astype(np.uint8) * 255
449
+ contours, hierarchies = cv2.findContours(label_index_map, contour_mode, cv2.CHAIN_APPROX_SIMPLE)
450
+ cv2.drawContours(
451
+ mask,
452
+ contours,
453
+ contourIdx=-1,
454
+ color=color.tolist(),
455
+ thickness=cv2.FILLED,
456
+ )
457
+
458
+ return mask
459
+
460
+
461
+ def segmentation_map_to_overlay(image, result, alpha, colormap, remove_holes=False) -> np.ndarray:
462
+ """
463
+ Returns a new image where a segmentation mask (created with colormap) is overlayed on
464
+ the source image.
465
+
466
+ :param image: Source image.
467
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
468
+ :param alpha: Alpha transparency value for the overlay image.
469
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
470
+ :param remove_holes: If True, remove holes in the segmentation result.
471
+ :return: An RGB image with the segmentation mask overlaid on the source image.
472
+ """
473
+ import cv2
474
+
475
+ if len(image.shape) == 2:
476
+ image = np.repeat(np.expand_dims(image, -1), 3, 2)
477
+ mask = segmentation_map_to_image(result, colormap, remove_holes)
478
+ image_height, image_width = image.shape[:2]
479
+ mask = cv2.resize(src=mask, dsize=(image_width, image_height))
480
+ return cv2.addWeighted(mask, alpha, image, 1 - alpha, 0)
481
+
482
+
483
+ # ### Network Results
484
+ #
485
+ # Show network result image, optionally together with the source image and a legend with labels.
486
+
487
+ # In[ ]:
488
+
489
+
490
+ def viz_result_image(
491
+ result_image: np.ndarray,
492
+ source_image: np.ndarray = None,
493
+ source_title: str = None,
494
+ result_title: str = None,
495
+ labels: List[Label] = None,
496
+ resize: bool = False,
497
+ bgr_to_rgb: bool = False,
498
+ hide_axes: bool = False,
499
+ ):
500
+ """
501
+ Show result image, optionally together with source images, and a legend with labels.
502
+
503
+ :param result_image: Numpy array of RGB result image.
504
+ :param source_image: Numpy array of source image. If provided this image will be shown
505
+ next to the result image. source_image is expected to be in RGB format.
506
+ Set bgr_to_rgb to True if source_image is in BGR format.
507
+ :param source_title: Title to display for the source image.
508
+ :param result_title: Title to display for the result image.
509
+ :param labels: List of labels. If provided, a legend will be shown with the given labels.
510
+ :param resize: If true, resize the result image to the same shape as the source image.
511
+ :param bgr_to_rgb: If true, convert the source image from BGR to RGB. Use this option if
512
+ source_image is a BGR image.
513
+ :param hide_axes: If true, do not show matplotlib axes.
514
+ :return: Matplotlib figure with result image
515
+ """
516
+ import cv2
517
+ import matplotlib.pyplot as plt
518
+ from matplotlib.lines import Line2D
519
+
520
+ if bgr_to_rgb:
521
+ source_image = to_rgb(source_image)
522
+ if resize:
523
+ result_image = cv2.resize(result_image, (source_image.shape[1], source_image.shape[0]))
524
+
525
+ num_images = 1 if source_image is None else 2
526
+
527
+ fig, ax = plt.subplots(1, num_images, figsize=(16, 8), squeeze=False)
528
+ if source_image is not None:
529
+ ax[0, 0].imshow(source_image)
530
+ ax[0, 0].set_title(source_title)
531
+
532
+ ax[0, num_images - 1].imshow(result_image)
533
+ ax[0, num_images - 1].set_title(result_title)
534
+
535
+ if hide_axes:
536
+ for a in ax.ravel():
537
+ a.axis("off")
538
+ if labels:
539
+ colors = labels.get_colormap()
540
+ lines = [
541
+ Line2D(
542
+ [0],
543
+ [0],
544
+ color=[item / 255 for item in c.tolist()],
545
+ linewidth=3,
546
+ linestyle="-",
547
+ )
548
+ for c in colors
549
+ ]
550
+ plt.legend(
551
+ lines,
552
+ labels.get_labels(),
553
+ bbox_to_anchor=(1, 1),
554
+ loc="upper left",
555
+ prop={"size": 12},
556
+ )
557
+ plt.close(fig)
558
+ return fig
559
+
560
+
561
+ # ### Live Inference
562
+
563
+ # In[ ]:
564
+
565
+
566
+ def show_array(frame: np.ndarray, display_handle=None):
567
+ """
568
+ Display array `frame`. Replace information at `display_handle` with `frame`
569
+ encoded as jpeg image. `frame` is expected to have data in BGR order.
570
+
571
+ Create a display_handle with: `display_handle = display(display_id=True)`
572
+ """
573
+ import cv2
574
+
575
+ _, frame = cv2.imencode(ext=".jpeg", img=frame)
576
+ if display_handle is None:
577
+ display_handle = display(Image(data=frame.tobytes()), display_id=True)
578
+ else:
579
+ display_handle.update(Image(data=frame.tobytes()))
580
+ return display_handle
581
+
582
+
583
+ # ## Checks and Alerts
584
+ #
585
+ # Create an alert class to show stylized info/error/warning messages and a `check_device` function that checks whether a given device is available.
586
+
587
+ # In[ ]:
588
+
589
+
590
+ class NotebookAlert(Exception):
591
+ def __init__(self, message: str, alert_class: str):
592
+ """
593
+ Show an alert box with the given message.
594
+
595
+ :param message: The message to display.
596
+ :param alert_class: The class for styling the message. Options: info, warning, success, danger.
597
+ """
598
+ self.message = message
599
+ self.alert_class = alert_class
600
+ self.show_message()
601
+
602
+ def show_message(self):
603
+ display(HTML(f"""<div class="alert alert-{self.alert_class}">{self.message}"""))
604
+
605
+
606
+ class DeviceNotFoundAlert(NotebookAlert):
607
+ def __init__(self, device: str):
608
+ """
609
+ Show a warning message about an unavailable device. This class does not check whether or
610
+ not the device is available, use the `check_device` function to check this. `check_device`
611
+ also shows the warning if the device is not found.
612
+
613
+ :param device: The unavailable device.
614
+ :return: A formatted alert box with the message that `device` is not available, and a list
615
+ of devices that are available.
616
+ """
617
+ ie = Core()
618
+ supported_devices = ie.available_devices
619
+ self.message = f"Running this cell requires a {device} device, " "which is not available on this system. "
620
+ self.alert_class = "warning"
621
+ if len(supported_devices) == 1:
622
+ self.message += f"The following device is available: {ie.available_devices[0]}"
623
+ else:
624
+ self.message += "The following devices are available: " f"{', '.join(ie.available_devices)}"
625
+ super().__init__(self.message, self.alert_class)
626
+
627
+
628
+ def check_device(device: str) -> bool:
629
+ """
630
+ Check if the specified device is available on the system.
631
+
632
+ :param device: Device to check. e.g. CPU, GPU
633
+ :return: True if the device is available, False if not. If the device is not available,
634
+ a DeviceNotFoundAlert will be shown.
635
+ """
636
+ ie = Core()
637
+ if device not in ie.available_devices:
638
+ DeviceNotFoundAlert(device)
639
+ return False
640
+ else:
641
+ return True
642
+
643
+
644
+ def check_openvino_version(version: str) -> bool:
645
+ """
646
+ Check if the specified OpenVINO version is installed.
647
+
648
+ :param version: the OpenVINO version to check. Example: 2021.4
649
+ :return: True if the version is installed, False if not. If the version is not installed,
650
+ an alert message will be shown.
651
+ """
652
+ installed_version = get_version()
653
+ if version not in installed_version:
654
+ NotebookAlert(
655
+ f"This notebook requires OpenVINO {version}. "
656
+ f"The version on your system is: <i>{installed_version}</i>.<br>"
657
+ "Please run <span style='font-family:monospace'>pip install --upgrade -r requirements.txt</span> "
658
+ "in the openvino_env environment to install this version. "
659
+ "See the <a href='https://github.com/openvinotoolkit/openvino_notebooks'>"
660
+ "OpenVINO Notebooks README</a> for detailed instructions",
661
+ alert_class="danger",
662
+ )
663
+ return False
664
+ else:
665
+ return True
666
+
667
+
668
+ packed_layername_tensor_dict_list = [{"name": "aten::mul/Multiply"}]
669
+
670
+
671
+ class ReplaceTensor(MatcherPass):
672
+ def __init__(self, packed_layername_tensor_dict_list):
673
+ MatcherPass.__init__(self)
674
+ self.model_changed = False
675
+
676
+ param = WrapType("opset10.Multiply")
677
+
678
+ def callback(matcher: Matcher) -> bool:
679
+ root = matcher.get_match_root()
680
+ if root is None:
681
+ return False
682
+ for y in packed_layername_tensor_dict_list:
683
+ root_name = root.get_friendly_name()
684
+ if root_name.find(y["name"]) != -1:
685
+ max_fp16 = np.array([[[[-np.finfo(np.float16).max]]]]).astype(np.float32)
686
+ new_tenser = ops.constant(max_fp16, Type.f32, name="Constant_4431")
687
+ root.set_arguments([root.input_value(0).node, new_tenser])
688
+ packed_layername_tensor_dict_list.remove(y)
689
+
690
+ return True
691
+
692
+ self.register_matcher(Matcher(param, "ReplaceTensor"), callback)
693
+
694
+
695
+ def optimize_bge_embedding(model_path, output_model_path):
696
+ """
697
+ optimize_bge_embedding used to optimize BGE model for NPU device
698
+
699
+ Arguments:
700
+ model_path {str} -- original BGE IR model path
701
+ output_model_path {str} -- Converted BGE IR model path
702
+ """
703
+ core = Core()
704
+ ov_model = core.read_model(model_path)
705
+ manager = Manager()
706
+ manager.register_pass(ReplaceTensor(packed_layername_tensor_dict_list))
707
+ manager.run_passes(ov_model)
708
+ ov.save_model(ov_model, output_model_path, compress_to_fp16=False)
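For reference, a minimal sketch of how download_file from this module can be called (the URL is the s3fd checkpoint URL used in ov_wav2lip_helper.py; the directory argument is optional):

    from utils.notebook_utils import download_file

    # Downloads the checkpoint into checkpoints/ and returns the resolved local path
    local_path = download_file(
        "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
        directory="checkpoints",
    )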
src/utils/pip_helper.py ADDED
@@ -0,0 +1,10 @@
1
+ import sys
2
+
3
+
4
+ def pip_install(*args):
5
+ import subprocess # nosec - disable B404:import-subprocess check
6
+
7
+ cli_args = []
8
+ for arg in args:
9
+ cli_args.extend(str(arg).split(" "))
10
+ subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], check=True)
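A small usage sketch (the package specifiers are illustrative and mirror requirements.txt):

    from utils.pip_helper import pip_install

    pip_install("openvino>=2024.4.0", "nncf>=2.13.0")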
src/whisper_audio_extractor.py ADDED
@@ -0,0 +1,47 @@
1
+ # whisper_audio_extractor.py
2
+
3
+ import sounddevice as sd
4
+ from scipy.io.wavfile import write
5
+ import whisper
6
+ import os
7
+
8
+ # Ruta para guardar el archivo de audio temporalmente
9
+ AUDIO_PATH = os.path.join("..", "assets", "audio", "recorded_audio.wav")
10
+
11
+ def record_audio(duration=5, sample_rate=44100):
12
+ """
13
+ Graba el audio del micrófono durante un tiempo específico y lo guarda como archivo WAV.
14
+
15
+ Args:
16
+ duration (int): Duración de la grabación en segundos.
17
+ sample_rate (int): Frecuencia de muestreo del audio.
18
+ """
19
+ print("Grabando...")
20
+ audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
21
+ sd.wait() # Espera a que finalice la grabación
22
+ write(AUDIO_PATH, sample_rate, audio_data) # Guarda el audio en el directorio especificado
23
+ print(f"Grabación completa. Archivo guardado en {AUDIO_PATH}")
24
+
25
+ def transcribe_audio():
26
+ """
27
+ Usa el modelo Whisper para transcribir el audio grabado y devuelve el texto.
28
+
29
+ Returns:
30
+ str: Texto transcrito del audio.
31
+ """
32
+ # Cargar el modelo de Whisper
33
+ model = whisper.load_model("base")
34
+
35
+ # Transcribir el audio
36
+ print("Transcribiendo el audio...")
37
+ result = model.transcribe(AUDIO_PATH)
38
+ print("Transcripción completada.")
39
+ return result["text"]
40
+
41
+ if __name__ == "__main__":
42
+ # Paso 1: Grabar audio
43
+ record_audio()
44
+
45
+ # Paso 2: Transcribir audio
46
+ texto = transcribe_audio()
47
+ print("Texto extraído:", texto)
src/whisper_audio_transcriber.py ADDED
@@ -0,0 +1,109 @@
1
+ # whisper_audio_transcriber.py
2
+
3
+ import os
4
+ from pathlib import Path
5
+ import requests
6
+ import librosa
7
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
8
+ from transformers.utils import logging
9
+ import soundfile as sf
10
+
11
+ # Definición de modelos
12
+ model_ids = {
13
+ "Multilingual models": [
14
+ "openai/whisper-large-v3-turbo",
15
+ "openai/whisper-large-v3",
16
+ "openai/whisper-large-v2",
17
+ "openai/whisper-large",
18
+ "openai/whisper-medium",
19
+ "openai/whisper-small",
20
+ "openai/whisper-base",
21
+ "openai/whisper-tiny",
22
+ ],
23
+ "English-only models": [
24
+ "distil-whisper/distil-large-v2",
25
+ "distil-whisper/distil-large-v3",
26
+ "distil-whisper/distil-medium.en",
27
+ "distil-whisper/distil-small.en",
28
+ "openai/whisper-medium.en",
29
+ "openai/whisper-small.en",
30
+ "openai/whisper-base.en",
31
+ "openai/whisper-tiny.en",
32
+ ],
33
+ }
34
+
35
+ def download_file(url, filename, directory="."):
36
+ """
37
+ Descarga un archivo desde una URL y lo guarda en el directorio especificado.
38
+ """
39
+ os.makedirs(directory, exist_ok=True)
40
+ filepath = Path(directory) / filename
41
+ response = requests.get(url)
42
+ filepath.write_bytes(response.content)
43
+ return filepath
44
+
45
+ def transcribe_audio(file_path, model_name):
46
+ """
47
+ Transcribe el audio utilizando un modelo de Whisper.
48
+
49
+ Args:
50
+ file_path (str): Ruta del archivo de audio.
51
+ model_name (str): Nombre del modelo de Whisper.
52
+
53
+ Returns:
54
+ str: Transcripción del audio.
55
+ """
56
+ processor = AutoProcessor.from_pretrained(model_name)
57
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
58
+
59
+ # Crear pipeline para transcripción
60
+ pipe = pipeline(
61
+ "automatic-speech-recognition",
62
+ model=model,
63
+ tokenizer=processor.tokenizer,
64
+ feature_extractor=processor.feature_extractor,
65
+ device="cpu", # Cambiar a "cuda" si tienes una GPU disponible
66
+ )
67
+
68
+ # Cargar el archivo de audio
69
+ audio_data, samplerate = librosa.load(file_path, sr=16000)
70
+
71
+ # Transcribir el audio
72
+ result = pipe(audio_data)
73
+ return result["text"]
74
+
75
+ def guardar_transcripcion(texto, filename="transcripcion.txt", directory="../results"):
76
+ """
77
+ Guarda el texto transcrito en un archivo .txt en el directorio especificado.
78
+
79
+ Args:
80
+ texto (str): Texto transcrito que se desea guardar.
81
+ filename (str): Nombre del archivo .txt.
82
+ directory (str): Directorio donde se guardará el archivo.
83
+ """
84
+ os.makedirs(directory, exist_ok=True) # Crea el directorio si no existe
85
+ file_path = Path(directory) / filename
86
+ with open(file_path, "w", encoding="utf-8") as f:
87
+ f.write(texto)
88
+ print(f"Transcripción guardada en: {file_path}")
89
+
90
+ def main():
91
+ # Configuración de logging para errores únicamente
92
+ logging.set_verbosity_error()
93
+
94
+ # Ruta del archivo de audio
95
+ audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
96
+
97
+ # Modelo seleccionado
98
+ model_name = "openai/whisper-large" # Cambia esto al modelo deseado
99
+
100
+ # Transcribir el audio
101
+ print(f"Transcribiendo el audio del archivo: {audio_path}")
102
+ transcription = transcribe_audio(audio_path, model_name)
103
+ print(f"Transcripción: {transcription}")
104
+
105
+ # Guardar la transcripción en un archivo .txt
106
+ guardar_transcripcion(transcription)
107
+
108
+ if __name__ == "__main__":
109
+ main()
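openai/whisper-large is heavy for CPU-only inference; any entry from model_ids can be substituted, for example (an illustrative change, not part of this commit):

    model_name = "openai/whisper-small"   # much faster on CPU, still multilingual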
tests/test_whisper_audio_extractor.py ADDED
@@ -0,0 +1,29 @@
1
+ import os
2
+ import pytest
3
+ from src.whisper_audio_extractor import record_audio, transcribe_audio, AUDIO_PATH
4
+
5
+ def test_record_audio():
6
+ """
7
+ Verifica que la función de grabación crea un archivo de audio con un tamaño válido.
8
+ """
9
+ # Ejecuta la grabación con una duración de prueba corta
10
+ record_audio(duration=2) # Graba por 2 segundos para el test
11
+
12
+ # Comprueba si el archivo de audio existe
13
+ assert os.path.exists(AUDIO_PATH), "El archivo de audio no fue creado."
14
+
15
+ # Comprueba que el archivo no esté vacío
16
+ assert os.path.getsize(AUDIO_PATH) > 0, "El archivo de audio está vacío."
17
+
18
+ def test_transcribe_audio():
19
+ """
20
+ Verifica que la función de transcripción devuelve texto.
21
+ """
22
+ # Ejecuta la transcripción del audio grabado
23
+ transcription = transcribe_audio()
24
+
25
+ # Asegura que se obtuvo texto
26
+ assert isinstance(transcription, str) and len(transcription) > 0, "La transcripción está vacía o no es texto."
27
+
28
+ if __name__ == "__main__":
29
+ pytest.main()
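These tests import src.whisper_audio_extractor, so they are meant to run from the project root; if the import fails, an empty src/__init__.py (not part of this commit, an assumption) or a pytest rootdir configuration may be needed. They can also be driven programmatically:

    import pytest
    pytest.main(["-v", "tests/test_whisper_audio_extractor.py"])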