fireedman committed on
Commit d4757ae · 0 Parent(s)

First commit; I think the heavy model files are still missing

.gitignore ADDED
@@ -0,0 +1,24 @@
+ # Ignore the virtual environment
+ env/
+
+ # Ignore compiled files and caches
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.py[cod]
+
+ # Ignore editor and operating-system files
+ .vscode/
+ .DS_Store
+
+ # Ignore logs and test output
+ *.log
+ *.out
+ *.tmp
+
+ # Ignore models, checkpoints, and data
+ models/
+ checkpoints/
+ src/Wav2Lip/
+ assets/
+ data/
estructura_proyecto.txt ADDED
@@ -0,0 +1,34 @@
+ proyecto_root/
+
+ ├── assets/
+ │   ├── video/
+ │   │   └── data_video_sun_5s.mp4
+ │   └── audio/
+ │       └── data_audio_sun_5s.wav
+ │           # Test data: sample audio and video files such as `data_audio_sun_5s.wav`
+
+ ├── checkpoints/
+ │   └── # Pretrained models and checkpoints, such as `wav2lip_gan.pth`
+
+ ├── models/
+ │   └── # Models converted to OpenVINO IR, such as `face_detection.xml` and `wav2lip.xml`
+
+ ├── src/
+ │   ├── utils/
+ │   ├── Wav2Lip/
+ │   ├── convert_models.py
+ │   ├── gradio_helper.py
+ │   ├── ov_inference.py
+ │   ├── ov_wav2lip_helper.py
+ │   └── run_inference.py
+
+ ├── tests/
+ │   └── # Test scripts that check the project's functionality
+
+ ├── results/
+ │   └── result_voice.mp4
+
+ ├── requirements.txt          # Project dependency list
+ ├── setup.py                  # Project setup script
+ ├── estructura_proyecto.txt   # This file: project structure overview
+ └── README.md                 # Project documentation
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ openvino>=2024.4.0
+ huggingface_hub
+ torch>=2.1
+ gradio>=4.19
+ librosa==0.9.2
+ opencv-contrib-python
+ opencv-python
+ IPython
+ tqdm
+ numba
+ numpy
+ requests
+
+ openai-whisper
+ sounddevice
+ scipy
+
+ transformers>=4.35
+ torchvision>=0.18.1
+ onnx>=1.16.1
+ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+ openvino-tokenizers
+ openvino-genai
+ datasets
+ soundfile>=0.12
+ python-ffmpeg<=1.0.16
+ nncf>=2.13.0
+ jiwer
+
+ gtts
+
+ # Dependencies of src/call_openai_api.py
+ python-dotenv
+ langchain
+ openai
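Note: a quick way to confirm that the environment built from this list resolves the core packages is a small import check; the helper below is hypothetical and not part of the repository.

    # check_env.py (hypothetical): verify that the main dependencies import inside env/
    import importlib

    for package in ["openvino", "torch", "gradio", "librosa", "cv2", "whisper", "sounddevice"]:
        try:
            module = importlib.import_module(package)
            print(f"{package}: OK ({getattr(module, '__version__', 'unknown version')})")
        except ImportError as exc:
            print(f"{package}: MISSING ({exc})")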
results/OpenAI_response.txt ADDED
@@ -0,0 +1,5 @@
+ Hola, prueba en marcha,
+ María con IA se realza,
+ Nuevo modelo se lanza,
+ Incorporación, esperanza,
+ Ser mejor, nuestra balanza.
results/transcripcion.txt ADDED
@@ -0,0 +1 @@
+ Hola, esta es una prueba para ver si podemos incorporar este modelo a María, María RB.
setup.py ADDED
@@ -0,0 +1,287 @@
+ # 2024/03/11 setup.py
+
+ import os
+ import subprocess
+ import sys
+ import requests
+
+ from pathlib import Path
+
+ # Project folder layout
+ PROJECT_DIRECTORIES = [
+     "assets",
+     "assets/audio",
+     "assets/video",
+     "checkpoints",
+     "models",
+     "src",
+     "src/utils",
+     "tests",
+     "results"
+ ]
+
+ # URLs of the OpenVINO Notebooks utilities
+ OPENVINO_UTILS = {
+     "notebook_utils.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
+     "pip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py"
+ }
+
+ # URLs of the Wav2Lip helper files
+ WAV2LIP_HELPERS = {
+     "gradio_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/gradio_helper.py",
+     "ov_inference.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_inference.py",
+     "ov_wav2lip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_wav2lip_helper.py"
+ }
+
+ WAV2LIP_HELPERS_DIR = Path("src")
+ OPENVINO_UTILS_DIR = Path("src/utils")
+
+ # URLs of the example input files
+ EXAMPLE_FILES = {
+     "audio_example": {
+         "filename": "data_audio_sun_5s.wav",
+         "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_audio_sun_5s.wav?raw=true",
+         "folder": "assets/audio"
+     },
+     "video_example": {
+         "filename": "data_video_sun_5s.mp4",
+         "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_video_sun_5s.mp4?raw=true",
+         "folder": "assets/video"
+     }
+ }
+
+ # Creates the overall project structure
+ def create_project_structure():
+     """
+     Creates the project folder structure.
+     """
+     for directory in PROJECT_DIRECTORIES:
+         path = Path(directory)
+         if not path.exists():
+             path.mkdir(parents=True, exist_ok=True)
+             print(f"Carpeta '{directory}' creada.")
+         else:
+             print(f"Carpeta '{directory}' ya existe.")
+
+ # Creates the virtual environment
+ def create_virtual_environment():
+     """
+     Creates the virtual environment if it does not exist.
+     """
+     env_path = Path("env")
+     if not env_path.exists():
+         print("Creando el entorno virtual...")
+         subprocess.check_call([sys.executable, "-m", "venv", "env"])
+         print(f"Entorno virtual creado en '{env_path}'.")
+     else:
+         print(f"El entorno virtual '{env_path}' ya existe.")
+
+ # Resolves the environment's python and pip executables
+ def activate_virtual_environment():
+     """
+     "Activates" the virtual environment by returning the paths of its python and pip.
+     """
+     if os.name == 'nt':  # Windows
+         python_path = str(Path("env") / "Scripts" / "python.exe")
+         pip_path = str(Path("env") / "Scripts" / "pip.exe")
+     else:  # Unix/macOS
+         python_path = str(Path("env") / "bin" / "python")
+         pip_path = str(Path("env") / "bin" / "pip")
+
+     # Upgrade pip to the latest version inside the virtual environment using python -m pip
+     try:
+         subprocess.check_call([python_path, "-m", "pip", "install", "--upgrade", "pip"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+         print("pip actualizado a la última versión.")
+     except subprocess.CalledProcessError:
+         print("Error al actualizar pip.")
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     return python_path, pip_path
+
+ # Installs the dependencies from requirements.txt with a progress bar
+ def install_requirements(pip_path):
+     """
+     Installs the requirements.txt dependencies with a progress bar.
+     """
+     print("Instalando dependencias...")
+     # Install tqdm in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     # Read requirements.txt and show a progress bar
+     requirements_path = Path("requirements.txt")
+     if not requirements_path.exists():
+         print("Archivo requirements.txt no encontrado.")
+         return
+
+     with open(requirements_path, "r") as f:
+         dependencies = f.read().splitlines()
+
+     # Install each dependency, updating the progress bar
+     for dependency in tqdm(dependencies, desc="Instalando dependencias", unit="paquete"):
+         try:
+             subprocess.check_call([pip_path, "install", dependency], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+         except subprocess.CalledProcessError:
+             print(f"\nError al instalar {dependency}.")
+
+     print("Todas las dependencias fueron instaladas correctamente.")
+
+ # Downloads the OpenVINO Notebooks utility files
+ def download_openvino_utils(pip_path):
+     """
+     Downloads the OpenVINO Notebooks utilities into src/utils if they do not exist.
+     """
+     # Create the utils folder if it does not exist
+     OPENVINO_UTILS_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     # Install tqdm in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     for filename, url in tqdm(OPENVINO_UTILS.items(), desc="Descargando utilidades de OpenVINO", unit="archivo"):
+         file_path = OPENVINO_UTILS_DIR / filename
+         if not file_path.exists():
+             response = requests.get(url)
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+             else:
+                 print(f"Error al descargar {filename} desde {url}")
+
+ # Downloads the Wav2Lip-specific helper files
+ def download_wav2lip_helpers(pip_path):
+     """
+     Downloads the Wav2Lip-specific helper files if they do not exist.
+     """
+     WAV2LIP_HELPERS_DIR.mkdir(parents=True, exist_ok=True)  # Creates `src` if it does not exist
+
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+     for filename, url in tqdm(WAV2LIP_HELPERS.items(), desc="Descargando ayudas de Wav2Lip", unit="archivo"):
+         file_path = WAV2LIP_HELPERS_DIR / filename
+         if not file_path.exists():
+             response = requests.get(url)
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+
+ # Downloads the example input files (audio and video)
+ def download_example_files(pip_path):
+     """
+     Downloads the example input files (audio and video) into their corresponding folders.
+     """
+     # Install requests in the virtual environment if it is not installed yet
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     for example_name, example_info in tqdm(EXAMPLE_FILES.items(), desc="Descargando archivos de ejemplo", unit="archivo"):
+         folder_path = Path(example_info["folder"])
+         file_path = folder_path / example_info["filename"]
+
+         # Create the folder if it does not exist
+         folder_path.mkdir(parents=True, exist_ok=True)
+
+         # Download the file if it does not exist
+         if not file_path.exists():
+             response = requests.get(example_info["url"])
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     f.write(response.content)
+
+ # Clones the official Wav2Lip repository
+ def clone_wav2lip_repo(pip_path):
+     """
+     Clones the official Wav2Lip repository, hiding git's output behind a tqdm bar.
+     """
+     repo_url = "https://github.com/Rudrabha/Wav2Lip"
+     clone_path = "src/Wav2Lip"
+
+     try:
+         subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar requests.")
+
+     try:
+         subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     except subprocess.CalledProcessError:
+         print("Error al instalar tqdm.")
+
+     from tqdm import tqdm  # Import tqdm for the progress bar
+
+     # Check whether the repository already exists to avoid cloning it again
+     if os.path.exists(clone_path):
+         print(f"El repositorio '{clone_path}' ya existe.")
+         return
+
+     # Start the clone, wrapping it in a tqdm bar so git's own output stays hidden
+     print("Clonando el repositorio de Wav2Lip...")
+     with tqdm(total=100, desc="Clonación en progreso", ncols=100, bar_format="{l_bar}{bar}") as pbar:
+         # Run the clone command
+         exit_code = subprocess.call(["git", "clone", repo_url, clone_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+         if exit_code != 0:
+             raise Exception("Error: La clonación del repositorio ha fallado.")
+         else:
+             pbar.update(100)
+             print("Repositorio clonado exitosamente en 'Wav2Lip'.")
+
+
+ if __name__ == "__main__":
+     create_project_structure()
+     create_virtual_environment()
+     python_path, pip_path = activate_virtual_environment()
+
+     download_openvino_utils(pip_path)
+     download_wav2lip_helpers(pip_path)
+     download_example_files(pip_path)
+     install_requirements(pip_path)
+     clone_wav2lip_repo(pip_path)
+
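The script is intended to be run once from the project root with the system interpreter (`python setup.py`); the same sequence can also be driven from another script. A minimal sketch, assuming the function signatures above:

    # Hypothetical programmatic use of setup.py, mirroring its __main__ block
    from setup import (
        create_project_structure,
        create_virtual_environment,
        activate_virtual_environment,
        download_openvino_utils,
        download_wav2lip_helpers,
        download_example_files,
        install_requirements,
        clone_wav2lip_repo,
    )

    create_project_structure()
    create_virtual_environment()
    python_path, pip_path = activate_virtual_environment()
    download_openvino_utils(pip_path)
    download_wav2lip_helpers(pip_path)
    download_example_files(pip_path)
    install_requirements(pip_path)
    clone_wav2lip_repo(pip_path)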
src/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
src/audio_recorder.py ADDED
@@ -0,0 +1,48 @@
+ # audio_recorder.py
+
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import os
+
+ # Path used to store the recording inside the `assets/audio/` directory
+ AUDIO_PATH = os.path.join("..", "assets", "audio", "grabacion_8s.wav")
+
+ def listar_dispositivos():
+     """
+     Lists every audio device available on the system.
+     """
+     print("Dispositivos de audio disponibles:")
+     dispositivos = sd.query_devices()
+     for idx, dispositivo in enumerate(dispositivos):
+         print(f"{idx}: {dispositivo['name']} - {'Entrada' if dispositivo['max_input_channels'] > 0 else 'Salida'}")
+     print("\nSelecciona el índice del dispositivo de entrada que prefieras para grabar audio.")
+
+ def record_audio(duration=8, sample_rate=44100, device_index=None):
+     """
+     Records audio from the microphone for a given duration and saves it as a WAV file.
+
+     Args:
+         duration (int): Recording length in seconds.
+         sample_rate (int): Audio sample rate.
+         device_index (int): Index of the audio device to use.
+     """
+     print("Grabando...")
+
+     # Start a single-channel recording
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, device=device_index)
+     sd.wait()  # Wait until the recording finishes
+
+     # Save the audio file
+     write(AUDIO_PATH, sample_rate, audio_data)
+     print(f"Grabación completada. Archivo guardado en: {AUDIO_PATH}")
+
+ if __name__ == "__main__":
+     # Step 1: list the available audio devices
+     listar_dispositivos()
+
+     # Wait for the user to choose a device index
+     device_index = int(input("Introduce el índice del dispositivo de entrada que deseas utilizar: "))
+
+     # Step 2: record audio with the selected device
+     record_audio(device_index=device_index)
+
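For non-interactive use (for example from another script in `src/`), the recorder can be driven directly; the device index below is an assumption, so use whichever index `listar_dispositivos()` reports for your input device.

    # Hypothetical usage of audio_recorder.py from another script in src/
    from audio_recorder import listar_dispositivos, record_audio

    listar_dispositivos()                 # print the available devices
    record_audio(duration=5,              # seconds
                 sample_rate=16000,       # the Wav2Lip pipeline loads audio at 16 kHz anyway
                 device_index=1)          # assumed index; pick one with input channels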
src/call_openai_api.py ADDED
@@ -0,0 +1,80 @@
+ import os
+
+ from dotenv import load_dotenv
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from pathlib import Path
+
+ # Load environment variables from the .env file
+ # Relative path to the .env file stored in models/
+ project_root = Path(__file__).resolve().parent.parent  # Go up to the project root
+ env_path = project_root / "models" / ".env"  # Full path to the .env file
+ load_dotenv(dotenv_path=env_path)
+
+ # API key configuration
+ api_key = os.getenv("OPENAI_API_KEY")
+ if not api_key:
+     raise ValueError("No se encontro la clave de API")
+
+ OPENAI_KEY_VAL = api_key
+
+ llm = ChatOpenAI(
+     openai_api_key=OPENAI_KEY_VAL,
+     temperature=0.7,
+     model="gpt-4"
+ )
+
+ # Prompt template filled with the text read from the file
+ template = """
+ Eres un asistente de IA que orienta a los alumnos a ser mejores personas. Haz una haiku de 5 lineas sobre lo que te estan comentando. Da siempre la respuesta en Español
+ Texto:{texto}
+ Respuesta:
+ """
+ prompt = PromptTemplate(
+     input_variables=["texto"],
+     template=template
+ )
+
+ chain = LLMChain(
+     llm=llm,
+     prompt=prompt
+ )
+
+ #def save_summary_to_file(summary_text, filename='response.txt'):
+ def save_summary_to_file(summary_text, filename='C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt'):
+     try:
+         with open(filename, 'w', encoding='utf-8') as file:
+             file.write(summary_text)
+         print(f"El resumen se ha guardado exitosamente en {filename}")
+     except Exception as e:
+         print(f"Ocurrio un error al guardar el resumen {e}")
+
+ def read_text_from_file(filename):
+     try:
+         with open(filename, 'r') as file:
+             return file.read()
+     except Exception as e:
+         print(f"Error al leer el archivo {filename}: {e}")
+         return ""
+
+
+ #def main():
+ def moni(archivo):
+     #texto_usuario = input("Ingresa un texto para resumir:")
+     #texto_usuario = read_text_from_file("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+     texto_usuario = read_text_from_file(archivo)
+     resultado = chain.run(texto=texto_usuario)
+
+     # Print the generated summary
+     print("\nResumen generado:")
+     print(resultado)
+     save_summary_to_file(resultado)
+
+     return resultado
+
+
+ if __name__ == "__main__":
+     # `moni` needs the path of the text file to summarize; use the saved transcription
+     moni("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+
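The module expects an `OPENAI_API_KEY` entry in `models/.env` and builds the chain at import time. Below is a minimal sketch of that file and of calling `moni` on the saved transcription; the relative path is an assumption, since the committed code uses absolute Windows paths.

    # models/.env (one line, no quotes):
    # OPENAI_API_KEY=sk-...

    # Hypothetical call with a relative path to the transcription file
    from call_openai_api import moni

    respuesta = moni("results/transcripcion.txt")
    print(respuesta)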
src/convert_models.py ADDED
@@ -0,0 +1,16 @@
+ import sys
+ from pathlib import Path
+
+ # Add `src` to `sys.path` so Python can find the `utils` module
+ sys.path.append(str(Path(__file__).resolve().parent))
+
+ # Import the helper from utils/notebook_utils.py and the Wav2Lip conversion helper
+ from utils.notebook_utils import download_file
+ from ov_wav2lip_helper import download_and_convert_models
+
+
+ OV_FACE_DETECTION_MODEL_PATH = Path("../miwav2lipv6/models/face_detection.xml")
+ OV_WAV2LIP_MODEL_PATH = Path("../miwav2lipv6/models/wav2lip.xml")
+
+
+ download_and_convert_models(OV_FACE_DETECTION_MODEL_PATH, OV_WAV2LIP_MODEL_PATH)
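After the conversion has run, a small check like the one below can confirm that the IR files are readable and list their inputs; the paths assume the `models/` folder described in `estructura_proyecto.txt`.

    # Hypothetical verification of the converted OpenVINO IR models
    from pathlib import Path
    import openvino as ov

    core = ov.Core()
    for xml_path in [Path("models/face_detection.xml"), Path("models/wav2lip.xml")]:
        model = core.read_model(xml_path)
        print(xml_path.name, "->", [inp.get_any_name() for inp in model.inputs])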
src/gradio_helper.py ADDED
@@ -0,0 +1,26 @@
+ from typing import Callable
+ import gradio as gr
+ import numpy as np
+
+
+ examples = [
+     [
+         #"data_video_sun_5s.mp4",
+         "data_video_sun.mp4",
+         "data_audio_sun_5s.wav",
+     ],
+ ]
+
+
+ def make_demo(fn: Callable):
+     demo = gr.Interface(
+         fn=fn,
+         inputs=[
+             gr.Video(label="Face video"),
+             gr.Audio(label="Audio", type="filepath"),
+         ],
+         outputs="video",
+         examples=examples,
+         allow_flagging="never",
+     )
+     return demo
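`make_demo` only wraps a callable that receives a video path and an audio path; below is a minimal sketch of wiring it to the OpenVINO inference entry point, with model paths assumed to match the structure above (working directory at the project root, `src/` on `sys.path`, Wav2Lip repo cloned).

    # Hypothetical wiring of the Gradio demo to ov_inference
    from functools import partial

    from gradio_helper import make_demo
    from ov_inference import ov_inference

    fn = partial(
        ov_inference,
        face_detection_path="models/face_detection.xml",
        wav2lip_path="models/wav2lip.xml",
        inference_device="CPU",
    )
    demo = make_demo(fn)   # gr.Interface passes (video, audio) as face_path, audio_path
    demo.launch()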
src/interface.py ADDED
@@ -0,0 +1,60 @@
+ # interface.py
+
+ import gradio as gr
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import tempfile
+ import shutil
+ import os
+
+ # Absolute video and audio paths to avoid access errors
+ AUDIO_COPY_PATH = os.path.abspath(os.path.join("..", "miwav2lipv6", "assets", "audio", "grabacion_gradio.wav"))
+ #VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
+ VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")
+
+ # Check that the video exists
+ if not os.path.exists(VIDEO_PATH):
+     print(f"Advertencia: El archivo de video no se encontró en la ruta {VIDEO_PATH}")
+
+ # Audio recording function
+ def grabar_audio(duration=8, sample_rate=44100):
+     print("Grabando...")
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
+     sd.wait()  # Wait until the recording finishes
+
+     # Save a temporary audio file
+     temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+     write(temp_audio.name, sample_rate, audio_data)
+     print("Grabación completada. Archivo temporal guardado en:", temp_audio.name)
+
+     # Create `assets/audio` if it does not exist
+     os.makedirs(os.path.dirname(AUDIO_COPY_PATH), exist_ok=True)
+
+     # Copy the recording into `assets/audio`
+     shutil.copy(temp_audio.name, AUDIO_COPY_PATH)
+     print(f"Copia de la grabación guardada en: {AUDIO_COPY_PATH}")
+
+     return AUDIO_COPY_PATH
+
+ # Main function that builds the Gradio interface
+ def interfaz():
+     with gr.Blocks() as demo:
+         gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
+
+         # Recording button
+         with gr.Row():
+             grabar_button = gr.Button("Iniciar Grabación")
+
+         # Show the recorded audio on the right
+         output_audio = gr.Audio(label="Grabación de Audio", type="filepath")
+
+         # Wire the recording function to the button
+         grabar_button.click(grabar_audio, outputs=output_audio)
+
+     return demo
+
+ # Launch the interface with the absolute audio path added to allowed_paths
+ if __name__ == "__main__":
+     demo = interfaz()
+     demo.launch(allowed_paths=[os.path.dirname(AUDIO_COPY_PATH)])
+
src/interfaceV2.py ADDED
@@ -0,0 +1,183 @@
+ # interfaceV2.py
+
+ import gradio as gr
+ import sounddevice as sd
+ from scipy.io.wavfile import write
+ import tempfile
+ import shutil
+ import os
+ import subprocess
+ import sys
+ from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
+ from call_openai_api import moni as rtff  # call_openai_api.py must be in the same directory
+
+
+ # Paths to files (adjusted as per your specified structure)
+ AUDIO_RECORD_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/grabacion_gradio.wav")
+ #VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun_5s.mp4")
+ VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun.mp4")
+ TRANSCRIPTION_TEXT_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
+ RESULT_AUDIO_TEMP_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/audiov2.wav")
+ RESULT_AUDIO_FINAL_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav")
+ RESULT_VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/result_voice.mp4")
+ TEXT_TO_SPEECH_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/text_to_speech.py")
+
+ # Function to record 8-second audio
+ def grabar_audio(duration=8, sample_rate=44100):
+     print("Starting recording...")
+     audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
+     print(f"Recording in progress for {duration} seconds...")
+     sd.wait()
+     print("Recording completed.")
+
+     temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+     write(temp_audio.name, sample_rate, audio_data)
+     print("Audio temporarily saved at:", temp_audio.name)
+     temp_audio.close()  # Close the handle before copying the file
+     os.makedirs(os.path.dirname(AUDIO_RECORD_PATH), exist_ok=True)
+     shutil.copy(temp_audio.name, AUDIO_RECORD_PATH)
+     print(f"Recording copied to: {AUDIO_RECORD_PATH}")
+
+     return AUDIO_RECORD_PATH, "Recording completed."
+
+ # Function to transcribe audio with Whisper
+ def transcribir_con_progreso(audio_path):
+     progreso = gr.Progress()
+     progreso(0, "Starting transcription...")
+     model_name = "openai/whisper-large"
+     progreso(25, "Loading Whisper model...")
+
+     transcripcion = transcribe_audio(audio_path, model_name)
+     progreso(75, "Saving transcription...")
+     guardar_transcripcion(transcripcion, filename=TRANSCRIPTION_TEXT_PATH)
+     progreso(100, "Transcription completed.")
+     if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
+         raise FileNotFoundError(f"El archivo {TRANSCRIPTION_TEXT_PATH} no se generó.")
+
+     return transcripcion
+
+ # Function to convert text to audio using text_to_speech.py
+ def generar_audio_desde_texto():
+     print("Generating audio from text...")
+     result = subprocess.run(
+         [sys.executable, TEXT_TO_SPEECH_PATH],
+         capture_output=True,
+         text=True
+     )
+     if result.returncode != 0:
+         raise RuntimeError(f"Error ejecutando text_to_speech.py: {result.stderr}")
+     if result.stdout:
+         print("Output:", result.stdout)
+     if result.stderr:
+         print("Errors:", result.stderr)
+
+     if os.path.exists(RESULT_AUDIO_TEMP_PATH):
+         print(f"Temporary audio generated at: {RESULT_AUDIO_TEMP_PATH}")
+
+         os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
+         shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
+         print(f"Final audio copied to: {RESULT_AUDIO_FINAL_PATH}")
+
+         return RESULT_AUDIO_FINAL_PATH
+     else:
+         print(f"Error: Audio file was not generated in {RESULT_AUDIO_FINAL_PATH}")
+         return None
+
+ # Function to process video and audio using run_inference.py with the generated audio file
+ def procesar_video_audio():
+     print("Starting video and audio processing...")
+     run_inference_path = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/run_inference.py")
+
+     result = subprocess.run(
+         [sys.executable, run_inference_path, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
+         capture_output=True,
+         text=True
+     )
+
+     if result.stdout:
+         print("Output:", result.stdout)
+     if result.stderr:
+         print("Errors:", result.stderr)
+
+     if os.path.exists(RESULT_VIDEO_PATH):
+         print(f"Processed video saved at: {RESULT_VIDEO_PATH}")
+         return RESULT_VIDEO_PATH
+     else:
+         print("Error: Video file was not generated at 'results/result_voice.mp4'")
+         return None
+
+ # Gradio interface configuration
+ def interfaz():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             with gr.Column():
+                 gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
+                 grabar_button = gr.Button("Comenzando la grabacion de audio")
+                 estado_grabacion = gr.Textbox(label="Recording Status", interactive=False)
+
+             with gr.Column():
+                 output_audio = gr.Audio(AUDIO_RECORD_PATH, label="Audio Grabado", interactive=False)
+                 output_audio_speech = gr.Audio(RESULT_AUDIO_FINAL_PATH, label="Audio TTS", interactive=False)
+                 video_resultado = gr.Video(RESULT_VIDEO_PATH, label="Video procesado", interactive=False)
+                 texto_transcripcion = gr.Textbox(label="Texto transcrito")
+                 progreso_transcripcion = gr.Textbox(label="Transcription Status", interactive=False)
+
+         # Full flow: recording, transcription, text-to-speech, and video processing
+         """
+         def flujo_completo():
+             _, mensaje_grabacion = grabar_audio()
+             transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
+             audio_generado = generar_audio_desde_texto()
+             video_path = procesar_video_audio()
+
+             # Ensure function always returns 5 outputs for Gradio, even in error cases
+             if video_path and audio_generado:
+                 return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado, video_path
+             else:
+                 return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado or "Audio generation failed", video_path or "Video generation failed"
+         """
+         def flujo_completo():
+             try:
+                 print("Inicio del flujo completo...")
+                 # Record the audio
+                 audio_path, mensaje_grabacion = grabar_audio()
+                 print("Audio grabado en:", audio_path)
+                 # Transcribe the audio
+                 transcripcion = transcribir_con_progreso(audio_path)
+                 print("Transcripción completada:", transcripcion)
+
+                 #respuesta_openai = rtff(transcripcion)
+                 respuesta_openai = rtff(TRANSCRIPTION_TEXT_PATH)
+                 print("Respuesta generada por OpenAI")
+
+                 # Generate audio from the text
+                 audio_generado = generar_audio_desde_texto()
+                 print("Audio generado:", audio_generado)
+                 # Process video and audio
+                 video_path = procesar_video_audio()
+                 print("Video procesado en:", video_path)
+                 # Return the results if everything succeeded
+                 return mensaje_grabacion, audio_path, transcripcion, audio_generado, video_path
+
+             except Exception as e:
+                 # Print the error to the terminal and return error messages to the interface
+                 print("Error detectado en flujo completo:", str(e))
+                 return (
+                     "Error durante el flujo completo",
+                     None,  # Recorded audio
+                     f"Error: {str(e)}",  # Transcription
+                     None,  # Generated audio
+                     None  # Processed video
+                 )
+
+         grabar_button.click(
+             flujo_completo,
+             outputs=[estado_grabacion, output_audio, texto_transcripcion, output_audio_speech, video_resultado]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = interfaz()
+     demo.launch(allowed_paths=["C:/programacionEjercicios/miwav2lipv6/assets", "C:/programacionEjercicios/miwav2lipv6/results"])
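`interfaceV2.py` imports `whisper_audio_transcriber`, which is not part of this commit. From the call sites above it must expose `transcribe_audio(audio_path, model_name)` returning the text and `guardar_transcripcion(texto, filename=...)`; the stub below is a hedged stand-in consistent with those calls, not the author's missing file.

    # whisper_audio_transcriber.py (hypothetical stand-in)
    from transformers import pipeline

    def transcribe_audio(audio_path, model_name="openai/whisper-large"):
        # Build an ASR pipeline and transcribe the WAV file
        asr = pipeline("automatic-speech-recognition", model=model_name)
        return asr(audio_path)["text"]

    def guardar_transcripcion(texto, filename="results/transcripcion.txt"):
        # Persist the transcription where the interface expects it
        with open(filename, "w", encoding="utf-8") as f:
            f.write(texto)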
src/ov_inference.py ADDED
@@ -0,0 +1,637 @@
1
+ import glob
+ import logging
2
+ from enum import Enum
3
+ import math
4
+ import subprocess
5
+
6
+ import cv2
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ from Wav2Lip import audio
13
+ import openvino as ov
14
+
15
+
16
+ device = "cpu"
17
+
18
+
19
+ def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
20
+ xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
21
+ dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
22
+ dw, dh = math.log(ww / aww), math.log(hh / ahh)
23
+ return dx, dy, dw, dh
24
+
25
+
26
+ def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
27
+ xc, yc = dx * aww + axc, dy * ahh + ayc
28
+ ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
29
+ x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
30
+ return x1, y1, x2, y2
31
+
32
+
33
+ def nms(dets, thresh):
34
+ if 0 == len(dets):
35
+ return []
36
+ x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
37
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
38
+ order = scores.argsort()[::-1]
39
+
40
+ keep = []
41
+ while order.size > 0:
42
+ i = order[0]
43
+ keep.append(i)
44
+ xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
45
+ xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
46
+
47
+ w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
48
+ ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
49
+
50
+ inds = np.where(ovr <= thresh)[0]
51
+ order = order[inds + 1]
52
+
53
+ return keep
54
+
55
+
56
+ def encode(matched, priors, variances):
57
+ """Encode the variances from the priorbox layers into the ground truth boxes
58
+ we have matched (based on jaccard overlap) with the prior boxes.
59
+ Args:
60
+ matched: (tensor) Coords of ground truth for each prior in point-form
61
+ Shape: [num_priors, 4].
62
+ priors: (tensor) Prior boxes in center-offset form
63
+ Shape: [num_priors,4].
64
+ variances: (list[float]) Variances of priorboxes
65
+ Return:
66
+ encoded boxes (tensor), Shape: [num_priors, 4]
67
+ """
68
+
69
+ # dist b/t match center and prior's center
70
+ g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
71
+ # encode variance
72
+ g_cxcy /= variances[0] * priors[:, 2:]
73
+ # match wh / prior wh
74
+ g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
75
+ g_wh = torch.log(g_wh) / variances[1]
76
+ # return target for smooth_l1_loss
77
+ return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
78
+
79
+
80
+ def decode(loc, priors, variances):
81
+ """Decode locations from predictions using priors to undo
82
+ the encoding we did for offset regression at train time.
83
+ Args:
84
+ loc (tensor): location predictions for loc layers,
85
+ Shape: [num_priors,4]
86
+ priors (tensor): Prior boxes in center-offset form.
87
+ Shape: [num_priors,4].
88
+ variances: (list[float]) Variances of priorboxes
89
+ Return:
90
+ decoded bounding box predictions
91
+ """
92
+
93
+ boxes = torch.cat((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
94
+ boxes[:, :2] -= boxes[:, 2:] / 2
95
+ boxes[:, 2:] += boxes[:, :2]
96
+ return boxes
97
+
98
+
99
+ def batch_decode(loc, priors, variances):
100
+ """Decode locations from predictions using priors to undo
101
+ the encoding we did for offset regression at train time.
102
+ Args:
103
+ loc (tensor): location predictions for loc layers,
104
+ Shape: [num_priors,4]
105
+ priors (tensor): Prior boxes in center-offset form.
106
+ Shape: [num_priors,4].
107
+ variances: (list[float]) Variances of priorboxes
108
+ Return:
109
+ decoded bounding box predictions
110
+ """
111
+
112
+ boxes = torch.cat((priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:], priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
113
+ boxes[:, :, :2] -= boxes[:, :, 2:] / 2
114
+ boxes[:, :, 2:] += boxes[:, :, :2]
115
+ return boxes
116
+
117
+
118
+ def get_smoothened_boxes(boxes, T):
119
+ for i in range(len(boxes)):
120
+ if i + T > len(boxes):
121
+ window = boxes[len(boxes) - T :]
122
+ else:
123
+ window = boxes[i : i + T]
124
+ boxes[i] = np.mean(window, axis=0)
125
+ return boxes
126
+
127
+
128
+ def detect(net, img, device):
129
+ img = img - np.array([104, 117, 123])
130
+ img = img.transpose(2, 0, 1)
131
+ img = img.reshape((1,) + img.shape)
132
+
133
+ img = torch.from_numpy(img).float().to(device)
134
+ BB, CC, HH, WW = img.size()
135
+
136
+ results = net({"x": img})
137
+ olist = [torch.Tensor(results[i]) for i in range(12)]
138
+
139
+ bboxlist = []
140
+ for i in range(len(olist) // 2):
141
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
142
+ olist = [oelem.data.cpu() for oelem in olist]
143
+ for i in range(len(olist) // 2):
144
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
145
+ FB, FC, FH, FW = ocls.size() # feature map size
146
+ stride = 2 ** (i + 2) # 4,8,16,32,64,128
147
+ anchor = stride * 4
148
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
149
+ for Iindex, hindex, windex in poss:
150
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
151
+ score = ocls[0, 1, hindex, windex]
152
+ loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
153
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
154
+ variances = [0.1, 0.2]
155
+ box = decode(loc, priors, variances)
156
+ x1, y1, x2, y2 = box[0] * 1.0
157
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
158
+ bboxlist.append([x1, y1, x2, y2, score])
159
+ bboxlist = np.array(bboxlist)
160
+ if 0 == len(bboxlist):
161
+ bboxlist = np.zeros((1, 5))
162
+
163
+ return bboxlist
164
+
165
+
166
+ def batch_detect(net, imgs, device):
167
+ imgs = imgs - np.array([104, 117, 123])
168
+ imgs = imgs.transpose(0, 3, 1, 2)
169
+
170
+ imgs = torch.from_numpy(imgs).float().to(device)
171
+ BB, CC, HH, WW = imgs.size()
172
+
173
+ results = net({"x": imgs.numpy()})
174
+ olist = [torch.Tensor(results[i]) for i in range(12)]
175
+
176
+ bboxlist = []
177
+ for i in range(len(olist) // 2):
178
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
179
+ # olist[i * 2] = (olist[i * 2], dim=1)
180
+ olist = [oelem.data.cpu() for oelem in olist]
181
+ for i in range(len(olist) // 2):
182
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
183
+ FB, FC, FH, FW = ocls.size() # feature map size
184
+ stride = 2 ** (i + 2) # 4,8,16,32,64,128
185
+ anchor = stride * 4
186
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
187
+ for Iindex, hindex, windex in poss:
188
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
189
+ score = ocls[:, 1, hindex, windex]
190
+ loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
191
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
192
+ variances = [0.1, 0.2]
193
+ box = batch_decode(loc, priors, variances)
194
+ box = box[:, 0] * 1.0
195
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
196
+ bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
197
+ bboxlist = np.array(bboxlist)
198
+ if 0 == len(bboxlist):
199
+ bboxlist = np.zeros((1, BB, 5))
200
+
201
+ return bboxlist
202
+
203
+
204
+ def flip_detect(net, img, device):
205
+ img = cv2.flip(img, 1)
206
+ b = detect(net, img, device)
207
+
208
+ bboxlist = np.zeros(b.shape)
209
+ bboxlist[:, 0] = img.shape[1] - b[:, 2]
210
+ bboxlist[:, 1] = b[:, 1]
211
+ bboxlist[:, 2] = img.shape[1] - b[:, 0]
212
+ bboxlist[:, 3] = b[:, 3]
213
+ bboxlist[:, 4] = b[:, 4]
214
+ return bboxlist
215
+
216
+
217
+ def pts_to_bb(pts):
218
+ min_x, min_y = np.min(pts, axis=0)
219
+ max_x, max_y = np.max(pts, axis=0)
220
+ return np.array([min_x, min_y, max_x, max_y])
221
+
222
+
223
+ class OVFaceDetector(object):
224
+ """An abstract class representing a face detector.
225
+
226
+ Any other face detection implementation must subclass it. All subclasses
227
+ must implement ``detect_from_image``, that return a list of detected
228
+ bounding boxes. Optionally, for speed considerations detect from path is
229
+ recommended.
230
+ """
231
+
232
+ def __init__(self, device, verbose):
233
+ self.device = device
234
+ self.verbose = verbose
235
+
236
+ def detect_from_image(self, tensor_or_path):
237
+ """Detects faces in a given image.
238
+
239
+ This function detects the faces present in a provided BGR(usually)
240
+ image. The input can be either the image itself or the path to it.
241
+
242
+ Arguments:
243
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
244
+ to an image or the image itself.
245
+
246
+ Example::
247
+
248
+ >>> path_to_image = 'data/image_01.jpg'
249
+ ... detected_faces = detect_from_image(path_to_image)
250
+ [A list of bounding boxes (x1, y1, x2, y2)]
251
+ >>> image = cv2.imread(path_to_image)
252
+ ... detected_faces = detect_from_image(image)
253
+ [A list of bounding boxes (x1, y1, x2, y2)]
254
+
255
+ """
256
+ raise NotImplementedError
257
+
258
+ def detect_from_directory(self, path, extensions=[".jpg", ".png"], recursive=False, show_progress_bar=True):
259
+ """Detects faces from all the images present in a given directory.
260
+
261
+ Arguments:
262
+ path {string} -- a string containing a path that points to the folder containing the images
263
+
264
+ Keyword Arguments:
265
+ extensions {list} -- list of string containing the extensions to be
266
+ consider in the following format: ``.extension_name`` (default:
267
+ {['.jpg', '.png']}) recursive {bool} -- option wherever to scan the
268
+ folder recursively (default: {False}) show_progress_bar {bool} --
269
+ display a progressbar (default: {True})
270
+
271
+ Example:
272
+ >>> directory = 'data'
273
+ ... detected_faces = detect_from_directory(directory)
274
+ {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}
275
+
276
+ """
277
+ if self.verbose:
278
+ logger = logging.getLogger(__name__)
279
+
280
+ if len(extensions) == 0:
281
+ if self.verbose:
282
+ logger.error("Expected at list one extension, but none was received.")
283
+ raise ValueError
284
+
285
+ if self.verbose:
286
+ logger.info("Constructing the list of images.")
287
+ additional_pattern = "/**/*" if recursive else "/*"
288
+ files = []
289
+ for extension in extensions:
290
+ files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))
291
+
292
+ if self.verbose:
293
+ logger.info("Finished searching for images. %s images found", len(files))
294
+ logger.info("Preparing to run the detection.")
295
+
296
+ predictions = {}
297
+ for image_path in tqdm(files, disable=not show_progress_bar):
298
+ if self.verbose:
299
+ logger.info("Running the face detector on image: %s", image_path)
300
+ predictions[image_path] = self.detect_from_image(image_path)
301
+
302
+ if self.verbose:
303
+ logger.info("The detector was successfully run on all %s images", len(files))
304
+
305
+ return predictions
306
+
307
+ @property
308
+ def reference_scale(self):
309
+ raise NotImplementedError
310
+
311
+ @property
312
+ def reference_x_shift(self):
313
+ raise NotImplementedError
314
+
315
+ @property
316
+ def reference_y_shift(self):
317
+ raise NotImplementedError
318
+
319
+ @staticmethod
320
+ def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
321
+ """Convert path (represented as a string) or torch.tensor to a numpy.ndarray
322
+
323
+ Arguments:
324
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
325
+ """
326
+ if isinstance(tensor_or_path, str):
327
+ return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
328
+ elif torch.is_tensor(tensor_or_path):
329
+ # Call cpu in case its coming from cuda
330
+ return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
331
+ elif isinstance(tensor_or_path, np.ndarray):
332
+ return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
333
+ else:
334
+ raise TypeError
335
+
336
+
337
+ class OVSFDDetector(OVFaceDetector):
338
+ def __init__(self, device, path_to_detector="models/face_detection.xml", verbose=False):
339
+ super(OVSFDDetector, self).__init__(device, verbose)
340
+
341
+ core = ov.Core()
342
+ self.face_detector = core.compile_model(path_to_detector, self.device)
343
+
344
+ def detect_from_image(self, tensor_or_path):
345
+ image = self.tensor_or_path_to_ndarray(tensor_or_path)
346
+
347
+ bboxlist = detect(self.face_detector, image, device="cpu")
348
+ keep = nms(bboxlist, 0.3)
349
+ bboxlist = bboxlist[keep, :]
350
+ bboxlist = [x for x in bboxlist if x[-1] > 0.5]
351
+
352
+ return bboxlist
353
+
354
+ def detect_from_batch(self, images):
355
+ bboxlists = batch_detect(self.face_detector, images, device="cpu")
356
+ keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
357
+ bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
358
+ bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
359
+
360
+ return bboxlists
361
+
362
+ @property
363
+ def reference_scale(self):
364
+ return 195
365
+
366
+ @property
367
+ def reference_x_shift(self):
368
+ return 0
369
+
370
+ @property
371
+ def reference_y_shift(self):
372
+ return 0
373
+
374
+
375
+ class LandmarksType(Enum):
376
+ """Enum class defining the type of landmarks to detect.
377
+
378
+ ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
379
+ ``_2halfD`` - this points represent the projection of the 3D points into 3D
380
+ ``_3D`` - detect the points ``(x,y,z)``` in a 3D space
381
+
382
+ """
383
+
384
+ _2D = 1
385
+ _2halfD = 2
386
+ _3D = 3
387
+
388
+
389
+ class NetworkSize(Enum):
390
+ # TINY = 1
391
+ # SMALL = 2
392
+ # MEDIUM = 3
393
+ LARGE = 4
394
+
395
+ def __new__(cls, value):
396
+ member = object.__new__(cls)
397
+ member._value_ = value
398
+ return member
399
+
400
+ def __int__(self):
401
+ return self.value
402
+
403
+
404
+ class OVFaceAlignment:
405
+ def __init__(
406
+ self, landmarks_type, network_size=NetworkSize.LARGE, device="CPU", flip_input=False, verbose=False, path_to_detector="models/face_detection.xml"
407
+ ):
408
+ self.device = device
409
+ self.flip_input = flip_input
410
+ self.landmarks_type = landmarks_type
411
+ self.verbose = verbose
412
+
413
+ network_size = int(network_size)
414
+
415
+ self.face_detector = OVSFDDetector(device=device, path_to_detector=path_to_detector, verbose=verbose)
416
+
417
+ def get_detections_for_batch(self, images):
418
+ images = images[..., ::-1]
419
+ detected_faces = self.face_detector.detect_from_batch(images.copy())
420
+ results = []
421
+
422
+ for i, d in enumerate(detected_faces):
423
+ if len(d) == 0:
424
+ results.append(None)
425
+ continue
426
+ d = d[0]
427
+ d = np.clip(d, 0, None)
428
+
429
+ x1, y1, x2, y2 = map(int, d[:-1])
430
+ results.append((x1, y1, x2, y2))
431
+
432
+ return results
433
+
434
+
435
+ def face_detect_ov(images, device, face_det_batch_size, pads, nosmooth, path_to_detector):
436
+ detector = OVFaceAlignment(LandmarksType._2D, flip_input=False, device=device, path_to_detector=path_to_detector)
437
+
438
+ batch_size = face_det_batch_size
439
+
440
+ print("face_detect_ov images[0].shape: ", images[0].shape)
441
+ while 1:
442
+ predictions = []
443
+ try:
444
+ for i in tqdm(range(0, len(images), batch_size)):
445
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i : i + batch_size])))
446
+ except RuntimeError:
447
+ if batch_size == 1:
448
+ raise RuntimeError("Image too big to run face detection on GPU. Please use the --resize_factor argument")
449
+ batch_size //= 2
450
+ print("Recovering from OOM error; New batch size: {}".format(batch_size))
451
+ continue
452
+ break
453
+
454
+ results = []
455
+ pady1, pady2, padx1, padx2 = pads
456
+ for rect, image in zip(predictions, images):
457
+ if rect is None:
458
+ # check this frame where the face was not detected.
459
+ cv2.imwrite("temp/faulty_frame.jpg", image)
460
+ raise ValueError("Face not detected! Ensure the video contains a face in all the frames.")
461
+
462
+ y1 = max(0, rect[1] - pady1)
463
+ y2 = min(image.shape[0], rect[3] + pady2)
464
+ x1 = max(0, rect[0] - padx1)
465
+ x2 = min(image.shape[1], rect[2] + padx2)
466
+
467
+ results.append([x1, y1, x2, y2])
468
+
469
+ boxes = np.array(results)
470
+ if not nosmooth:
471
+ boxes = get_smoothened_boxes(boxes, T=5)
472
+ results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
473
+
474
+ del detector
475
+ return results
476
+
477
+
478
+ def datagen(frames, mels, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, path_to_detector):
479
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
480
+
481
+ if box[0] == -1:
482
+ if not static:
483
+ # BGR2RGB for CNN face detection
484
+ face_det_results = face_detect_ov(frames, "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
485
+ else:
486
+ face_det_results = face_detect_ov([frames[0]], "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
487
+ else:
488
+ print("Using the specified bounding box instead of face detection...")
489
+ y1, y2, x1, x2 = box
490
+ face_det_results = [[f[y1:y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
491
+
492
+ for i, m in enumerate(mels):
493
+ idx = 0 if static else i % len(frames)
494
+ frame_to_save = frames[idx].copy()
495
+ face, coords = face_det_results[idx].copy()
496
+
497
+ face = cv2.resize(face, (img_size, img_size))
498
+
499
+ img_batch.append(face)
500
+ mel_batch.append(m)
501
+ frame_batch.append(frame_to_save)
502
+ coords_batch.append(coords)
503
+
504
+ if len(img_batch) >= wav2lip_batch_size:
505
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
506
+
507
+ img_masked = img_batch.copy()
508
+ img_masked[:, img_size // 2 :] = 0
509
+
510
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
511
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
512
+
513
+ yield img_batch, mel_batch, frame_batch, coords_batch
514
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
515
+
516
+ if len(img_batch) > 0:
517
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
518
+
519
+ img_masked = img_batch.copy()
520
+ img_masked[:, img_size // 2 :] = 0
521
+
522
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
523
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
524
+
525
+ yield img_batch, mel_batch, frame_batch, coords_batch
526
+
527
+
528
+ def ov_inference(
529
+ face_path,
530
+ audio_path,
531
+ face_detection_path="models/face_detection.xml",
532
+ wav2lip_path="models/wav2lip.xml",
533
+ inference_device="CPU",
534
+ wav2lip_batch_size=128,
535
+ outfile="results/result_voice.mp4",
536
+ resize_factor=1,
537
+ rotate=False,
538
+ crop=[0, -1, 0, -1],
539
+ mel_step_size=16,
540
+ box=[-1, -1, -1, -1],
541
+ static=False,
542
+ img_size=96,
543
+ face_det_batch_size=16,
544
+ pads=[0, 10, 0, 0],
545
+ nosmooth=False,
546
+ ):
547
+ print("Reading video frames...")
548
+
549
+ video_stream = cv2.VideoCapture(face_path)
550
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
551
+
552
+ full_frames = []
553
+ while 1:
554
+ still_reading, frame = video_stream.read()
555
+ if not still_reading:
556
+ video_stream.release()
557
+ break
558
+ if resize_factor > 1:
559
+ frame = cv2.resize(frame, (frame.shape[1] // resize_factor, frame.shape[0] // resize_factor))
560
+
561
+ if rotate:
562
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
563
+
564
+ y1, y2, x1, x2 = crop
565
+ if x2 == -1:
566
+ x2 = frame.shape[1]
567
+ if y2 == -1:
568
+ y2 = frame.shape[0]
569
+
570
+ frame = frame[y1:y2, x1:x2]
571
+
572
+ full_frames.append(frame)
573
+
574
+ print("Number of frames available for inference: " + str(len(full_frames)))
575
+
576
+ core = ov.Core()
577
+
578
+ if not audio_path.endswith(".wav"):
579
+ print("Extracting raw audio...")
580
+ command = "ffmpeg -y -i {} -strict -2 {}".format(audio_path, "temp/temp.wav")
581
+
582
+ subprocess.call(command, shell=True)
583
+ audio_path = "temp/temp.wav"
584
+
585
+ wav = audio.load_wav(audio_path, 16000)
586
+ mel = audio.melspectrogram(wav)
587
+ print(mel.shape)
588
+
589
+ if np.isnan(mel.reshape(-1)).sum() > 0:
590
+ raise ValueError("Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again")
591
+
592
+ mel_chunks = []
593
+ mel_idx_multiplier = 80.0 / fps
594
+ i = 0
595
+ while 1:
596
+ start_idx = int(i * mel_idx_multiplier)
597
+ if start_idx + mel_step_size > len(mel[0]):
598
+ mel_chunks.append(mel[:, len(mel[0]) - mel_step_size :])
599
+ break
600
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
601
+ i += 1
602
+
603
+ print("Length of mel chunks: {}".format(len(mel_chunks)))
604
+
605
+ full_frames = full_frames[: len(mel_chunks)]
606
+ batch_size = wav2lip_batch_size
607
+ gen = datagen(full_frames.copy(), mel_chunks, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, face_detection_path)
608
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, total=int(np.ceil(float(len(mel_chunks)) / batch_size)))):
609
+ if i == 0:
610
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
611
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
612
+ compiled_wav2lip_model = core.compile_model(wav2lip_path, inference_device)
613
+ print("Model loaded")
614
+
615
+ frame_h, frame_w = full_frames[0].shape[:-1]
616
+ out = cv2.VideoWriter("C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", cv2.VideoWriter_fourcc(*"DIVX"), fps, (frame_w, frame_h))
617
+ pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch.numpy(), "face_sequences": img_batch.numpy()})[0]
618
+ else:
619
+ img_batch = np.transpose(img_batch, (0, 3, 1, 2))
620
+ mel_batch = np.transpose(mel_batch, (0, 3, 1, 2))
621
+ pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch, "face_sequences": img_batch})[0]
622
+
624
+ pred_ov = pred_ov.transpose(0, 2, 3, 1) * 255.0
625
+ for p, f, c in zip(pred_ov, frames, coords):
626
+ y1, y2, x1, x2 = c
627
+ p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
628
+
629
+ f[y1:y2, x1:x2] = p
630
+ out.write(f)
631
+
632
+ out.release()
633
+
634
+ command = "ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}".format(audio_path, "C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", outfile)
635
+ subprocess.call(command, shell=True)
636
+
637
+ return outfile
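Note: the mel chunking above relies on Wav2Lip's melspectrogram producing 80 mel frames per second of audio, so mel_idx_multiplier = 80 / fps maps each video frame to its position in the audio. A minimal sketch of that arithmetic, with illustrative numbers only:

    # Illustration of the chunk indexing used in ov_inference (assumed 25 fps clip)
    fps = 25
    mel_step_size = 16                    # each chunk spans 16 mel frames (~0.2 s of audio)
    mel_idx_multiplier = 80.0 / fps       # 3.2 mel frames per video frame
    start_idx = int(10 * mel_idx_multiplier)   # the chunk for video frame 10 starts at mel frame 32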
src/ov_wav2lip_helper.py ADDED
@@ -0,0 +1,68 @@
1
+ import numpy as np
2
+ import sys
3
+ import os
4
+ import openvino as ov
5
+ import torch
6
+
7
+ from pathlib import Path
8
+ # Añade `src` al `sys.path` para que Python encuentre `utils/notebook_utils.py`
9
+ sys.path.append(str(Path(__file__).resolve().parent))
10
+
11
+ # Importa `download_file` desde `notebook_utils`
12
+ from utils.notebook_utils import download_file
13
+ from huggingface_hub import hf_hub_download
14
+ from Wav2Lip.face_detection.detection.sfd.net_s3fd import s3fd
15
+ from Wav2Lip.models import Wav2Lip
16
+
17
+
18
+
19
+ def _load(checkpoint_path):
20
+ checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
21
+ return checkpoint
22
+
23
+
24
+ def load_model(path):
25
+ model = Wav2Lip()
26
+ print("Load checkpoint from: {}".format(path))
27
+ checkpoint = _load(path)
28
+ s = checkpoint["state_dict"]
29
+ new_s = {}
30
+ for k, v in s.items():
31
+ new_s[k.replace("module.", "")] = v
32
+ model.load_state_dict(new_s)
33
+
34
+ return model.eval()
35
+
36
+
37
+ def download_and_convert_models(ov_face_detection_model_path, ov_wav2lip_model_path):
38
+ models_urls = {"s3fd": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"}
39
+ path_to_detector = "checkpoints/face_detection.pth"
40
+ # Convert Face Detection Model
41
+ print("Convert Face Detection Model ...")
42
+ if not os.path.isfile(path_to_detector):
43
+ download_file(models_urls["s3fd"])
44
+ if not os.path.exists("checkpoints"):
45
+ os.mkdir("checkpoints")
46
+ os.replace("s3fd-619a316812.pth", path_to_detector)
47
+ model_weights = torch.load(path_to_detector)
48
+
49
+ face_detector = s3fd()
50
+ face_detector.load_state_dict(model_weights)
51
+
52
+ if not ov_face_detection_model_path.exists():
53
+ face_detection_dummy_inputs = torch.FloatTensor(np.random.rand(1, 3, 768, 576))
54
+ face_detection_ov_model = ov.convert_model(face_detector, example_input=face_detection_dummy_inputs)
55
+ ov.save_model(face_detection_ov_model, ov_face_detection_model_path)
56
+ print("Converted face detection OpenVINO model: ", ov_face_detection_model_path)
57
+
58
+ print("Convert Wav2Lip Model ...")
59
+ path_to_wav2lip = hf_hub_download(repo_id="numz/wav2lip_studio", filename="Wav2lip/wav2lip.pth", local_dir="checkpoints")
60
+ wav2lip = load_model(path_to_wav2lip)
61
+ img_batch = torch.FloatTensor(np.random.rand(123, 6, 96, 96))
62
+ mel_batch = torch.FloatTensor(np.random.rand(123, 1, 80, 16))
63
+
64
+ if not ov_wav2lip_model_path.exists():
65
+ example_inputs = {"audio_sequences": mel_batch, "face_sequences": img_batch}
66
+ wav2lip_ov_model = ov.convert_model(wav2lip, example_input=example_inputs)
67
+ ov.save_model(wav2lip_ov_model, ov_wav2lip_model_path)
68
+ print("Converted face detection OpenVINO model: ", ov_wav2lip_model_path)
src/run_inference.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ from ov_inference import ov_inference
3
+ import soundfile as sf
4
+ import cv2
5
+
6
+ def verificar_archivos(video_path, audio_path):
7
+ """
8
+ Verifica que los archivos de video y audio existen y son legibles.
9
+
10
+ Args:
11
+ video_path (str): Ruta del archivo de video.
12
+ audio_path (str): Ruta del archivo de audio.
13
+
14
+ Returns:
15
+ bool: True si ambos archivos son legibles, False en caso contrario.
16
+ """
17
+ # Verificar el archivo de video
18
+ if not os.path.exists(video_path):
19
+ print(f"Error: El archivo de video no existe en la ruta {video_path}")
20
+ return False
21
+ else:
22
+ # Intentar abrir el video
23
+ cap = cv2.VideoCapture(video_path)
24
+ if not cap.isOpened():
25
+ print(f"Error: No se puede abrir el archivo de video en {video_path}")
26
+ return False
27
+ else:
28
+ print(f"Archivo de video {video_path} está accesible.")
29
+ cap.release()
30
+
31
+ # Verificar el archivo de audio
32
+ if not os.path.exists(audio_path):
33
+ print(f"Error: El archivo de audio no existe en la ruta {audio_path}")
34
+ return False
35
+ else:
36
+ try:
37
+ # Intentar abrir el archivo de audio
38
+ with sf.SoundFile(audio_path) as audio_file:
39
+ print(f"Archivo de audio {audio_path} está accesible.")
40
+ except Exception as e:
41
+ print(f"Error al leer el archivo de audio: {e}")
42
+ return False
43
+
44
+ return True
45
+
46
+ # Rutas de archivos
47
+ #video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
48
+ video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")
49
+ #audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
50
+ audio_path = os.path.abspath("../miwav2lipv6/assets/audio/audio.wav")
51
+ face_detection_path = os.path.abspath("../miwav2lipv6/models/face_detection.xml")
52
+ wav2lip_path = os.path.abspath("../miwav2lipv6/models/wav2lip.xml")
53
+ outfile = os.path.abspath("../miwav2lipv6/results/result_voice.mp4")
54
+
55
+ # Verificar archivos antes de llamar a ov_inference
56
+ if verificar_archivos(video_path, audio_path):
57
+ ov_inference(
58
+ video_path,
59
+ audio_path,
60
+ face_detection_path=face_detection_path,
61
+ wav2lip_path=wav2lip_path,
62
+ inference_device="CPU",
63
+ outfile=outfile,
64
+ resize_factor=2,
65
+ )
66
+ else:
67
+ print("No se pudo proceder con la inferencia debido a problemas con los archivos.")
src/text_to_speech.py ADDED
@@ -0,0 +1,36 @@
1
+ # text_to_speech.py
2
+
3
+ from gtts import gTTS
4
+ import os
5
+
6
+ # Rutas de los archivos
7
+ #TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt"
8
+ TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt"
9
+ OUTPUT_AUDIO_PATH = "C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav"
10
+
11
+ def generar_audio_desde_texto():
12
+ """
13
+ Convierte el texto de TRANSCRIPTION_TEXT_PATH (actualmente OpenAI_response.txt) a un archivo de audio en español (audio.wav).
14
+ """
15
+ try:
16
+ # Verificar si el archivo de transcripción existe
17
+ if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
18
+ print("Error: No se encontró el archivo de transcripción.")
19
+ return
20
+
21
+ # Leer el contenido de transcripcion.txt
22
+ with open(TRANSCRIPTION_TEXT_PATH, "r", encoding="utf-8") as file:
23
+ texto = file.read()
24
+
25
+ # Generar el audio en español usando gTTS
26
+ tts = gTTS(text=texto, lang='es', slow=False)
27
+ tts.save(OUTPUT_AUDIO_PATH)
28
+
29
+ print(f"Audio generado correctamente en: {OUTPUT_AUDIO_PATH}")
30
+
31
+ except Exception as e:
32
+ print(f"Error al generar el audio: {e}")
33
+
34
+ if __name__ == "__main__":
35
+ generar_audio_desde_texto()
36
+
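One caveat worth flagging: gTTS always writes MP3-encoded data, even when the target filename ends in .wav, and ov_inference skips its own ffmpeg re-encode for paths that already end in .wav. A hedged workaround sketch (assumes ffmpeg is on PATH; the intermediate .mp3 name is illustrative):

    import subprocess
    tts.save("assets/audio/audio.mp3")   # save under the container format gTTS actually produces
    subprocess.call("ffmpeg -y -i assets/audio/audio.mp3 -ar 16000 -ac 1 assets/audio/audio.wav", shell=True)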
src/utils/notebook_utils.py ADDED
@@ -0,0 +1,708 @@
1
+ import os
2
+ import platform
3
+ import sys
4
+ import threading
5
+ import time
6
+ import urllib.parse
7
+
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import List, NamedTuple, Optional, Tuple
11
+ from tqdm import tqdm
12
+
13
+ import numpy as np
14
+ from openvino.runtime import Core, Type, get_version
15
+ from IPython.display import HTML, Image, display
16
+
17
+ import openvino as ov
18
+ from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
19
+ from openvino.runtime import opset10 as ops
20
+
21
+
22
+ # ## Files
23
+ #
24
+ # Load an image, download a file, download an IR model, and create a progress bar to show download progress.
25
+
26
+ def device_widget(default="AUTO", exclude=None, added=None):
27
+ import openvino as ov
28
+ import ipywidgets as widgets
29
+
30
+ core = ov.Core()
31
+
32
+ supported_devices = core.available_devices + ["AUTO"]
33
+ exclude = exclude or []
34
+ if exclude:
35
+ for ex_device in exclude:
36
+ if ex_device in supported_devices:
37
+ supported_devices.remove(ex_device)
38
+
39
+ added = added or []
40
+ if added:
41
+ for add_device in added:
42
+ if add_device not in supported_devices:
43
+ supported_devices.append(add_device)
44
+
45
+ device = widgets.Dropdown(
46
+ options=supported_devices,
47
+ value=default,
48
+ description="Device:",
49
+ disabled=False,
50
+ )
51
+ return device
52
+
53
+
54
+ def quantization_widget(default=True):
55
+ import ipywidgets as widgets
56
+
57
+ to_quantize = widgets.Checkbox(
58
+ value=default,
59
+ description="Quantization",
60
+ disabled=False,
61
+ )
62
+
63
+ return to_quantize
64
+
65
+
66
+ def pip_install(*args):
67
+ import subprocess # nosec - disable B404:import-subprocess check
68
+
69
+ cli_args = []
70
+ for arg in args:
71
+ cli_args.extend(str(arg).split(" "))
72
+ subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], shell=(platform.system() == "Windows"), check=True)
73
+
74
+
75
+ def load_image(path: str) -> np.ndarray:
76
+ """
77
+ Loads an image from `path` and returns it as BGR numpy array. `path`
78
+ should point to an image file, either a local filename or a url. The image is
79
+ not stored to the filesystem. Use the `download_file` function to download and
80
+ store an image.
81
+
82
+ :param path: Local path name or URL to image.
83
+ :return: image as BGR numpy array
84
+ """
85
+ import cv2
86
+ import requests
87
+
88
+ if path.startswith("http"):
89
+ # Set User-Agent to Mozilla because some websites block
90
+ # requests with User-Agent Python
91
+ response = requests.get(path, headers={"User-Agent": "Mozilla/5.0"})
92
+ array = np.asarray(bytearray(response.content), dtype="uint8")
93
+ image = cv2.imdecode(array, -1) # Loads the image as BGR
94
+ else:
95
+ image = cv2.imread(path)
96
+ return image
97
+
98
+
99
+ def download_file(
100
+ url: PathLike,
101
+ filename: PathLike = None,
102
+ directory: PathLike = None,
103
+ show_progress: bool = True,
104
+ silent: bool = False,
105
+ timeout: int = 10,
106
+ ) -> PathLike:
107
+ """
108
+ Download a file from a url and save it to the local filesystem. The file is saved to the
109
+ current directory by default, or to `directory` if specified. If a filename is not given,
110
+ the filename of the URL will be used.
111
+
112
+ :param url: URL that points to the file to download
113
+ :param filename: Name of the local file to save. Should point to the name of the file only,
114
+ not the full path. If None the filename from the url will be used
115
+ :param directory: Directory to save the file to. Will be created if it doesn't exist
116
+ If None the file will be saved to the current working directory
117
+ :param show_progress: If True, show a TQDM ProgressBar
118
+ :param silent: If True, do not print a message if the file already exists
119
+ :param timeout: Number of seconds before cancelling the connection attempt
120
+ :return: path to downloaded file
121
+ """
122
+ from tqdm.notebook import tqdm_notebook
123
+ import requests
124
+
125
+ filename = filename or Path(urllib.parse.urlparse(url).path).name
126
+ chunk_size = 16384 # make chunks bigger so that not too many updates are triggered for Jupyter front-end
127
+
128
+ filename = Path(filename)
129
+ if len(filename.parts) > 1:
130
+ raise ValueError(
131
+ "`filename` should refer to the name of the file, excluding the directory. "
132
+ "Use the `directory` parameter to specify a target directory for the downloaded file."
133
+ )
134
+
135
+ # create the directory if it does not exist, and add the directory to the filename
136
+ if directory is not None:
137
+ directory = Path(directory)
138
+ directory.mkdir(parents=True, exist_ok=True)
139
+ filename = directory / Path(filename)
140
+
141
+ try:
142
+ response = requests.get(url=url, headers={"User-agent": "Mozilla/5.0"}, stream=True)
143
+ response.raise_for_status()
144
+ except (
145
+ requests.exceptions.HTTPError
146
+ ) as error: # For error associated with not-200 codes. Will output something like: "404 Client Error: Not Found for url: {url}"
147
+ raise Exception(error) from None
148
+ except requests.exceptions.Timeout:
149
+ raise Exception(
150
+ "Connection timed out. If you access the internet through a proxy server, please "
151
+ "make sure the proxy is set in the shell from where you launched Jupyter."
152
+ ) from None
153
+ except requests.exceptions.RequestException as error:
154
+ raise Exception(f"File downloading failed with error: {error}") from None
155
+
156
+ # download the file if it does not exist, or if it exists with an incorrect file size
157
+ filesize = int(response.headers.get("Content-length", 0))
158
+ if not filename.exists() or (os.stat(filename).st_size != filesize):
159
+ with tqdm(
160
+ total=filesize,
161
+ unit="B",
162
+ unit_scale=True,
163
+ unit_divisor=1024,
164
+ desc=str(filename),
165
+ disable=not show_progress,
166
+ ) as progress_bar:
167
+ with open(filename, "wb") as file_object:
168
+ for chunk in response.iter_content(chunk_size):
169
+ file_object.write(chunk)
170
+ progress_bar.update(len(chunk))
171
+ progress_bar.refresh()
172
+ else:
173
+ if not silent:
174
+ print(f"'{filename}' already exists.")
175
+
176
+ response.close()
177
+
178
+ return filename.resolve()
179
+
180
+
181
+ def download_ir_model(model_xml_url: str, destination_folder: PathLike = None) -> PathLike:
182
+ """
183
+ Download IR model from `model_xml_url`. Downloads model xml and bin file; the weights file is
184
+ assumed to exist at the same location and name as model_xml_url with a ".bin" extension.
185
+
186
+ :param model_xml_url: URL to model xml file to download
187
+ :param destination_folder: Directory where downloaded model xml and bin are saved. If None, model
188
+ files are saved to the current directory
189
+ :return: path to downloaded xml model file
190
+ """
191
+ model_bin_url = model_xml_url[:-4] + ".bin"
192
+ model_xml_path = download_file(model_xml_url, directory=destination_folder, show_progress=False)
193
+ download_file(model_bin_url, directory=destination_folder)
194
+ return model_xml_path
195
+
196
+
197
+ # ## Images
198
+
199
+ # ### Convert Pixel Data
200
+ #
201
+ # Normalize image pixel values between 0 and 1, and convert images to RGB and BGR.
202
+
203
+ # In[ ]:
204
+
205
+
206
+ def normalize_minmax(data):
207
+ """
208
+ Normalizes the values in `data` between 0 and 1
209
+ """
210
+ if data.max() == data.min():
211
+ raise ValueError("Normalization is not possible because all elements of" f"`data` have the same value: {data.max()}.")
212
+ return (data - data.min()) / (data.max() - data.min())
213
+
214
+
215
+ def to_rgb(image_data: np.ndarray) -> np.ndarray:
216
+ """
217
+ Convert image_data from BGR to RGB
218
+ """
219
+ import cv2
220
+
221
+ return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
222
+
223
+
224
+ def to_bgr(image_data: np.ndarray) -> np.ndarray:
225
+ """
226
+ Convert image_data from RGB to BGR
227
+ """
228
+ import cv2
229
+
230
+ return cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
231
+
232
+
233
+ # ## Videos
234
+
235
+ # ### Video Player
236
+ #
237
+ # Custom video player to fulfill FPS requirements. You can set target FPS and output size, flip the video horizontally or skip first N frames.
238
+
239
+ # In[ ]:
240
+
241
+
242
+ class VideoPlayer:
243
+ """
244
+ Custom video player to fulfill FPS requirements. You can set target FPS and output size,
245
+ flip the video horizontally or skip first N frames.
246
+
247
+ :param source: Video source. It could be either camera device or video file.
248
+ :param size: Output frame size.
249
+ :param flip: Flip source horizontally.
250
+ :param fps: Target FPS.
251
+ :param skip_first_frames: Skip first N frames.
252
+ """
253
+
254
+ def __init__(self, source, size=None, flip=False, fps=None, skip_first_frames=0, width=1280, height=720):
255
+ import cv2
256
+
257
+ self.cv2 = cv2 # This is done to access the package in class methods
258
+ self.__cap = cv2.VideoCapture(source)
259
+ # try HD by default to get better video quality
260
+ self.__cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
261
+ self.__cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
262
+
263
+ if not self.__cap.isOpened():
264
+ raise RuntimeError(f"Cannot open {'camera' if isinstance(source, int) else ''} {source}")
265
+ # skip first N frames
266
+ self.__cap.set(cv2.CAP_PROP_POS_FRAMES, skip_first_frames)
267
+ # fps of input file
268
+ self.__input_fps = self.__cap.get(cv2.CAP_PROP_FPS)
269
+ if self.__input_fps <= 0:
270
+ self.__input_fps = 60
271
+ # target fps given by user
272
+ self.__output_fps = fps if fps is not None else self.__input_fps
273
+ self.__flip = flip
274
+ self.__size = None
275
+ self.__interpolation = None
276
+ if size is not None:
277
+ self.__size = size
278
+ # AREA better for shrinking, LINEAR better for enlarging
279
+ self.__interpolation = cv2.INTER_AREA if size[0] < self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH) else cv2.INTER_LINEAR
280
+ # first frame
281
+ _, self.__frame = self.__cap.read()
282
+ self.__lock = threading.Lock()
283
+ self.__thread = None
284
+ self.__stop = False
285
+
286
+ """
287
+ Start playing.
288
+ """
289
+
290
+ def start(self):
291
+ self.__stop = False
292
+ self.__thread = threading.Thread(target=self.__run, daemon=True)
293
+ self.__thread.start()
294
+
295
+ """
296
+ Stop playing and release resources.
297
+ """
298
+
299
+ def stop(self):
300
+ self.__stop = True
301
+ if self.__thread is not None:
302
+ self.__thread.join()
303
+ self.__cap.release()
304
+
305
+ def __run(self):
306
+ prev_time = 0
307
+ while not self.__stop:
308
+ t1 = time.time()
309
+ ret, frame = self.__cap.read()
310
+ if not ret:
311
+ break
312
+
313
+ # fulfill target fps
314
+ if 1 / self.__output_fps < time.time() - prev_time:
315
+ prev_time = time.time()
316
+ # replace by current frame
317
+ with self.__lock:
318
+ self.__frame = frame
319
+
320
+ t2 = time.time()
321
+ # time to wait [s] to fulfill input fps
322
+ wait_time = 1 / self.__input_fps - (t2 - t1)
323
+ # wait until
324
+ time.sleep(max(0, wait_time))
325
+
326
+ self.__frame = None
327
+
328
+ """
329
+ Get current frame.
330
+ """
331
+
332
+ def next(self):
333
+ import cv2
334
+
335
+ with self.__lock:
336
+ if self.__frame is None:
337
+ return None
338
+ # need to copy frame, because can be cached and reused if fps is low
339
+ frame = self.__frame.copy()
340
+ if self.__size is not None:
341
+ frame = self.cv2.resize(frame, self.__size, interpolation=self.__interpolation)
342
+ if self.__flip:
343
+ frame = self.cv2.flip(frame, 1)
344
+ return frame
345
+
346
+
347
+ # ## Visualization
348
+
349
+ # ### Segmentation
350
+ #
351
+ # Define a SegmentationMap NamedTuple that keeps the labels and colormap for a segmentation project/dataset. Create CityScapesSegmentation and BinarySegmentation SegmentationMaps. Create a function to convert a segmentation map to an RGB image with a colormap, and to show the segmentation result as an overlay over the original image.
352
+
353
+ # In[ ]:
354
+
355
+
356
+ class Label(NamedTuple):
357
+ index: int
358
+ color: Tuple
359
+ name: Optional[str] = None
360
+
361
+
362
+ # In[ ]:
363
+
364
+
365
+ class SegmentationMap(NamedTuple):
366
+ labels: List
367
+
368
+ def get_colormap(self):
369
+ return np.array([label.color for label in self.labels])
370
+
371
+ def get_labels(self):
372
+ labelnames = [label.name for label in self.labels]
373
+ if any(labelnames):
374
+ return labelnames
375
+ else:
376
+ return None
377
+
378
+
379
+ # In[ ]:
380
+
381
+
382
+ cityscape_labels = [
383
+ Label(index=0, color=(128, 64, 128), name="road"),
384
+ Label(index=1, color=(244, 35, 232), name="sidewalk"),
385
+ Label(index=2, color=(70, 70, 70), name="building"),
386
+ Label(index=3, color=(102, 102, 156), name="wall"),
387
+ Label(index=4, color=(190, 153, 153), name="fence"),
388
+ Label(index=5, color=(153, 153, 153), name="pole"),
389
+ Label(index=6, color=(250, 170, 30), name="traffic light"),
390
+ Label(index=7, color=(220, 220, 0), name="traffic sign"),
391
+ Label(index=8, color=(107, 142, 35), name="vegetation"),
392
+ Label(index=9, color=(152, 251, 152), name="terrain"),
393
+ Label(index=10, color=(70, 130, 180), name="sky"),
394
+ Label(index=11, color=(220, 20, 60), name="person"),
395
+ Label(index=12, color=(255, 0, 0), name="rider"),
396
+ Label(index=13, color=(0, 0, 142), name="car"),
397
+ Label(index=14, color=(0, 0, 70), name="truck"),
398
+ Label(index=15, color=(0, 60, 100), name="bus"),
399
+ Label(index=16, color=(0, 80, 100), name="train"),
400
+ Label(index=17, color=(0, 0, 230), name="motorcycle"),
401
+ Label(index=18, color=(119, 11, 32), name="bicycle"),
402
+ Label(index=19, color=(255, 255, 255), name="background"),
403
+ ]
404
+
405
+ CityScapesSegmentation = SegmentationMap(cityscape_labels)
406
+
407
+ binary_labels = [
408
+ Label(index=0, color=(255, 255, 255), name="background"),
409
+ Label(index=1, color=(0, 0, 0), name="foreground"),
410
+ ]
411
+
412
+ BinarySegmentation = SegmentationMap(binary_labels)
413
+
414
+
415
+ # In[ ]:
416
+
417
+
418
+ def segmentation_map_to_image(result: np.ndarray, colormap: np.ndarray, remove_holes: bool = False) -> np.ndarray:
419
+ """
420
+ Convert network result of floating point numbers to an RGB image with
421
+ integer values from 0-255 by applying a colormap.
422
+
423
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
424
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
425
+ :param remove_holes: If True, remove holes in the segmentation result.
426
+ :return: An RGB image where each pixel is an int8 value according to colormap.
427
+ """
428
+ import cv2
429
+
430
+ if len(result.shape) != 2 and result.shape[0] != 1:
431
+ raise ValueError(f"Expected result with shape (H,W) or (1,H,W), got result with shape {result.shape}")
432
+
433
+ if len(np.unique(result)) > colormap.shape[0]:
434
+ raise ValueError(
435
+ f"Expected max {colormap[0]} classes in result, got {len(np.unique(result))} "
436
+ "different output values. Please make sure to convert the network output to "
437
+ "pixel values before calling this function."
438
+ )
439
+ elif result.shape[0] == 1:
440
+ result = result.squeeze(0)
441
+
442
+ result = result.astype(np.uint8)
443
+
444
+ contour_mode = cv2.RETR_EXTERNAL if remove_holes else cv2.RETR_TREE
445
+ mask = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8)
446
+ for label_index, color in enumerate(colormap):
447
+ label_index_map = result == label_index
448
+ label_index_map = label_index_map.astype(np.uint8) * 255
449
+ contours, hierarchies = cv2.findContours(label_index_map, contour_mode, cv2.CHAIN_APPROX_SIMPLE)
450
+ cv2.drawContours(
451
+ mask,
452
+ contours,
453
+ contourIdx=-1,
454
+ color=color.tolist(),
455
+ thickness=cv2.FILLED,
456
+ )
457
+
458
+ return mask
459
+
460
+
461
+ def segmentation_map_to_overlay(image, result, alpha, colormap, remove_holes=False) -> np.ndarray:
462
+ """
463
+ Returns a new image where a segmentation mask (created with colormap) is overlayed on
464
+ the source image.
465
+
466
+ :param image: Source image.
467
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
468
+ :param alpha: Alpha transparency value for the overlay image.
469
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
470
+ :param remove_holes: If True, remove holes in the segmentation result.
471
+ :return: An RGB image with the segmentation mask overlaid on the source image.
472
+ """
473
+ import cv2
474
+
475
+ if len(image.shape) == 2:
476
+ image = np.repeat(np.expand_dims(image, -1), 3, 2)
477
+ mask = segmentation_map_to_image(result, colormap, remove_holes)
478
+ image_height, image_width = image.shape[:2]
479
+ mask = cv2.resize(src=mask, dsize=(image_width, image_height))
480
+ return cv2.addWeighted(mask, alpha, image, 1 - alpha, 0)
481
+
482
+
483
+ # ### Network Results
484
+ #
485
+ # Show network result image, optionally together with the source image and a legend with labels.
486
+
487
+ # In[ ]:
488
+
489
+
490
+ def viz_result_image(
491
+ result_image: np.ndarray,
492
+ source_image: np.ndarray = None,
493
+ source_title: str = None,
494
+ result_title: str = None,
495
+ labels: List[Label] = None,
496
+ resize: bool = False,
497
+ bgr_to_rgb: bool = False,
498
+ hide_axes: bool = False,
499
+ ):
500
+ """
501
+ Show result image, optionally together with source images, and a legend with labels.
502
+
503
+ :param result_image: Numpy array of RGB result image.
504
+ :param source_image: Numpy array of source image. If provided this image will be shown
505
+ next to the result image. source_image is expected to be in RGB format.
506
+ Set bgr_to_rgb to True if source_image is in BGR format.
507
+ :param source_title: Title to display for the source image.
508
+ :param result_title: Title to display for the result image.
509
+ :param labels: List of labels. If provided, a legend will be shown with the given labels.
510
+ :param resize: If true, resize the result image to the same shape as the source image.
511
+ :param bgr_to_rgb: If true, convert the source image from BGR to RGB. Use this option if
512
+ source_image is a BGR image.
513
+ :param hide_axes: If true, do not show matplotlib axes.
514
+ :return: Matplotlib figure with result image
515
+ """
516
+ import cv2
517
+ import matplotlib.pyplot as plt
518
+ from matplotlib.lines import Line2D
519
+
520
+ if bgr_to_rgb:
521
+ source_image = to_rgb(source_image)
522
+ if resize:
523
+ result_image = cv2.resize(result_image, (source_image.shape[1], source_image.shape[0]))
524
+
525
+ num_images = 1 if source_image is None else 2
526
+
527
+ fig, ax = plt.subplots(1, num_images, figsize=(16, 8), squeeze=False)
528
+ if source_image is not None:
529
+ ax[0, 0].imshow(source_image)
530
+ ax[0, 0].set_title(source_title)
531
+
532
+ ax[0, num_images - 1].imshow(result_image)
533
+ ax[0, num_images - 1].set_title(result_title)
534
+
535
+ if hide_axes:
536
+ for a in ax.ravel():
537
+ a.axis("off")
538
+ if labels:
539
+ colors = labels.get_colormap()
540
+ lines = [
541
+ Line2D(
542
+ [0],
543
+ [0],
544
+ color=[item / 255 for item in c.tolist()],
545
+ linewidth=3,
546
+ linestyle="-",
547
+ )
548
+ for c in colors
549
+ ]
550
+ plt.legend(
551
+ lines,
552
+ labels.get_labels(),
553
+ bbox_to_anchor=(1, 1),
554
+ loc="upper left",
555
+ prop={"size": 12},
556
+ )
557
+ plt.close(fig)
558
+ return fig
559
+
560
+
561
+ # ### Live Inference
562
+
563
+ # In[ ]:
564
+
565
+
566
+ def show_array(frame: np.ndarray, display_handle=None):
567
+ """
568
+ Display array `frame`. Replace information at `display_handle` with `frame`
569
+ encoded as jpeg image. `frame` is expected to have data in BGR order.
570
+
571
+ Create a display_handle with: `display_handle = display(display_id=True)`
572
+ """
573
+ import cv2
574
+
575
+ _, frame = cv2.imencode(ext=".jpeg", img=frame)
576
+ if display_handle is None:
577
+ display_handle = display(Image(data=frame.tobytes()), display_id=True)
578
+ else:
579
+ display_handle.update(Image(data=frame.tobytes()))
580
+ return display_handle
581
+
582
+
583
+ # ## Checks and Alerts
584
+ #
585
+ # Create an alert class to show stylized info/error/warning messages and a `check_device` function that checks whether a given device is available.
586
+
587
+ # In[ ]:
588
+
589
+
590
+ class NotebookAlert(Exception):
591
+ def __init__(self, message: str, alert_class: str):
592
+ """
593
+ Show an alert box with the given message.
594
+
595
+ :param message: The message to display.
596
+ :param alert_class: The class for styling the message. Options: info, warning, success, danger.
597
+ """
598
+ self.message = message
599
+ self.alert_class = alert_class
600
+ self.show_message()
601
+
602
+ def show_message(self):
603
+ display(HTML(f"""<div class="alert alert-{self.alert_class}">{self.message}"""))
604
+
605
+
606
+ class DeviceNotFoundAlert(NotebookAlert):
607
+ def __init__(self, device: str):
608
+ """
609
+ Show a warning message about an unavailable device. This class does not check whether or
610
+ not the device is available, use the `check_device` function to check this. `check_device`
611
+ also shows the warning if the device is not found.
612
+
613
+ :param device: The unavailable device.
614
+ :return: A formatted alert box with the message that `device` is not available, and a list
615
+ of devices that are available.
616
+ """
617
+ ie = Core()
618
+ supported_devices = ie.available_devices
619
+ self.message = f"Running this cell requires a {device} device, " "which is not available on this system. "
620
+ self.alert_class = "warning"
621
+ if len(supported_devices) == 1:
622
+ self.message += f"The following device is available: {ie.available_devices[0]}"
623
+ else:
624
+ self.message += "The following devices are available: " f"{', '.join(ie.available_devices)}"
625
+ super().__init__(self.message, self.alert_class)
626
+
627
+
628
+ def check_device(device: str) -> bool:
629
+ """
630
+ Check if the specified device is available on the system.
631
+
632
+ :param device: Device to check. e.g. CPU, GPU
633
+ :return: True if the device is available, False if not. If the device is not available,
634
+ a DeviceNotFoundAlert will be shown.
635
+ """
636
+ ie = Core()
637
+ if device not in ie.available_devices:
638
+ DeviceNotFoundAlert(device)
639
+ return False
640
+ else:
641
+ return True
642
+
643
+
644
+ def check_openvino_version(version: str) -> bool:
645
+ """
646
+ Check if the specified OpenVINO version is installed.
647
+
648
+ :param version: the OpenVINO version to check. Example: 2021.4
649
+ :return: True if the version is installed, False if not. If the version is not installed,
650
+ an alert message will be shown.
651
+ """
652
+ installed_version = get_version()
653
+ if version not in installed_version:
654
+ NotebookAlert(
655
+ f"This notebook requires OpenVINO {version}. "
656
+ f"The version on your system is: <i>{installed_version}</i>.<br>"
657
+ "Please run <span style='font-family:monospace'>pip install --upgrade -r requirements.txt</span> "
658
+ "in the openvino_env environment to install this version. "
659
+ "See the <a href='https://github.com/openvinotoolkit/openvino_notebooks'>"
660
+ "OpenVINO Notebooks README</a> for detailed instructions",
661
+ alert_class="danger",
662
+ )
663
+ return False
664
+ else:
665
+ return True
666
+
667
+
668
+ packed_layername_tensor_dict_list = [{"name": "aten::mul/Multiply"}]
669
+
670
+
671
+ class ReplaceTensor(MatcherPass):
672
+ def __init__(self, packed_layername_tensor_dict_list):
673
+ MatcherPass.__init__(self)
674
+ self.model_changed = False
675
+
676
+ param = WrapType("opset10.Multiply")
677
+
678
+ def callback(matcher: Matcher) -> bool:
679
+ root = matcher.get_match_root()
680
+ if root is None:
681
+ return False
682
+ for y in packed_layername_tensor_dict_list:
683
+ root_name = root.get_friendly_name()
684
+ if root_name.find(y["name"]) != -1:
685
+ max_fp16 = np.array([[[[-np.finfo(np.float16).max]]]]).astype(np.float32)
686
+ new_tenser = ops.constant(max_fp16, Type.f32, name="Constant_4431")
687
+ root.set_arguments([root.input_value(0).node, new_tenser])
688
+ packed_layername_tensor_dict_list.remove(y)
689
+
690
+ return True
691
+
692
+ self.register_matcher(Matcher(param, "ReplaceTensor"), callback)
693
+
694
+
695
+ def optimize_bge_embedding(model_path, output_model_path):
696
+ """
697
+ optimize_bge_embedding used to optimize BGE model for NPU device
698
+
699
+ Arguments:
700
+ model_path {str} -- original BGE IR model path
701
+ output_model_path {str} -- Converted BGE IR model path
702
+ """
703
+ core = Core()
704
+ ov_model = core.read_model(model_path)
705
+ manager = Manager()
706
+ manager.register_pass(ReplaceTensor(packed_layername_tensor_dict_list))
707
+ manager.run_passes(ov_model)
708
+ ov.save_model(ov_model, output_model_path, compress_to_fp16=False)
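For reference, a minimal sketch of how download_file from this module can be called (the URL is the s3fd checkpoint URL used in ov_wav2lip_helper.py; the directory argument is optional):

    from utils.notebook_utils import download_file

    # Downloads the checkpoint into checkpoints/ and returns the resolved local path
    local_path = download_file(
        "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
        directory="checkpoints",
    )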
src/utils/pip_helper.py ADDED
@@ -0,0 +1,10 @@
1
+ import sys
2
+
3
+
4
+ def pip_install(*args):
5
+ import subprocess # nosec - disable B404:import-subprocess check
6
+
7
+ cli_args = []
8
+ for arg in args:
9
+ cli_args.extend(str(arg).split(" "))
10
+ subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], check=True)
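A small usage sketch (the package specifiers are illustrative and mirror requirements.txt):

    from utils.pip_helper import pip_install

    pip_install("openvino>=2024.4.0", "nncf>=2.13.0")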
src/whisper_audio_extractor.py ADDED
@@ -0,0 +1,47 @@
1
+ # whisper_audio_extractor.py
2
+
3
+ import sounddevice as sd
4
+ from scipy.io.wavfile import write
5
+ import whisper
6
+ import os
7
+
8
+ # Ruta para guardar el archivo de audio temporalmente
9
+ AUDIO_PATH = os.path.join("..", "assets", "audio", "recorded_audio.wav")
10
+
11
+ def record_audio(duration=5, sample_rate=44100):
12
+ """
13
+ Graba el audio del micrófono durante un tiempo específico y lo guarda como archivo WAV.
14
+
15
+ Args:
16
+ duration (int): Duración de la grabación en segundos.
17
+ sample_rate (int): Frecuencia de muestreo del audio.
18
+ """
19
+ print("Grabando...")
20
+ audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
21
+ sd.wait() # Espera a que finalice la grabación
22
+ write(AUDIO_PATH, sample_rate, audio_data) # Guarda el audio en el directorio especificado
23
+ print(f"Grabación completa. Archivo guardado en {AUDIO_PATH}")
24
+
25
+ def transcribe_audio():
26
+ """
27
+ Usa el modelo Whisper para transcribir el audio grabado y devuelve el texto.
28
+
29
+ Returns:
30
+ str: Texto transcrito del audio.
31
+ """
32
+ # Cargar el modelo de Whisper
33
+ model = whisper.load_model("base")
34
+
35
+ # Transcribir el audio
36
+ print("Transcribiendo el audio...")
37
+ result = model.transcribe(AUDIO_PATH)
38
+ print("Transcripción completada.")
39
+ return result["text"]
40
+
41
+ if __name__ == "__main__":
42
+ # Paso 1: Grabar audio
43
+ record_audio()
44
+
45
+ # Paso 2: Transcribir audio
46
+ texto = transcribe_audio()
47
+ print("Texto extraído:", texto)
src/whisper_audio_transcriber.py ADDED
@@ -0,0 +1,109 @@
1
+ # whisper_audio_transcriber.py
2
+
3
+ import os
4
+ from pathlib import Path
5
+ import requests
6
+ import librosa
7
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
8
+ from transformers.utils import logging
9
+ import soundfile as sf
10
+
11
+ # Definición de modelos
12
+ model_ids = {
13
+ "Multilingual models": [
14
+ "openai/whisper-large-v3-turbo",
15
+ "openai/whisper-large-v3",
16
+ "openai/whisper-large-v2",
17
+ "openai/whisper-large",
18
+ "openai/whisper-medium",
19
+ "openai/whisper-small",
20
+ "openai/whisper-base",
21
+ "openai/whisper-tiny",
22
+ ],
23
+ "English-only models": [
24
+ "distil-whisper/distil-large-v2",
25
+ "distil-whisper/distil-large-v3",
26
+ "distil-whisper/distil-medium.en",
27
+ "distil-whisper/distil-small.en",
28
+ "openai/whisper-medium.en",
29
+ "openai/whisper-small.en",
30
+ "openai/whisper-base.en",
31
+ "openai/whisper-tiny.en",
32
+ ],
33
+ }
34
+
35
+ def download_file(url, filename, directory="."):
36
+ """
37
+ Descarga un archivo desde una URL y lo guarda en el directorio especificado.
38
+ """
39
+ os.makedirs(directory, exist_ok=True)
40
+ filepath = Path(directory) / filename
41
+ response = requests.get(url)
42
+ filepath.write_bytes(response.content)
43
+ return filepath
44
+
45
+ def transcribe_audio(file_path, model_name):
46
+ """
47
+ Transcribe el audio utilizando un modelo de Whisper.
48
+
49
+ Args:
50
+ file_path (str): Ruta del archivo de audio.
51
+ model_name (str): Nombre del modelo de Whisper.
52
+
53
+ Returns:
54
+ str: Transcripción del audio.
55
+ """
56
+ processor = AutoProcessor.from_pretrained(model_name)
57
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
58
+
59
+ # Crear pipeline para transcripción
60
+ pipe = pipeline(
61
+ "automatic-speech-recognition",
62
+ model=model,
63
+ tokenizer=processor.tokenizer,
64
+ feature_extractor=processor.feature_extractor,
65
+ device="cpu", # Cambiar a "cuda" si tienes una GPU disponible
66
+ )
67
+
68
+ # Cargar el archivo de audio
69
+ audio_data, samplerate = librosa.load(file_path, sr=16000)
70
+
71
+ # Transcribir el audio
72
+ result = pipe(audio_data)
73
+ return result["text"]
74
+
75
+ def guardar_transcripcion(texto, filename="transcripcion.txt", directory="../results"):
76
+ """
77
+ Guarda el texto transcrito en un archivo .txt en el directorio especificado.
78
+
79
+ Args:
80
+ texto (str): Texto transcrito que se desea guardar.
81
+ filename (str): Nombre del archivo .txt.
82
+ directory (str): Directorio donde se guardará el archivo.
83
+ """
84
+ os.makedirs(directory, exist_ok=True) # Crea el directorio si no existe
85
+ file_path = Path(directory) / filename
86
+ with open(file_path, "w", encoding="utf-8") as f:
87
+ f.write(texto)
88
+ print(f"Transcripción guardada en: {file_path}")
89
+
90
+ def main():
91
+ # Configuración de logging para errores únicamente
92
+ logging.set_verbosity_error()
93
+
94
+ # Ruta del archivo de audio
95
+ audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
96
+
97
+ # Modelo seleccionado
98
+ model_name = "openai/whisper-large" # Cambia esto al modelo deseado
99
+
100
+ # Transcribir el audio
101
+ print(f"Transcribiendo el audio del archivo: {audio_path}")
102
+ transcription = transcribe_audio(audio_path, model_name)
103
+ print(f"Transcripción: {transcription}")
104
+
105
+ # Guardar la transcripción en un archivo .txt
106
+ guardar_transcripcion(transcription)
107
+
108
+ if __name__ == "__main__":
109
+ main()
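openai/whisper-large is heavy for CPU-only inference; any entry from model_ids can be substituted, for example (an illustrative change, not part of this commit):

    model_name = "openai/whisper-small"   # much faster on CPU, still multilingual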
tests/test_whisper_audio_extractor.py ADDED
@@ -0,0 +1,29 @@
1
+ import os
2
+ import pytest
3
+ from src.whisper_audio_extractor import record_audio, transcribe_audio, AUDIO_PATH
4
+
5
+ def test_record_audio():
6
+ """
7
+ Verifica que la función de grabación crea un archivo de audio con un tamaño válido.
8
+ """
9
+ # Ejecuta la grabación con una duración de prueba corta
10
+ record_audio(duration=2) # Graba por 2 segundos para el test
11
+
12
+ # Comprueba si el archivo de audio existe
13
+ assert os.path.exists(AUDIO_PATH), "El archivo de audio no fue creado."
14
+
15
+ # Comprueba que el archivo no esté vacío
16
+ assert os.path.getsize(AUDIO_PATH) > 0, "El archivo de audio está vacío."
17
+
18
+ def test_transcribe_audio():
19
+ """
20
+ Verifica que la función de transcripción devuelve texto.
21
+ """
22
+ # Ejecuta la transcripción del audio grabado
23
+ transcription = transcribe_audio()
24
+
25
+ # Asegura que se obtuvo texto
26
+ assert isinstance(transcription, str) and len(transcription) > 0, "La transcripción está vacía o no es texto."
27
+
28
+ if __name__ == "__main__":
29
+ pytest.main()
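These tests import src.whisper_audio_extractor, so they are meant to run from the project root; if the import fails, an empty src/__init__.py (not part of this commit, an assumption) or a pytest rootdir configuration may be needed. They can also be driven programmatically:

    import pytest
    pytest.main(["-v", "tests/test_whisper_audio_extractor.py"])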