Commit · d4757ae0
Parent(s):
First commit; I think the heavy model files are still missing.
- .gitignore +24 -0
- estructura_proyecto.txt +34 -0
- requirements.txt +31 -0
- results/OpenAI_response.txt +5 -0
- results/transcripcion.txt +1 -0
- setup.py +287 -0
- src/.gradio/certificate.pem +31 -0
- src/audio_recorder.py +48 -0
- src/call_openai_api.py +80 -0
- src/convert_models.py +16 -0
- src/gradio_helper.py +26 -0
- src/interface.py +60 -0
- src/interfaceV2.py +183 -0
- src/ov_inference.py +637 -0
- src/ov_wav2lip_helper.py +68 -0
- src/run_inference.py +67 -0
- src/text_to_speech.py +36 -0
- src/utils/notebook_utils.py +708 -0
- src/utils/pip_helper.py +10 -0
- src/whisper_audio_extractor.py +47 -0
- src/whisper_audio_transcriber.py +109 -0
- tests/test_whisper_audio_extractor.py +29 -0
.gitignore
ADDED
@@ -0,0 +1,24 @@
# Ignore the virtual environment
env/

# Ignore build files and folders
__pycache__/
*.pyc
*.pyo
*.py[cod]
.vscode/
.DS_Store

# Ignore log files and test output
*.log
*.out
*.tmp

# Ignore models and checkpoints
models/
checkpoints/
src/Wav2Lip/
assets/
data/
# Temporary and system files
estructura_proyecto.txt
ADDED
@@ -0,0 +1,34 @@
proyecto_root/
│
├── assets/                     # Test data: sample audio and video files, e.g. `data_audio_sun_5s.wav`
│   ├── video/
│   │   └── data_video_sun_5s.mp4
│   └── audio/
│       └── data_audio_sun_5s.wav
│
├── checkpoints/
│   └── # Pretrained models and checkpoints, e.g. `wav2lip_gan.pth`
│
├── models/
│   └── # Models converted to OpenVINO IR, e.g. `face_detection.xml` and `wav2lip.xml`
│
├── src/
│   ├── utils/
│   ├── Wav2Lip/
│   ├── convert_models.py
│   ├── gradio_helper.py
│   ├── ov_inference.py
│   ├── ov_wav2lip_helper.py
│   └── run_inference.py
│
├── tests/
│   └── # Test scripts that verify the project's functionality
│
├── results/
│   └── result_voice.mp4
│
├── requirements.txt            # Project dependency list
├── setup.py                    # Project setup script
├── estructura_proyecto.txt     # This project structure overview
└── README.md                   # Project documentation
requirements.txt
ADDED
@@ -0,0 +1,31 @@
openvino>=2024.4.0
huggingface_hub
torch>=2.1
gradio>=4.19
librosa==0.9.2
opencv-contrib-python
opencv-python
IPython
tqdm
numba
numpy

openai-whisper
sounddevice
scipy

transformers>=4.35
torchvision>=0.18.1
onnx>=1.16.1
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
openvino
openvino-tokenizers
openvino-genai
datasets
soundfile>=0.12
python-ffmpeg<=1.0.16
nncf>=2.13.0
jiwer

gtts
results/OpenAI_response.txt
ADDED
@@ -0,0 +1,5 @@
Hola, prueba en marcha,
María con IA se realza,
Nuevo modelo se lanza,
Incorporación, esperanza,
Ser mejor, nuestra balanza.
results/transcripcion.txt
ADDED
@@ -0,0 +1 @@
Hola, esta es una prueba para ver si podemos incorporar este modelo a María, María RB.
setup.py
ADDED
@@ -0,0 +1,287 @@
# 2024/03/11 setup.py

import os
import subprocess
import sys
import requests

from pathlib import Path

# Project folder layout
PROJECT_DIRECTORIES = [
    "assets",
    "assets/audio",
    "assets/video",
    "checkpoints",
    "models",
    "src",
    "src/utils",
    "tests",
    "results"
]

# URLs of the OpenVINO Notebooks utilities
OPENVINO_UTILS = {
    "notebook_utils.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
    "pip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py"
}

# URLs of the Wav2Lip helper files
WAV2LIP_HELPERS = {
    "gradio_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/gradio_helper.py",
    "ov_inference.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_inference.py",
    "ov_wav2lip_helper.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/ov_wav2lip_helper.py"
}

WAV2LIP_HELPERS_DIR = Path("src")
OPENVINO_UTILS_DIR = Path("src/utils")

# URLs of the example input files
EXAMPLE_FILES = {
    "audio_example": {
        "filename": "data_audio_sun_5s.wav",
        "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_audio_sun_5s.wav?raw=true",
        "folder": "assets/audio"
    },
    "video_example": {
        "filename": "data_video_sun_5s.mp4",
        "url": "https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_video_sun_5s.mp4?raw=true",
        "folder": "assets/video"
    }
}

# Create the overall project structure
def create_project_structure():
    """
    Creates the project folder structure.
    """
    for directory in PROJECT_DIRECTORIES:
        path = Path(directory)
        if not path.exists():
            path.mkdir(parents=True, exist_ok=True)
            print(f"Carpeta '{directory}' creada.")
        else:
            print(f"Carpeta '{directory}' ya existe.")

# Create the virtual environment
def create_virtual_environment():
    """
    Creates the virtual environment if it does not exist.
    """
    env_path = Path("env")
    if not env_path.exists():
        print("Creando el entorno virtual...")
        subprocess.check_call([sys.executable, "-m", "venv", "env"])
        print(f"Entorno virtual creado en '{env_path}'.")
    else:
        print(f"El entorno virtual '{env_path}' ya existe.")

# Resolve the pip and python executables of the virtual environment
def activate_virtual_environment():
    """
    Activates the virtual environment and returns the pip and python paths.
    """
    if os.name == 'nt':  # Windows
        python_path = str(Path("env") / "Scripts" / "python.exe")
        pip_path = str(Path("env") / "Scripts" / "pip.exe")
    else:  # Unix/macOS
        python_path = str(Path("env") / "bin" / "python")
        pip_path = str(Path("env") / "bin" / "pip")

    # Upgrade pip to the latest version inside the virtual environment using python -m pip
    try:
        subprocess.check_call([python_path, "-m", "pip", "install", "--upgrade", "pip"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print("pip actualizado a la última versión.")
    except subprocess.CalledProcessError:
        print("Error al actualizar pip.")
    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    return python_path, pip_path

# Install the dependencies from requirements.txt with a progress bar
def install_requirements(pip_path):
    """
    Installs the requirements.txt dependencies with a progress bar.
    """
    print("Instalando dependencias...")
    # Install tqdm in the virtual environment if it is not installed
    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    from tqdm import tqdm  # Import tqdm for the progress bar

    # Read requirements.txt and show a progress bar
    requirements_path = Path("requirements.txt")
    if not requirements_path.exists():
        print("Archivo requirements.txt no encontrado.")
        return

    with open(requirements_path, "r") as f:
        dependencies = f.read().splitlines()

    # Install each dependency with a progress bar
    for dependency in tqdm(dependencies, desc="Instalando dependencias", unit="paquete"):
        try:
            subprocess.check_call([pip_path, "install", dependency], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError:
            print(f"\nError al instalar {dependency}.")

    print("Todas las dependencias fueron instaladas correctamente.")

# Download the OpenVINO Notebooks utility files
def download_openvino_utils(pip_path):
    """
    Downloads the OpenVINO Notebooks utility files into src/utils if they do not exist.
    """
    # Create the utils folder if it does not exist
    OPENVINO_UTILS_DIR.mkdir(parents=True, exist_ok=True)

    # Install requests in the virtual environment if it is not installed
    try:
        subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar requests.")

    # Install tqdm in the virtual environment if it is not installed
    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    from tqdm import tqdm  # Import tqdm for the progress bar

    for filename, url in tqdm(OPENVINO_UTILS.items(), desc="Descargando utilidades de OpenVINO", unit="archivo"):
        file_path = OPENVINO_UTILS_DIR / filename
        if not file_path.exists():
            response = requests.get(url)
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(response.content)
            else:
                print(f"Error al descargar {filename} desde {url}")

# Download the Wav2Lip-specific helper files
def download_wav2lip_helpers(pip_path):
    """
    Downloads the Wav2Lip-specific helper files if they do not exist.
    """
    WAV2LIP_HELPERS_DIR.mkdir(parents=True, exist_ok=True)  # Create `src` if it does not exist

    # Install requests in the virtual environment if it is not installed
    try:
        subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar requests.")

    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    from tqdm import tqdm  # Import tqdm for the progress bar
    for filename, url in tqdm(WAV2LIP_HELPERS.items(), desc="Descargando ayudas de Wav2Lip", unit="archivo"):
        file_path = WAV2LIP_HELPERS_DIR / filename
        if not file_path.exists():
            response = requests.get(url)
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(response.content)

# Download the example input files (audio and video)
def download_example_files():
    """
    Downloads the example input files (audio and video) into their corresponding folders.
    Note: relies on the module-level `pip_path` set in the __main__ block below.
    """
    # Install requests in the virtual environment if it is not installed
    try:
        subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar requests.")

    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    from tqdm import tqdm  # Import tqdm for the progress bar

    for example_name, example_info in tqdm(EXAMPLE_FILES.items(), desc="Descargando archivos de ejemplo", unit="archivo"):
        folder_path = Path(example_info["folder"])
        file_path = folder_path / example_info["filename"]

        # Create the folder if it does not exist
        folder_path.mkdir(parents=True, exist_ok=True)

        # Download the file if it does not exist
        if not file_path.exists():
            response = requests.get(example_info["url"])
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(response.content)

def clone_wav2lip_repo():
    """
    Clones the official Wav2Lip repository, hiding the progress behind tqdm.
    Note: relies on the module-level `pip_path` set in the __main__ block below.
    """
    repo_url = "https://github.com/Rudrabha/Wav2Lip"
    clone_path = "src/Wav2Lip"

    try:
        subprocess.check_call([pip_path, "install", "requests"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar requests.")

    try:
        subprocess.check_call([pip_path, "install", "tqdm"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        print("Error al instalar tqdm.")

    from tqdm import tqdm  # Import tqdm for the progress bar

    # Check whether the repository already exists to avoid cloning it again
    if os.path.exists(clone_path):
        print(f"El repositorio '{clone_path}' ya existe.")
        return

    # Start the clone, using tqdm to hide the raw git progress output
    print("Clonando el repositorio de Wav2Lip...")
    with tqdm(total=100, desc="Clonación en progreso", ncols=100, bar_format="{l_bar}{bar}") as pbar:
        # Run the clone command
        exit_code = subprocess.call(["git", "clone", repo_url, clone_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if exit_code != 0:
            raise Exception("Error: La clonación del repositorio ha fallado.")
        else:
            pbar.update(100)
            print("Repositorio clonado exitosamente en 'Wav2Lip'.")


if __name__ == "__main__":
    create_project_structure()
    create_virtual_environment()
    python_path, pip_path = activate_virtual_environment()

    download_openvino_utils(pip_path)
    download_wav2lip_helpers(pip_path)
    download_example_files()
    install_requirements(pip_path)
    clone_wav2lip_repo()
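A minimal usage sketch for this bootstrap script, assuming it is run from the project root. The follow-up call to src/convert_models.py through the venv interpreter is an assumption based on the other files in this commit; setup.py does not perform the model conversion itself.

import os
import subprocess
import sys
from pathlib import Path

# Run the bootstrap script from the project root: it creates the folder layout,
# the virtual environment, and downloads the helper scripts and sample media.
subprocess.check_call([sys.executable, "setup.py"])

# Assumed follow-up step (not performed by setup.py itself): convert the Wav2Lip
# models to OpenVINO IR using the interpreter inside the new virtual environment.
venv_python = Path("env") / ("Scripts" if os.name == "nt" else "bin") / ("python.exe" if os.name == "nt" else "python")
subprocess.check_call([str(venv_python), "src/convert_models.py"])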
src/.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
src/audio_recorder.py
ADDED
@@ -0,0 +1,48 @@
# audio_recorder.py

import sounddevice as sd
from scipy.io.wavfile import write
import os

# Path where the audio file is saved, inside the `assets/audio/` directory
AUDIO_PATH = os.path.join("..", "assets", "audio", "grabacion_8s.wav")

def listar_dispositivos():
    """
    Lists all audio devices available on the system.
    """
    print("Dispositivos de audio disponibles:")
    dispositivos = sd.query_devices()
    for idx, dispositivo in enumerate(dispositivos):
        print(f"{idx}: {dispositivo['name']} - {'Entrada' if dispositivo['max_input_channels'] > 0 else 'Salida'}")
    print("\nSelecciona el índice del dispositivo de entrada que prefieras para grabar audio.")

def record_audio(duration=8, sample_rate=44100, device_index=None):
    """
    Records audio from the microphone for a given duration and saves it as a WAV file.

    Args:
        duration (int): Recording duration in seconds.
        sample_rate (int): Audio sample rate.
        device_index (int): Index of the audio device to use.
    """
    print("Grabando...")

    # Start a single-channel recording
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, device=device_index)
    sd.wait()  # Wait for the recording to finish

    # Save the audio file
    write(AUDIO_PATH, sample_rate, audio_data)
    print(f"Grabación completada. Archivo guardado en: {AUDIO_PATH}")

if __name__ == "__main__":
    # Step 1: list the audio devices
    listar_dispositivos()

    # Wait for the user to choose the input device index
    device_index = int(input("Introduce el índice del dispositivo de entrada que deseas utilizar: "))

    # Step 2: record audio with the selected device
    record_audio(device_index=device_index)
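A small non-interactive sketch of driving record_audio without the stdin prompt, assuming the system default input device is acceptable; the default-device lookup is an assumption on top of audio_recorder.py, not part of it.

import sounddevice as sd
from audio_recorder import record_audio

# Assumption: fall back to the system default input device instead of prompting;
# sd.default.device is an (input, output) pair of device indices.
default_input_index = sd.default.device[0]
record_audio(duration=8, sample_rate=44100, device_index=default_input_index)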
src/call_openai_api.py
ADDED
@@ -0,0 +1,80 @@
import os

from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pathlib import Path

# Load environment variables from the .env file
# Relative path to the .env file in models/
project_root = Path(__file__).resolve().parent.parent  # Go up to the project root
env_path = project_root / "models" / ".env"  # Full path to the .env file
load_dotenv(dotenv_path=env_path)

# API key configuration
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("No se encontro la clave de API")

OPENAI_KEY_VAL = api_key

llm = ChatOpenAI(
    openai_api_key=OPENAI_KEY_VAL,
    temperature=0.7,
    model="gpt-4"
)

# Prompt template filled with the text read from the file
template = """
Eres un asistente de IA que orienta a los alumnos a ser mejores personas. Haz una haiku de 5 lineas sobre lo que te estan comentando. Da siempre la respuesta en Español
Texto:{texto}
Respuesta:
"""
prompt = PromptTemplate(
    input_variables=["texto"],
    template=template
)

chain = LLMChain(
    llm=llm,
    prompt=prompt
)

#def save_summary_to_file(summary_text, filename='response.txt'):
def save_summary_to_file(summary_text, filename='C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt'):
    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(summary_text)
        print(f"El resumen se ha guardado exitosamente en {filename}")
    except Exception as e:
        print(f"Ocurrio un error al guardar el resumen {e}")

def read_text_from_file(filename):
    try:
        with open(filename, 'r') as file:
            return file.read()
    except Exception as e:
        print(f"Error al leer el archivo {filename}: {e}")
        return ""


#def main():
def moni(archivo):
    #texto_usuario = input("Ingresa un texto para resumir:")
    #texto_usuario = read_text_from_file("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
    texto_usuario = read_text_from_file(archivo)
    resultado = chain.run(texto=texto_usuario)

    # Show the generated response
    print("\nResumen generado:")
    print(resultado)
    save_summary_to_file(resultado)

    return resultado
#


if __name__ == "__main__":
    # moni() needs a file to read; default to the transcription produced by the pipeline.
    moni("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
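A minimal standalone sketch of using this module, assuming models/.env already defines an OPENAI_API_KEY entry and that the transcription file exists at the hard-coded path used elsewhere in this commit.

from call_openai_api import moni

# Reuse the transcription produced by whisper_audio_transcriber.py / interfaceV2.py.
respuesta = moni("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
print(respuesta)  # The response is also written to results/OpenAI_response.txt by save_summary_to_file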
src/convert_models.py
ADDED
@@ -0,0 +1,16 @@
import sys
from pathlib import Path

# Add `src` to `sys.path` so Python can find the `utils` module
sys.path.append(str(Path(__file__).resolve().parent))

# Import the helper function from utils/notebook_utils.py
from utils.notebook_utils import download_file
from ov_wav2lip_helper import download_and_convert_models


OV_FACE_DETECTION_MODEL_PATH = Path("../miwav2lipv6/models/face_detection.xml")
OV_WAV2LIP_MODEL_PATH = Path("../miwav2lipv6/models/wav2lip.xml")


download_and_convert_models(OV_FACE_DETECTION_MODEL_PATH, OV_WAV2LIP_MODEL_PATH)
src/gradio_helper.py
ADDED
@@ -0,0 +1,26 @@
from typing import Callable
import gradio as gr
import numpy as np


examples = [
    [
        #"data_video_sun_5s.mp4",
        "data_video_sun.mp4",
        "data_audio_sun_5s.wav",
    ],
]


def make_demo(fn: Callable):
    demo = gr.Interface(
        fn=fn,
        inputs=[
            gr.Video(label="Face video"),
            gr.Audio(label="Audio", type="filepath"),
        ],
        outputs="video",
        examples=examples,
        allow_flagging="never",
    )
    return demo
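A minimal wiring sketch for make_demo with a hypothetical pass-through callable standing in for the OpenVINO Wav2Lip inference entry point; the real callable is provided elsewhere in this commit (ov_inference.py / run_inference.py).

from gradio_helper import make_demo

def passthrough_inference(face_video_path, audio_path):
    # Hypothetical stand-in: return the input video unchanged instead of lip-syncing it.
    return face_video_path

# Note: the `examples` entries above must exist relative to the working directory,
# otherwise Gradio will warn when building the interface.
demo = make_demo(fn=passthrough_inference)
demo.launch()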
src/interface.py
ADDED
@@ -0,0 +1,60 @@
# interface.py

import gradio as gr
import sounddevice as sd
from scipy.io.wavfile import write
import tempfile
import shutil
import os

# Absolute video and audio paths to avoid access errors
AUDIO_COPY_PATH = os.path.abspath(os.path.join("..", "miwav2lipv6", "assets", "audio", "grabacion_gradio.wav"))
#VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
VIDEO_PATH = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")

# Check that the video exists
if not os.path.exists(VIDEO_PATH):
    print(f"Advertencia: El archivo de video no se encontró en la ruta {VIDEO_PATH}")

# Audio recording function
def grabar_audio(duration=8, sample_rate=44100):
    print("Grabando...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # Wait for the recording to finish

    # Save a temporary audio file
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    write(temp_audio.name, sample_rate, audio_data)
    print("Grabación completada. Archivo temporal guardado en:", temp_audio.name)

    # Create `assets/audio` if it does not exist
    os.makedirs(os.path.dirname(AUDIO_COPY_PATH), exist_ok=True)

    # Copy the recording to `assets/audio`
    shutil.copy(temp_audio.name, AUDIO_COPY_PATH)
    print(f"Copia de la grabación guardada en: {AUDIO_COPY_PATH}")

    return AUDIO_COPY_PATH

# Main function for the Gradio interface
def interfaz():
    with gr.Blocks() as demo:
        gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)

        # Create a recording button
        with gr.Row():
            grabar_button = gr.Button("Iniciar Grabación")

            # Show the recorded audio to the right
            output_audio = gr.Audio(label="Grabación de Audio", type="filepath")

        # Wire the recording function to the button
        grabar_button.click(grabar_audio, outputs=output_audio)

    return demo

# Run the interface with the absolute path in allowed_paths
if __name__ == "__main__":
    demo = interfaz()
    demo.launch(allowed_paths=[os.path.dirname(AUDIO_COPY_PATH)])
src/interfaceV2.py
ADDED
@@ -0,0 +1,183 @@
# interfaceV2.py

import gradio as gr
import sounddevice as sd
from scipy.io.wavfile import write
import tempfile
import shutil
import os
import subprocess
import sys
from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
from call_openai_api import moni as rtff  # Make sure call_openai_api.py is in the same directory


# Paths to files (adjusted as per the specified project structure)
AUDIO_RECORD_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/grabacion_gradio.wav")
#VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun_5s.mp4")
VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/video/data_video_sun.mp4")
TRANSCRIPTION_TEXT_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt")
RESULT_AUDIO_TEMP_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/audiov2.wav")
RESULT_AUDIO_FINAL_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav")
RESULT_VIDEO_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/results/result_voice.mp4")
TEXT_TO_SPEECH_PATH = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/text_to_speech.py")

# Function to record 8-second audio
def grabar_audio(duration=8, sample_rate=44100):
    print("Starting recording...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    print(f"Recording in progress for {duration} seconds...")
    sd.wait()
    print("Recording completed.")

    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    write(temp_audio.name, sample_rate, audio_data)
    print("Audio temporarily saved at:", temp_audio.name)
    temp_audio.close()  # Close the temporary file before copying it
    os.makedirs(os.path.dirname(AUDIO_RECORD_PATH), exist_ok=True)
    shutil.copy(temp_audio.name, AUDIO_RECORD_PATH)
    print(f"Recording copied to: {AUDIO_RECORD_PATH}")

    return AUDIO_RECORD_PATH, "Recording completed."

# Function to transcribe audio with Whisper
def transcribir_con_progreso(audio_path):
    progreso = gr.Progress()
    progreso(0, "Starting transcription...")
    model_name = "openai/whisper-large"
    progreso(25, "Loading Whisper model...")

    transcripcion = transcribe_audio(audio_path, model_name)
    progreso(75, "Saving transcription...")
    guardar_transcripcion(transcripcion, filename=TRANSCRIPTION_TEXT_PATH)
    progreso(100, "Transcription completed.")
    if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
        raise FileNotFoundError(f"El archivo {TRANSCRIPTION_TEXT_PATH} no se generó.")

    return transcripcion

# Function to convert text to audio using text_to_speech.py
def generar_audio_desde_texto():
    print("Generating audio from text...")
    result = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {result.stderr}")
    if result.stdout:
        print("Output:", result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)

    if os.path.exists(RESULT_AUDIO_TEMP_PATH):
        print(f"Temporary audio generated at: {RESULT_AUDIO_TEMP_PATH}")

        os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
        shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
        print(f"Final audio copied to: {RESULT_AUDIO_FINAL_PATH}")

        return RESULT_AUDIO_FINAL_PATH
    else:
        print(f"Error: Audio file was not generated in {RESULT_AUDIO_FINAL_PATH}")
        return None

# Function to process video and audio using run_inference.py with the generated audio file
def procesar_video_audio():
    print("Starting video and audio processing...")
    run_inference_path = os.path.abspath("C:/programacionEjercicios/miwav2lipv6/src/run_inference.py")

    result = subprocess.run(
        [sys.executable, run_inference_path, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True
    )

    if result.stdout:
        print("Output:", result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)

    if os.path.exists(RESULT_VIDEO_PATH):
        print(f"Processed video saved at: {RESULT_VIDEO_PATH}")
        return RESULT_VIDEO_PATH
    else:
        print("Error: Video file was not generated at 'results/result_voice.mp4'")
        return None

# Gradio interface configuration
def interfaz():
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                grabar_button = gr.Button("Comenzando la grabacion de audio")
                estado_grabacion = gr.Textbox(label="Recording Status", interactive=False)

            with gr.Column():
                output_audio = gr.Audio(AUDIO_RECORD_PATH, label="Audio Grabado", interactive=False)
                output_audio_speech = gr.Audio(RESULT_AUDIO_FINAL_PATH, label="Audio TTS", interactive=False)
                video_resultado = gr.Video(RESULT_VIDEO_PATH, label="Video procesado", interactive=False)
                texto_transcripcion = gr.Textbox(label="Texto transcrito")
                progreso_transcripcion = gr.Textbox(label="Transcription Status", interactive=False)

        # Full flow: recording, transcription, text-to-speech, and video processing
        """
        def flujo_completo():
            _, mensaje_grabacion = grabar_audio()
            transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
            audio_generado = generar_audio_desde_texto()
            video_path = procesar_video_audio()

            # Ensure function always returns 5 outputs for Gradio, even in error cases
            if video_path and audio_generado:
                return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado, video_path
            else:
                return mensaje_grabacion, AUDIO_RECORD_PATH, transcripcion, audio_generado or "Audio generation failed", video_path or "Video generation failed"
        """
        def flujo_completo():
            try:
                print("Inicio del flujo completo...")
                # Record audio
                audio_path, mensaje_grabacion = grabar_audio()
                print("Audio grabado en:", audio_path)
                # Transcribe the audio
                transcripcion = transcribir_con_progreso(audio_path)
                print("Transcripción completada:", transcripcion)

                #respuesta_openai = rtff(transcripcion)
                respuesta_openai = rtff(TRANSCRIPTION_TEXT_PATH)
                print("Respuesta generada por OpenAI")

                # Generate audio from the text
                audio_generado = generar_audio_desde_texto()
                print("Audio generado:", audio_generado)
                # Process video and audio
                video_path = procesar_video_audio()
                print("Video procesado en:", video_path)
                # Return the results if everything succeeded
                return mensaje_grabacion, audio_path, transcripcion, audio_generado, video_path

            except Exception as e:
                # Print the error in the terminal and return error messages to the interface
                print("Error detectado en flujo completo:", str(e))
                return (
                    "Error durante el flujo completo",
                    None,  # Recorded audio
                    f"Error: {str(e)}",  # Transcription
                    None,  # Generated audio
                    None  # Processed video
                )

        grabar_button.click(
            flujo_completo,
            outputs=[estado_grabacion, output_audio, texto_transcripcion, output_audio_speech, video_resultado]
        )

    return demo

if __name__ == "__main__":
    demo = interfaz()
    demo.launch(allowed_paths=["C:/programacionEjercicios/miwav2lipv6/assets", "C:/programacionEjercicios/miwav2lipv6/results"])
src/ov_inference.py
ADDED
@@ -0,0 +1,637 @@
1 |
+
from glob import glob
|
2 |
+
from enum import Enum
|
3 |
+
import math
|
4 |
+
import subprocess
|
5 |
+
|
6 |
+
import cv2
|
7 |
+
import numpy as np
|
8 |
+
from tqdm import tqdm
|
9 |
+
import torch
|
10 |
+
import torch.nn.functional as F
|
11 |
+
|
12 |
+
from Wav2Lip import audio
|
13 |
+
import openvino as ov
|
14 |
+
|
15 |
+
|
16 |
+
device = "cpu"
|
17 |
+
|
18 |
+
|
19 |
+
def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
|
20 |
+
xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
|
21 |
+
dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
|
22 |
+
dw, dh = math.log(ww / aww), math.log(hh / ahh)
|
23 |
+
return dx, dy, dw, dh
|
24 |
+
|
25 |
+
|
26 |
+
def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
|
27 |
+
xc, yc = dx * aww + axc, dy * ahh + ayc
|
28 |
+
ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
|
29 |
+
x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
|
30 |
+
return x1, y1, x2, y2
|
31 |
+
|
32 |
+
|
33 |
+
def nms(dets, thresh):
|
34 |
+
if 0 == len(dets):
|
35 |
+
return []
|
36 |
+
x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
|
37 |
+
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
|
38 |
+
order = scores.argsort()[::-1]
|
39 |
+
|
40 |
+
keep = []
|
41 |
+
while order.size > 0:
|
42 |
+
i = order[0]
|
43 |
+
keep.append(i)
|
44 |
+
xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
|
45 |
+
xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
|
46 |
+
|
47 |
+
w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
|
48 |
+
ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
|
49 |
+
|
50 |
+
inds = np.where(ovr <= thresh)[0]
|
51 |
+
order = order[inds + 1]
|
52 |
+
|
53 |
+
return keep
|
54 |
+
|
55 |
+
|
56 |
+
def encode(matched, priors, variances):
|
57 |
+
"""Encode the variances from the priorbox layers into the ground truth boxes
|
58 |
+
we have matched (based on jaccard overlap) with the prior boxes.
|
59 |
+
Args:
|
60 |
+
matched: (tensor) Coords of ground truth for each prior in point-form
|
61 |
+
Shape: [num_priors, 4].
|
62 |
+
priors: (tensor) Prior boxes in center-offset form
|
63 |
+
Shape: [num_priors,4].
|
64 |
+
variances: (list[float]) Variances of priorboxes
|
65 |
+
Return:
|
66 |
+
encoded boxes (tensor), Shape: [num_priors, 4]
|
67 |
+
"""
|
68 |
+
|
69 |
+
# dist b/t match center and prior's center
|
70 |
+
g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
|
71 |
+
# encode variance
|
72 |
+
g_cxcy /= variances[0] * priors[:, 2:]
|
73 |
+
# match wh / prior wh
|
74 |
+
g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
|
75 |
+
g_wh = torch.log(g_wh) / variances[1]
|
76 |
+
# return target for smooth_l1_loss
|
77 |
+
return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
|
78 |
+
|
79 |
+
|
80 |
+
def decode(loc, priors, variances):
|
81 |
+
"""Decode locations from predictions using priors to undo
|
82 |
+
the encoding we did for offset regression at train time.
|
83 |
+
Args:
|
84 |
+
loc (tensor): location predictions for loc layers,
|
85 |
+
Shape: [num_priors,4]
|
86 |
+
priors (tensor): Prior boxes in center-offset form.
|
87 |
+
Shape: [num_priors,4].
|
88 |
+
variances: (list[float]) Variances of priorboxes
|
89 |
+
Return:
|
90 |
+
decoded bounding box predictions
|
91 |
+
"""
|
92 |
+
|
93 |
+
boxes = torch.cat((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
|
94 |
+
boxes[:, :2] -= boxes[:, 2:] / 2
|
95 |
+
boxes[:, 2:] += boxes[:, :2]
|
96 |
+
return boxes
|
97 |
+
|
98 |
+
|
99 |
+
def batch_decode(loc, priors, variances):
|
100 |
+
"""Decode locations from predictions using priors to undo
|
101 |
+
the encoding we did for offset regression at train time.
|
102 |
+
Args:
|
103 |
+
loc (tensor): location predictions for loc layers,
|
104 |
+
Shape: [num_priors,4]
|
105 |
+
priors (tensor): Prior boxes in center-offset form.
|
106 |
+
Shape: [num_priors,4].
|
107 |
+
variances: (list[float]) Variances of priorboxes
|
108 |
+
Return:
|
109 |
+
decoded bounding box predictions
|
110 |
+
"""
|
111 |
+
|
112 |
+
boxes = torch.cat((priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:], priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
|
113 |
+
boxes[:, :, :2] -= boxes[:, :, 2:] / 2
|
114 |
+
boxes[:, :, 2:] += boxes[:, :, :2]
|
115 |
+
return boxes
|
116 |
+
|
117 |
+
|
118 |
+
def get_smoothened_boxes(boxes, T):
|
119 |
+
for i in range(len(boxes)):
|
120 |
+
if i + T > len(boxes):
|
121 |
+
window = boxes[len(boxes) - T :]
|
122 |
+
else:
|
123 |
+
window = boxes[i : i + T]
|
124 |
+
boxes[i] = np.mean(window, axis=0)
|
125 |
+
return boxes
|
126 |
+
|
127 |
+
|
128 |
+
def detect(net, img, device):
|
129 |
+
img = img - np.array([104, 117, 123])
|
130 |
+
img = img.transpose(2, 0, 1)
|
131 |
+
img = img.reshape((1,) + img.shape)
|
132 |
+
|
133 |
+
img = torch.from_numpy(img).float().to(device)
|
134 |
+
BB, CC, HH, WW = img.size()
|
135 |
+
|
136 |
+
results = net({"x": img})
|
137 |
+
olist = [torch.Tensor(results[i]) for i in range(12)]
|
138 |
+
|
139 |
+
bboxlist = []
|
140 |
+
for i in range(len(olist) // 2):
|
141 |
+
olist[i * 2] = F.softmax(olist[i * 2], dim=1)
|
142 |
+
olist = [oelem.data.cpu() for oelem in olist]
|
143 |
+
for i in range(len(olist) // 2):
|
144 |
+
ocls, oreg = olist[i * 2], olist[i * 2 + 1]
|
145 |
+
FB, FC, FH, FW = ocls.size() # feature map size
|
146 |
+
stride = 2 ** (i + 2) # 4,8,16,32,64,128
|
147 |
+
anchor = stride * 4
|
148 |
+
poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
|
149 |
+
for Iindex, hindex, windex in poss:
|
150 |
+
axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
|
151 |
+
score = ocls[0, 1, hindex, windex]
|
152 |
+
loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
|
153 |
+
priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
|
154 |
+
variances = [0.1, 0.2]
|
155 |
+
box = decode(loc, priors, variances)
|
156 |
+
x1, y1, x2, y2 = box[0] * 1.0
|
157 |
+
# cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
|
158 |
+
bboxlist.append([x1, y1, x2, y2, score])
|
159 |
+
bboxlist = np.array(bboxlist)
|
160 |
+
if 0 == len(bboxlist):
|
161 |
+
bboxlist = np.zeros((1, 5))
|
162 |
+
|
163 |
+
return bboxlist
|
164 |
+
|
165 |
+
|
166 |
+
def batch_detect(net, imgs, device):
|
167 |
+
imgs = imgs - np.array([104, 117, 123])
|
168 |
+
imgs = imgs.transpose(0, 3, 1, 2)
|
169 |
+
|
170 |
+
imgs = torch.from_numpy(imgs).float().to(device)
|
171 |
+
BB, CC, HH, WW = imgs.size()
|
172 |
+
|
173 |
+
results = net({"x": imgs.numpy()})
|
174 |
+
olist = [torch.Tensor(results[i]) for i in range(12)]
|
175 |
+
|
176 |
+
bboxlist = []
|
177 |
+
for i in range(len(olist) // 2):
|
178 |
+
olist[i * 2] = F.softmax(olist[i * 2], dim=1)
|
179 |
+
# olist[i * 2] = (olist[i * 2], dim=1)
|
180 |
+
olist = [oelem.data.cpu() for oelem in olist]
|
181 |
+
for i in range(len(olist) // 2):
|
182 |
+
ocls, oreg = olist[i * 2], olist[i * 2 + 1]
|
183 |
+
FB, FC, FH, FW = ocls.size() # feature map size
|
184 |
+
stride = 2 ** (i + 2) # 4,8,16,32,64,128
|
185 |
+
anchor = stride * 4
|
186 |
+
poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
|
187 |
+
for Iindex, hindex, windex in poss:
|
188 |
+
axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
|
189 |
+
score = ocls[:, 1, hindex, windex]
|
190 |
+
loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
|
191 |
+
priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
|
192 |
+
variances = [0.1, 0.2]
|
193 |
+
box = batch_decode(loc, priors, variances)
|
194 |
+
box = box[:, 0] * 1.0
|
195 |
+
# cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
|
196 |
+
bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
|
197 |
+
bboxlist = np.array(bboxlist)
|
198 |
+
if 0 == len(bboxlist):
|
199 |
+
bboxlist = np.zeros((1, BB, 5))
|
200 |
+
|
201 |
+
return bboxlist
|
202 |
+
|
203 |
+
|
204 |
+
def flip_detect(net, img, device):
|
205 |
+
img = cv2.flip(img, 1)
|
206 |
+
b = detect(net, img, device)
|
207 |
+
|
208 |
+
bboxlist = np.zeros(b.shape)
|
209 |
+
bboxlist[:, 0] = img.shape[1] - b[:, 2]
|
210 |
+
bboxlist[:, 1] = b[:, 1]
|
211 |
+
bboxlist[:, 2] = img.shape[1] - b[:, 0]
|
212 |
+
bboxlist[:, 3] = b[:, 3]
|
213 |
+
bboxlist[:, 4] = b[:, 4]
|
214 |
+
return bboxlist
|
215 |
+
|
216 |
+
|
217 |
+
def pts_to_bb(pts):
|
218 |
+
min_x, min_y = np.min(pts, axis=0)
|
219 |
+
max_x, max_y = np.max(pts, axis=0)
|
220 |
+
return np.array([min_x, min_y, max_x, max_y])
|
221 |
+
|
222 |
+
|
223 |
+
class OVFaceDetector(object):
|
224 |
+
"""An abstract class representing a face detector.
|
225 |
+
|
226 |
+
Any other face detection implementation must subclass it. All subclasses
|
227 |
+
must implement ``detect_from_image``, that return a list of detected
|
228 |
+
bounding boxes. Optionally, for speed considerations detect from path is
|
229 |
+
recommended.
|
230 |
+
"""
|
231 |
+
|
232 |
+
def __init__(self, device, verbose):
|
233 |
+
self.device = device
|
234 |
+
self.verbose = verbose
|
235 |
+
|
236 |
+
def detect_from_image(self, tensor_or_path):
|
237 |
+
"""Detects faces in a given image.
|
238 |
+
|
239 |
+
This function detects the faces present in a provided BGR(usually)
|
240 |
+
image. The input can be either the image itself or the path to it.
|
241 |
+
|
242 |
+
Arguments:
|
243 |
+
tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
|
244 |
+
to an image or the image itself.
|
245 |
+
|
246 |
+
Example::
|
247 |
+
|
248 |
+
>>> path_to_image = 'data/image_01.jpg'
|
249 |
+
... detected_faces = detect_from_image(path_to_image)
|
250 |
+
[A list of bounding boxes (x1, y1, x2, y2)]
|
251 |
+
>>> image = cv2.imread(path_to_image)
|
252 |
+
... detected_faces = detect_from_image(image)
|
253 |
+
[A list of bounding boxes (x1, y1, x2, y2)]
|
254 |
+
|
255 |
+
"""
|
256 |
+
raise NotImplementedError
|
257 |
+
|
258 |
+
def detect_from_directory(self, path, extensions=[".jpg", ".png"], recursive=False, show_progress_bar=True):
|
259 |
+
"""Detects faces from all the images present in a given directory.
|
260 |
+
|
261 |
+
Arguments:
|
262 |
+
path {string} -- a string containing a path that points to the folder containing the images
|
263 |
+
|
264 |
+
Keyword Arguments:
|
265 |
+
extensions {list} -- list of string containing the extensions to be
|
266 |
+
consider in the following format: ``.extension_name`` (default:
|
267 |
+
{['.jpg', '.png']}) recursive {bool} -- option wherever to scan the
|
268 |
+
folder recursively (default: {False}) show_progress_bar {bool} --
|
269 |
+
display a progressbar (default: {True})
|
270 |
+
|
271 |
+
Example:
|
272 |
+
>>> directory = 'data'
|
273 |
+
... detected_faces = detect_from_directory(directory)
|
274 |
+
{A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}
|
275 |
+
|
276 |
+
"""
|
277 |
+
if self.verbose:
|
278 |
+
logger = logging.getLogger(__name__)
|
279 |
+
|
280 |
+
if len(extensions) == 0:
|
281 |
+
if self.verbose:
|
282 |
+
logger.error("Expected at list one extension, but none was received.")
|
283 |
+
raise ValueError
|
284 |
+
|
285 |
+
if self.verbose:
|
286 |
+
logger.info("Constructing the list of images.")
|
287 |
+
additional_pattern = "/**/*" if recursive else "/*"
|
288 |
+
files = []
|
289 |
+
for extension in extensions:
|
290 |
+
files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))
|
291 |
+
|
292 |
+
if self.verbose:
|
293 |
+
logger.info("Finished searching for images. %s images found", len(files))
|
294 |
+
logger.info("Preparing to run the detection.")
|
295 |
+
|
296 |
+
predictions = {}
|
297 |
+
for image_path in tqdm(files, disable=not show_progress_bar):
|
298 |
+
if self.verbose:
|
299 |
+
logger.info("Running the face detector on image: %s", image_path)
|
300 |
+
predictions[image_path] = self.detect_from_image(image_path)
|
301 |
+
|
302 |
+
if self.verbose:
|
303 |
+
logger.info("The detector was successfully run on all %s images", len(files))
|
304 |
+
|
305 |
+
return predictions
|
306 |
+
|
307 |
+
@property
|
308 |
+
def reference_scale(self):
|
309 |
+
raise NotImplementedError
|
310 |
+
|
311 |
+
@property
|
312 |
+
def reference_x_shift(self):
|
313 |
+
raise NotImplementedError
|
314 |
+
|
315 |
+
@property
|
316 |
+
def reference_y_shift(self):
|
317 |
+
raise NotImplementedError
|
318 |
+
|
319 |
+
@staticmethod
|
320 |
+
def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
|
321 |
+
"""Convert path (represented as a string) or torch.tensor to a numpy.ndarray
|
322 |
+
|
323 |
+
Arguments:
|
324 |
+
tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
|
325 |
+
"""
|
326 |
+
if isinstance(tensor_or_path, str):
|
327 |
+
return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
|
328 |
+
elif torch.is_tensor(tensor_or_path):
|
329 |
+
# Call cpu in case its coming from cuda
|
330 |
+
return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
|
331 |
+
elif isinstance(tensor_or_path, np.ndarray):
|
332 |
+
return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
|
333 |
+
else:
|
334 |
+
raise TypeError
|
335 |
+
|
336 |
+
|
337 |
+
class OVSFDDetector(OVFaceDetector):
|
338 |
+
def __init__(self, device, path_to_detector="models/face_detection.xml", verbose=False):
|
339 |
+
super(OVSFDDetector, self).__init__(device, verbose)
|
340 |
+
|
341 |
+
core = ov.Core()
|
342 |
+
self.face_detector = core.compile_model(path_to_detector, self.device)
|
343 |
+
|
344 |
+
def detect_from_image(self, tensor_or_path):
|
345 |
+
image = self.tensor_or_path_to_ndarray(tensor_or_path)
|
346 |
+
|
347 |
+
bboxlist = detect(self.face_detector, image, device="cpu")
|
348 |
+
keep = nms(bboxlist, 0.3)
|
349 |
+
bboxlist = bboxlist[keep, :]
|
350 |
+
bboxlist = [x for x in bboxlist if x[-1] > 0.5]
|
351 |
+
|
352 |
+
return bboxlist
|
353 |
+
|
354 |
+
def detect_from_batch(self, images):
|
355 |
+
bboxlists = batch_detect(self.face_detector, images, device="cpu")
|
356 |
+
keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
|
357 |
+
bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
|
358 |
+
bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
|
359 |
+
|
360 |
+
return bboxlists
|
361 |
+
|
362 |
+
@property
|
363 |
+
def reference_scale(self):
|
364 |
+
return 195
|
365 |
+
|
366 |
+
@property
|
367 |
+
def reference_x_shift(self):
|
368 |
+
return 0
|
369 |
+
|
370 |
+
@property
|
371 |
+
def reference_y_shift(self):
|
372 |
+
return 0
|
373 |
+
|
374 |
+
|
375 |
+
class LandmarksType(Enum):
|
376 |
+
"""Enum class defining the type of landmarks to detect.
|
377 |
+
|
378 |
+
``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
|
379 |
+
``_2halfD`` - these points represent the projection of the 3D points onto the 2D image plane
|
380 |
+
``_3D`` - detect the points ``(x,y,z)`` in a 3D space
|
381 |
+
|
382 |
+
"""
|
383 |
+
|
384 |
+
_2D = 1
|
385 |
+
_2halfD = 2
|
386 |
+
_3D = 3
|
387 |
+
|
388 |
+
|
389 |
+
class NetworkSize(Enum):
|
390 |
+
# TINY = 1
|
391 |
+
# SMALL = 2
|
392 |
+
# MEDIUM = 3
|
393 |
+
LARGE = 4
|
394 |
+
|
395 |
+
def __new__(cls, value):
|
396 |
+
member = object.__new__(cls)
|
397 |
+
member._value_ = value
|
398 |
+
return member
|
399 |
+
|
400 |
+
def __int__(self):
|
401 |
+
return self.value
|
402 |
+
|
403 |
+
|
404 |
+
class OVFaceAlignment:
|
405 |
+
def __init__(
|
406 |
+
self, landmarks_type, network_size=NetworkSize.LARGE, device="CPU", flip_input=False, verbose=False, path_to_detector="models/face_detection.xml"
|
407 |
+
):
|
408 |
+
self.device = device
|
409 |
+
self.flip_input = flip_input
|
410 |
+
self.landmarks_type = landmarks_type
|
411 |
+
self.verbose = verbose
|
412 |
+
|
413 |
+
network_size = int(network_size)
|
414 |
+
|
415 |
+
self.face_detector = OVSFDDetector(device=device, path_to_detector=path_to_detector, verbose=verbose)
|
416 |
+
|
417 |
+
def get_detections_for_batch(self, images):
|
418 |
+
images = images[..., ::-1]
|
419 |
+
detected_faces = self.face_detector.detect_from_batch(images.copy())
|
420 |
+
results = []
|
421 |
+
|
422 |
+
for i, d in enumerate(detected_faces):
|
423 |
+
if len(d) == 0:
|
424 |
+
results.append(None)
|
425 |
+
continue
|
426 |
+
d = d[0]
|
427 |
+
d = np.clip(d, 0, None)
|
428 |
+
|
429 |
+
x1, y1, x2, y2 = map(int, d[:-1])
|
430 |
+
results.append((x1, y1, x2, y2))
|
431 |
+
|
432 |
+
return results
|
433 |
+
|
434 |
+
|
435 |
+
def face_detect_ov(images, device, face_det_batch_size, pads, nosmooth, path_to_detector):
|
436 |
+
detector = OVFaceAlignment(LandmarksType._2D, flip_input=False, device=device, path_to_detector=path_to_detector)
|
437 |
+
|
438 |
+
batch_size = face_det_batch_size
|
439 |
+
|
440 |
+
print("face_detect_ov images[0].shape: ", images[0].shape)
|
441 |
+
while 1:
|
442 |
+
predictions = []
|
443 |
+
try:
|
444 |
+
for i in tqdm(range(0, len(images), batch_size)):
|
445 |
+
predictions.extend(detector.get_detections_for_batch(np.array(images[i : i + batch_size])))
|
446 |
+
except RuntimeError:
|
447 |
+
if batch_size == 1:
|
448 |
+
raise RuntimeError("Image too big to run face detection on GPU. Please use the --resize_factor argument")
|
449 |
+
batch_size //= 2
|
450 |
+
print("Recovering from OOM error; New batch size: {}".format(batch_size))
|
451 |
+
continue
|
452 |
+
break
|
453 |
+
|
454 |
+
results = []
|
455 |
+
pady1, pady2, padx1, padx2 = pads
|
456 |
+
for rect, image in zip(predictions, images):
|
457 |
+
if rect is None:
|
458 |
+
# check this frame where the face was not detected.
|
459 |
+
cv2.imwrite("temp/faulty_frame.jpg", image)
|
460 |
+
raise ValueError("Face not detected! Ensure the video contains a face in all the frames.")
|
461 |
+
|
462 |
+
y1 = max(0, rect[1] - pady1)
|
463 |
+
y2 = min(image.shape[0], rect[3] + pady2)
|
464 |
+
x1 = max(0, rect[0] - padx1)
|
465 |
+
x2 = min(image.shape[1], rect[2] + padx2)
|
466 |
+
|
467 |
+
results.append([x1, y1, x2, y2])
|
468 |
+
|
469 |
+
boxes = np.array(results)
|
470 |
+
if not nosmooth:
|
471 |
+
boxes = get_smoothened_boxes(boxes, T=5)
|
472 |
+
results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
|
473 |
+
|
474 |
+
del detector
|
475 |
+
return results
|
476 |
+
|
477 |
+
|
478 |
+
def datagen(frames, mels, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, path_to_detector):
|
479 |
+
img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
|
480 |
+
|
481 |
+
if box[0] == -1:
|
482 |
+
if not static:
|
483 |
+
# BGR2RGB for CNN face detection
|
484 |
+
face_det_results = face_detect_ov(frames, "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
|
485 |
+
else:
|
486 |
+
face_det_results = face_detect_ov([frames[0]], "CPU", face_det_batch_size, pads, nosmooth, path_to_detector)
|
487 |
+
else:
|
488 |
+
print("Using the specified bounding box instead of face detection...")
|
489 |
+
y1, y2, x1, x2 = box
|
490 |
+
face_det_results = [[f[y1:y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
|
491 |
+
|
492 |
+
for i, m in enumerate(mels):
|
493 |
+
idx = 0 if static else i % len(frames)
|
494 |
+
frame_to_save = frames[idx].copy()
|
495 |
+
face, coords = face_det_results[idx].copy()
|
496 |
+
|
497 |
+
face = cv2.resize(face, (img_size, img_size))
|
498 |
+
|
499 |
+
img_batch.append(face)
|
500 |
+
mel_batch.append(m)
|
501 |
+
frame_batch.append(frame_to_save)
|
502 |
+
coords_batch.append(coords)
|
503 |
+
|
504 |
+
if len(img_batch) >= wav2lip_batch_size:
|
505 |
+
img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
|
506 |
+
|
507 |
+
img_masked = img_batch.copy()
|
508 |
+
img_masked[:, img_size // 2 :] = 0
|
509 |
+
|
510 |
+
img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
|
511 |
+
mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
|
512 |
+
|
513 |
+
yield img_batch, mel_batch, frame_batch, coords_batch
|
514 |
+
img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
|
515 |
+
|
516 |
+
if len(img_batch) > 0:
|
517 |
+
img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
|
518 |
+
|
519 |
+
img_masked = img_batch.copy()
|
520 |
+
img_masked[:, img_size // 2 :] = 0
|
521 |
+
|
522 |
+
img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
|
523 |
+
mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
|
524 |
+
|
525 |
+
yield img_batch, mel_batch, frame_batch, coords_batch
|
526 |
+
|
527 |
+
|
528 |
+
def ov_inference(
|
529 |
+
face_path,
|
530 |
+
audio_path,
|
531 |
+
face_detection_path="models/face_detection.xml",
|
532 |
+
wav2lip_path="models/wav2lip.xml",
|
533 |
+
inference_device="CPU",
|
534 |
+
wav2lip_batch_size=128,
|
535 |
+
outfile="results/result_voice.mp4",
|
536 |
+
resize_factor=1,
|
537 |
+
rotate=False,
|
538 |
+
crop=[0, -1, 0, -1],
|
539 |
+
mel_step_size=16,
|
540 |
+
box=[-1, -1, -1, -1],
|
541 |
+
static=False,
|
542 |
+
img_size=96,
|
543 |
+
face_det_batch_size=16,
|
544 |
+
pads=[0, 10, 0, 0],
|
545 |
+
nosmooth=False,
|
546 |
+
):
|
547 |
+
print("Reading video frames...")
|
548 |
+
|
549 |
+
video_stream = cv2.VideoCapture(face_path)
|
550 |
+
fps = video_stream.get(cv2.CAP_PROP_FPS)
|
551 |
+
|
552 |
+
full_frames = []
|
553 |
+
while 1:
|
554 |
+
still_reading, frame = video_stream.read()
|
555 |
+
if not still_reading:
|
556 |
+
video_stream.release()
|
557 |
+
break
|
558 |
+
if resize_factor > 1:
|
559 |
+
frame = cv2.resize(frame, (frame.shape[1] // resize_factor, frame.shape[0] // resize_factor))
|
560 |
+
|
561 |
+
if rotate:
|
562 |
+
frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
|
563 |
+
|
564 |
+
y1, y2, x1, x2 = crop
|
565 |
+
if x2 == -1:
|
566 |
+
x2 = frame.shape[1]
|
567 |
+
if y2 == -1:
|
568 |
+
y2 = frame.shape[0]
|
569 |
+
|
570 |
+
frame = frame[y1:y2, x1:x2]
|
571 |
+
|
572 |
+
full_frames.append(frame)
|
573 |
+
|
574 |
+
print("Number of frames available for inference: " + str(len(full_frames)))
|
575 |
+
|
576 |
+
core = ov.Core()
|
577 |
+
|
578 |
+
if not audio_path.endswith(".wav"):
|
579 |
+
print("Extracting raw audio...")
|
580 |
+
command = "ffmpeg -y -i {} -strict -2 {}".format(audio_path, "temp/temp.wav")
|
581 |
+
|
582 |
+
subprocess.call(command, shell=True)
|
583 |
+
audio_path = "temp/temp.wav"
|
584 |
+
|
585 |
+
wav = audio.load_wav(audio_path, 16000)
|
586 |
+
mel = audio.melspectrogram(wav)
|
587 |
+
print(mel.shape)
|
588 |
+
|
589 |
+
if np.isnan(mel.reshape(-1)).sum() > 0:
|
590 |
+
raise ValueError("Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again")
|
591 |
+
|
592 |
+
mel_chunks = []
|
593 |
+
mel_idx_multiplier = 80.0 / fps
|
594 |
+
i = 0
|
595 |
+
while 1:
|
596 |
+
start_idx = int(i * mel_idx_multiplier)
|
597 |
+
if start_idx + mel_step_size > len(mel[0]):
|
598 |
+
mel_chunks.append(mel[:, len(mel[0]) - mel_step_size :])
|
599 |
+
break
|
600 |
+
mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
|
601 |
+
i += 1
|
602 |
+
|
603 |
+
print("Length of mel chunks: {}".format(len(mel_chunks)))
|
604 |
+
|
605 |
+
full_frames = full_frames[: len(mel_chunks)]
|
606 |
+
batch_size = wav2lip_batch_size
|
607 |
+
gen = datagen(full_frames.copy(), mel_chunks, box, static, face_det_batch_size, pads, nosmooth, img_size, wav2lip_batch_size, face_detection_path)
|
608 |
+
for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, total=int(np.ceil(float(len(mel_chunks)) / batch_size)))):
|
609 |
+
if i == 0:
|
610 |
+
img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
|
611 |
+
mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
|
612 |
+
compiled_wav2lip_model = core.compile_model(wav2lip_path, inference_device)
|
613 |
+
print("Model loaded")
|
614 |
+
|
615 |
+
frame_h, frame_w = full_frames[0].shape[:-1]
|
616 |
+
out = cv2.VideoWriter("C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", cv2.VideoWriter_fourcc(*"DIVX"), fps, (frame_w, frame_h))
|
617 |
+
pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch.numpy(), "face_sequences": img_batch.numpy()})[0]
|
618 |
+
else:
|
619 |
+
img_batch = np.transpose(img_batch, (0, 3, 1, 2))
|
620 |
+
mel_batch = np.transpose(mel_batch, (0, 3, 1, 2))
|
621 |
+
pred_ov = compiled_wav2lip_model({"audio_sequences": mel_batch, "face_sequences": img_batch})[0]
|
622 |
+
|
623 |
+
|
624 |
+
pred_ov = pred_ov.transpose(0, 2, 3, 1) * 255.0
|
625 |
+
for p, f, c in zip(pred_ov, frames, coords):
|
626 |
+
y1, y2, x1, x2 = c
|
627 |
+
p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
|
628 |
+
|
629 |
+
f[y1:y2, x1:x2] = p
|
630 |
+
out.write(f)
|
631 |
+
|
632 |
+
out.release()
|
633 |
+
|
634 |
+
command = "ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}".format(audio_path, "C:/programacionEjercicios/miwav2lipv6/src/Wav2Lip/temp/result.avi", outfile)
|
635 |
+
subprocess.call(command, shell=True)
|
636 |
+
|
637 |
+
return outfile
|
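A minimal call sketch for `ov_inference`, shown for orientation only: the asset filenames come from the project structure and the model paths mirror the defaults above, so treat them as assumptions about the local layout. `src/run_inference.py` below wraps the same call with file checks.

# Hypothetical minimal driver for ov_inference (paths assumed from the project layout)
from ov_inference import ov_inference

result = ov_inference(
    "assets/video/data_video_sun_5s.mp4",   # face video from assets/
    "assets/audio/data_audio_sun_5s.wav",   # driving audio from assets/
    face_detection_path="models/face_detection.xml",
    wav2lip_path="models/wav2lip.xml",
    inference_device="CPU",
    outfile="results/result_voice.mp4",
)
print("Lip-synced video written to:", result)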
src/ov_wav2lip_helper.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
import numpy as np
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
+
import openvino as ov
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from pathlib import Path
|
8 |
+
# Add `src` to `sys.path` so Python can find `utils/notebook_utils.py`
|
9 |
+
sys.path.append(str(Path(__file__).resolve().parent))
|
10 |
+
|
11 |
+
# Import `download_file` from `notebook_utils`
|
12 |
+
from utils.notebook_utils import download_file
|
13 |
+
from huggingface_hub import hf_hub_download
|
14 |
+
from Wav2Lip.face_detection.detection.sfd.net_s3fd import s3fd
|
15 |
+
from Wav2Lip.models import Wav2Lip
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
def _load(checkpoint_path):
|
20 |
+
checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
|
21 |
+
return checkpoint
|
22 |
+
|
23 |
+
|
24 |
+
def load_model(path):
|
25 |
+
model = Wav2Lip()
|
26 |
+
print("Load checkpoint from: {}".format(path))
|
27 |
+
checkpoint = _load(path)
|
28 |
+
s = checkpoint["state_dict"]
|
29 |
+
new_s = {}
|
30 |
+
for k, v in s.items():
|
31 |
+
new_s[k.replace("module.", "")] = v
|
32 |
+
model.load_state_dict(new_s)
|
33 |
+
|
34 |
+
return model.eval()
|
35 |
+
|
36 |
+
|
37 |
+
def download_and_convert_models(ov_face_detection_model_path, ov_wav2lip_model_path):
|
38 |
+
models_urls = {"s3fd": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"}
|
39 |
+
path_to_detector = "checkpoints/face_detection.pth"
|
40 |
+
# Convert Face Detection Model
|
41 |
+
print("Convert Face Detection Model ...")
|
42 |
+
if not os.path.isfile(path_to_detector):
|
43 |
+
download_file(models_urls["s3fd"])
|
44 |
+
if not os.path.exists("checkpoints"):
|
45 |
+
os.mkdir("checkpoints")
|
46 |
+
os.replace("s3fd-619a316812.pth", path_to_detector)
|
47 |
+
model_weights = torch.load(path_to_detector)
|
48 |
+
|
49 |
+
face_detector = s3fd()
|
50 |
+
face_detector.load_state_dict(model_weights)
|
51 |
+
|
52 |
+
if not ov_face_detection_model_path.exists():
|
53 |
+
face_detection_dummy_inputs = torch.FloatTensor(np.random.rand(1, 3, 768, 576))
|
54 |
+
face_detection_ov_model = ov.convert_model(face_detector, example_input=face_detection_dummy_inputs)
|
55 |
+
ov.save_model(face_detection_ov_model, ov_face_detection_model_path)
|
56 |
+
print("Converted face detection OpenVINO model: ", ov_face_detection_model_path)
|
57 |
+
|
58 |
+
print("Convert Wav2Lip Model ...")
|
59 |
+
path_to_wav2lip = hf_hub_download(repo_id="numz/wav2lip_studio", filename="Wav2lip/wav2lip.pth", local_dir="checkpoints")
|
60 |
+
wav2lip = load_model(path_to_wav2lip)
|
61 |
+
img_batch = torch.FloatTensor(np.random.rand(123, 6, 96, 96))
|
62 |
+
mel_batch = torch.FloatTensor(np.random.rand(123, 1, 80, 16))
|
63 |
+
|
64 |
+
if not ov_wav2lip_model_path.exists():
|
65 |
+
example_inputs = {"audio_sequences": mel_batch, "face_sequences": img_batch}
|
66 |
+
wav2lip_ov_model = ov.convert_model(wav2lip, example_input=example_inputs)
|
67 |
+
ov.save_model(wav2lip_ov_model, ov_wav2lip_model_path)
|
68 |
+
print("Converted Wav2Lip OpenVINO model: ", ov_wav2lip_model_path)
|
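A short usage sketch for `download_and_convert_models`. It is an assumption, based on the `.exists()` calls above, that both arguments are `pathlib.Path` objects pointing to the target IR files; the `models/` directory name follows the defaults used elsewhere in this commit.

# Usage sketch: convert both models into models/ (Path arguments assumed)
from pathlib import Path
from ov_wav2lip_helper import download_and_convert_models

models_dir = Path("models")
models_dir.mkdir(exist_ok=True)
download_and_convert_models(
    ov_face_detection_model_path=models_dir / "face_detection.xml",
    ov_wav2lip_model_path=models_dir / "wav2lip.xml",
)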
src/run_inference.py
ADDED
@@ -0,0 +1,67 @@
1 |
+
import os
|
2 |
+
from ov_inference import ov_inference
|
3 |
+
import soundfile as sf
|
4 |
+
import cv2
|
5 |
+
|
6 |
+
def verificar_archivos(video_path, audio_path):
|
7 |
+
"""
|
8 |
+
Verify that the video and audio files exist and are readable.
|
9 |
+
|
10 |
+
Args:
|
11 |
+
video_path (str): Path to the video file.
|
12 |
+
audio_path (str): Path to the audio file.
|
13 |
+
|
14 |
+
Returns:
|
15 |
+
bool: True if both files are readable, False otherwise.
|
16 |
+
"""
|
17 |
+
# Check the video file
|
18 |
+
if not os.path.exists(video_path):
|
19 |
+
print(f"Error: El archivo de video no existe en la ruta {video_path}")
|
20 |
+
return False
|
21 |
+
else:
|
22 |
+
# Try to open the video
|
23 |
+
cap = cv2.VideoCapture(video_path)
|
24 |
+
if not cap.isOpened():
|
25 |
+
print(f"Error: No se puede abrir el archivo de video en {video_path}")
|
26 |
+
return False
|
27 |
+
else:
|
28 |
+
print(f"Archivo de video {video_path} está accesible.")
|
29 |
+
cap.release()
|
30 |
+
|
31 |
+
# Check the audio file
|
32 |
+
if not os.path.exists(audio_path):
|
33 |
+
print(f"Error: El archivo de audio no existe en la ruta {audio_path}")
|
34 |
+
return False
|
35 |
+
else:
|
36 |
+
try:
|
37 |
+
# Try to open the audio file
|
38 |
+
with sf.SoundFile(audio_path) as audio_file:
|
39 |
+
print(f"Archivo de audio {audio_path} está accesible.")
|
40 |
+
except Exception as e:
|
41 |
+
print(f"Error al leer el archivo de audio: {e}")
|
42 |
+
return False
|
43 |
+
|
44 |
+
return True
|
45 |
+
|
46 |
+
# File paths
|
47 |
+
#video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun_5s.mp4")
|
48 |
+
video_path = os.path.abspath("../miwav2lipv6/assets/video/data_video_sun.mp4")
|
49 |
+
#audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
|
50 |
+
audio_path = os.path.abspath("../miwav2lipv6/assets/audio/audio.wav")
|
51 |
+
face_detection_path = os.path.abspath("../miwav2lipv6/models/face_detection.xml")
|
52 |
+
wav2lip_path = os.path.abspath("../miwav2lipv6/models/wav2lip.xml")
|
53 |
+
outfile = os.path.abspath("../miwav2lipv6/results/result_voice.mp4")
|
54 |
+
|
55 |
+
# Verify the files before calling ov_inference
|
56 |
+
if verificar_archivos(video_path, audio_path):
|
57 |
+
ov_inference(
|
58 |
+
video_path,
|
59 |
+
audio_path,
|
60 |
+
face_detection_path=face_detection_path,
|
61 |
+
wav2lip_path=wav2lip_path,
|
62 |
+
inference_device="CPU",
|
63 |
+
outfile=outfile,
|
64 |
+
resize_factor=2,
|
65 |
+
)
|
66 |
+
else:
|
67 |
+
print("No se pudo proceder con la inferencia debido a problemas con los archivos.")
|
src/text_to_speech.py
ADDED
@@ -0,0 +1,36 @@
1 |
+
# text_to_speech.py
|
2 |
+
|
3 |
+
from gtts import gTTS
|
4 |
+
import os
|
5 |
+
|
6 |
+
# File paths
|
7 |
+
#TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/transcripcion.txt"
|
8 |
+
TRANSCRIPTION_TEXT_PATH = "C:/programacionEjercicios/miwav2lipv6/results/OpenAI_response.txt"
|
9 |
+
OUTPUT_AUDIO_PATH = "C:/programacionEjercicios/miwav2lipv6/assets/audio/audio.wav"
|
10 |
+
|
11 |
+
def generar_audio_desde_texto():
|
12 |
+
"""
|
13 |
+
Converts the text in `TRANSCRIPTION_TEXT_PATH` into a Spanish audio file (`audio.wav`).
|
14 |
+
"""
|
15 |
+
try:
|
16 |
+
# Check whether the transcription file exists
|
17 |
+
if not os.path.exists(TRANSCRIPTION_TEXT_PATH):
|
18 |
+
print("Error: No se encontró el archivo de transcripción.")
|
19 |
+
return
|
20 |
+
|
21 |
+
# Read the contents of the transcription file
|
22 |
+
with open(TRANSCRIPTION_TEXT_PATH, "r", encoding="utf-8") as file:
|
23 |
+
texto = file.read()
|
24 |
+
|
25 |
+
# Generate the Spanish audio using gTTS
|
26 |
+
tts = gTTS(text=texto, lang='es', slow=False)
|
27 |
+
tts.save(OUTPUT_AUDIO_PATH)
|
28 |
+
|
29 |
+
print(f"Audio generado correctamente en: {OUTPUT_AUDIO_PATH}")
|
30 |
+
|
31 |
+
except Exception as e:
|
32 |
+
print(f"Error al generar el audio: {e}")
|
33 |
+
|
34 |
+
if __name__ == "__main__":
|
35 |
+
generar_audio_desde_texto()
|
36 |
+
|
src/utils/notebook_utils.py
ADDED
@@ -0,0 +1,708 @@
1 |
+
import os
|
2 |
+
import platform
|
3 |
+
import sys
|
4 |
+
import threading
|
5 |
+
import time
|
6 |
+
import urllib.parse
|
7 |
+
|
8 |
+
from os import PathLike
|
9 |
+
from pathlib import Path
|
10 |
+
from typing import List, NamedTuple, Optional, Tuple
|
11 |
+
from tqdm import tqdm
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
from openvino.runtime import Core, Type, get_version
|
15 |
+
from IPython.display import HTML, Image, display
|
16 |
+
|
17 |
+
import openvino as ov
|
18 |
+
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
|
19 |
+
from openvino.runtime import opset10 as ops
|
20 |
+
|
21 |
+
|
22 |
+
# ## Files
|
23 |
+
#
|
24 |
+
# Load an image, download a file, download an IR model, and create a progress bar to show download progress.
|
25 |
+
|
26 |
+
def device_widget(default="AUTO", exclude=None, added=None):
|
27 |
+
import openvino as ov
|
28 |
+
import ipywidgets as widgets
|
29 |
+
|
30 |
+
core = ov.Core()
|
31 |
+
|
32 |
+
supported_devices = core.available_devices + ["AUTO"]
|
33 |
+
exclude = exclude or []
|
34 |
+
if exclude:
|
35 |
+
for ex_device in exclude:
|
36 |
+
if ex_device in supported_devices:
|
37 |
+
supported_devices.remove(ex_device)
|
38 |
+
|
39 |
+
added = added or []
|
40 |
+
if added:
|
41 |
+
for add_device in added:
|
42 |
+
if add_device not in supported_devices:
|
43 |
+
supported_devices.append(add_device)
|
44 |
+
|
45 |
+
device = widgets.Dropdown(
|
46 |
+
options=supported_devices,
|
47 |
+
value=default,
|
48 |
+
description="Device:",
|
49 |
+
disabled=False,
|
50 |
+
)
|
51 |
+
return device
|
52 |
+
|
53 |
+
|
54 |
+
def quantization_widget(default=True):
|
55 |
+
import ipywidgets as widgets
|
56 |
+
|
57 |
+
to_quantize = widgets.Checkbox(
|
58 |
+
value=default,
|
59 |
+
description="Quantization",
|
60 |
+
disabled=False,
|
61 |
+
)
|
62 |
+
|
63 |
+
return to_quantize
|
64 |
+
|
65 |
+
|
66 |
+
def pip_install(*args):
|
67 |
+
import subprocess # nosec - disable B404:import-subprocess check
|
68 |
+
|
69 |
+
cli_args = []
|
70 |
+
for arg in args:
|
71 |
+
cli_args.extend(str(arg).split(" "))
|
72 |
+
subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], shell=(platform.system() == "Windows"), check=True)
|
73 |
+
|
74 |
+
|
75 |
+
def load_image(path: str) -> np.ndarray:
|
76 |
+
"""
|
77 |
+
Loads an image from `path` and returns it as BGR numpy array. `path`
|
78 |
+
should point to an image file, either a local filename or a url. The image is
|
79 |
+
not stored to the filesystem. Use the `download_file` function to download and
|
80 |
+
store an image.
|
81 |
+
|
82 |
+
:param path: Local path name or URL to image.
|
83 |
+
:return: image as BGR numpy array
|
84 |
+
"""
|
85 |
+
import cv2
|
86 |
+
import requests
|
87 |
+
|
88 |
+
if path.startswith("http"):
|
89 |
+
# Set User-Agent to Mozilla because some websites block
|
90 |
+
# requests with User-Agent Python
|
91 |
+
response = requests.get(path, headers={"User-Agent": "Mozilla/5.0"})
|
92 |
+
array = np.asarray(bytearray(response.content), dtype="uint8")
|
93 |
+
image = cv2.imdecode(array, -1) # Loads the image as BGR
|
94 |
+
else:
|
95 |
+
image = cv2.imread(path)
|
96 |
+
return image
|
97 |
+
|
98 |
+
|
99 |
+
def download_file(
|
100 |
+
url: PathLike,
|
101 |
+
filename: PathLike = None,
|
102 |
+
directory: PathLike = None,
|
103 |
+
show_progress: bool = True,
|
104 |
+
silent: bool = False,
|
105 |
+
timeout: int = 10,
|
106 |
+
) -> PathLike:
|
107 |
+
"""
|
108 |
+
Download a file from a url and save it to the local filesystem. The file is saved to the
|
109 |
+
current directory by default, or to `directory` if specified. If a filename is not given,
|
110 |
+
the filename of the URL will be used.
|
111 |
+
|
112 |
+
:param url: URL that points to the file to download
|
113 |
+
:param filename: Name of the local file to save. Should point to the name of the file only,
|
114 |
+
not the full path. If None the filename from the url will be used
|
115 |
+
:param directory: Directory to save the file to. Will be created if it doesn't exist
|
116 |
+
If None the file will be saved to the current working directory
|
117 |
+
:param show_progress: If True, show a TQDM ProgressBar
|
118 |
+
:param silent: If True, do not print a message if the file already exists
|
119 |
+
:param timeout: Number of seconds before cancelling the connection attempt
|
120 |
+
:return: path to downloaded file
|
121 |
+
"""
|
122 |
+
from tqdm.notebook import tqdm_notebook
|
123 |
+
import requests
|
124 |
+
|
125 |
+
filename = filename or Path(urllib.parse.urlparse(url).path).name
|
126 |
+
chunk_size = 16384 # make chunks bigger so that not too many updates are triggered for Jupyter front-end
|
127 |
+
|
128 |
+
filename = Path(filename)
|
129 |
+
if len(filename.parts) > 1:
|
130 |
+
raise ValueError(
|
131 |
+
"`filename` should refer to the name of the file, excluding the directory. "
|
132 |
+
"Use the `directory` parameter to specify a target directory for the downloaded file."
|
133 |
+
)
|
134 |
+
|
135 |
+
# create the directory if it does not exist, and add the directory to the filename
|
136 |
+
if directory is not None:
|
137 |
+
directory = Path(directory)
|
138 |
+
directory.mkdir(parents=True, exist_ok=True)
|
139 |
+
filename = directory / Path(filename)
|
140 |
+
|
141 |
+
try:
|
142 |
+
response = requests.get(url=url, headers={"User-agent": "Mozilla/5.0"}, stream=True)
|
143 |
+
response.raise_for_status()
|
144 |
+
except (
|
145 |
+
requests.exceptions.HTTPError
|
146 |
+
) as error: # For error associated with not-200 codes. Will output something like: "404 Client Error: Not Found for url: {url}"
|
147 |
+
raise Exception(error) from None
|
148 |
+
except requests.exceptions.Timeout:
|
149 |
+
raise Exception(
|
150 |
+
"Connection timed out. If you access the internet through a proxy server, please "
|
151 |
+
"make sure the proxy is set in the shell from where you launched Jupyter."
|
152 |
+
) from None
|
153 |
+
except requests.exceptions.RequestException as error:
|
154 |
+
raise Exception(f"File downloading failed with error: {error}") from None
|
155 |
+
|
156 |
+
# download the file if it does not exist, or if it exists with an incorrect file size
|
157 |
+
filesize = int(response.headers.get("Content-length", 0))
|
158 |
+
if not filename.exists() or (os.stat(filename).st_size != filesize):
|
159 |
+
with tqdm(
|
160 |
+
total=filesize,
|
161 |
+
unit="B",
|
162 |
+
unit_scale=True,
|
163 |
+
unit_divisor=1024,
|
164 |
+
desc=str(filename),
|
165 |
+
disable=not show_progress,
|
166 |
+
) as progress_bar:
|
167 |
+
with open(filename, "wb") as file_object:
|
168 |
+
for chunk in response.iter_content(chunk_size):
|
169 |
+
file_object.write(chunk)
|
170 |
+
progress_bar.update(len(chunk))
|
171 |
+
progress_bar.refresh()
|
172 |
+
else:
|
173 |
+
if not silent:
|
174 |
+
print(f"'{filename}' already exists.")
|
175 |
+
|
176 |
+
response.close()
|
177 |
+
|
178 |
+
return filename.resolve()
|
179 |
+
|
180 |
+
|
181 |
+
def download_ir_model(model_xml_url: str, destination_folder: PathLike = None) -> PathLike:
|
182 |
+
"""
|
183 |
+
Download IR model from `model_xml_url`. Downloads model xml and bin file; the weights file is
|
184 |
+
assumed to exist at the same location and name as model_xml_url with a ".bin" extension.
|
185 |
+
|
186 |
+
:param model_xml_url: URL to model xml file to download
|
187 |
+
:param destination_folder: Directory where downloaded model xml and bin are saved. If None, model
|
188 |
+
files are saved to the current directory
|
189 |
+
:return: path to downloaded xml model file
|
190 |
+
"""
|
191 |
+
model_bin_url = model_xml_url[:-4] + ".bin"
|
192 |
+
model_xml_path = download_file(model_xml_url, directory=destination_folder, show_progress=False)
|
193 |
+
download_file(model_bin_url, directory=destination_folder)
|
194 |
+
return model_xml_path
|
195 |
+
|
196 |
+
|
197 |
+
# ## Images
|
198 |
+
|
199 |
+
# ### Convert Pixel Data
|
200 |
+
#
|
201 |
+
# Normalize image pixel values between 0 and 1, and convert images to RGB and BGR.
|
202 |
+
|
203 |
+
# In[ ]:
|
204 |
+
|
205 |
+
|
206 |
+
def normalize_minmax(data):
|
207 |
+
"""
|
208 |
+
Normalizes the values in `data` between 0 and 1
|
209 |
+
"""
|
210 |
+
if data.max() == data.min():
|
211 |
+
raise ValueError("Normalization is not possible because all elements of " f"`data` have the same value: {data.max()}.")
|
212 |
+
return (data - data.min()) / (data.max() - data.min())
|
213 |
+
|
214 |
+
|
215 |
+
def to_rgb(image_data: np.ndarray) -> np.ndarray:
|
216 |
+
"""
|
217 |
+
Convert image_data from BGR to RGB
|
218 |
+
"""
|
219 |
+
import cv2
|
220 |
+
|
221 |
+
return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
|
222 |
+
|
223 |
+
|
224 |
+
def to_bgr(image_data: np.ndarray) -> np.ndarray:
|
225 |
+
"""
|
226 |
+
Convert image_data from RGB to BGR
|
227 |
+
"""
|
228 |
+
import cv2
|
229 |
+
|
230 |
+
return cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
|
231 |
+
|
232 |
+
|
233 |
+
# ## Videos
|
234 |
+
|
235 |
+
# ### Video Player
|
236 |
+
#
|
237 |
+
# Custom video player to fulfill FPS requirements. You can set target FPS and output size, flip the video horizontally or skip first N frames.
|
238 |
+
|
239 |
+
# In[ ]:
|
240 |
+
|
241 |
+
|
242 |
+
class VideoPlayer:
|
243 |
+
"""
|
244 |
+
Custom video player to fulfill FPS requirements. You can set target FPS and output size,
|
245 |
+
flip the video horizontally or skip first N frames.
|
246 |
+
|
247 |
+
:param source: Video source. It could be either camera device or video file.
|
248 |
+
:param size: Output frame size.
|
249 |
+
:param flip: Flip source horizontally.
|
250 |
+
:param fps: Target FPS.
|
251 |
+
:param skip_first_frames: Skip first N frames.
|
252 |
+
"""
|
253 |
+
|
254 |
+
def __init__(self, source, size=None, flip=False, fps=None, skip_first_frames=0, width=1280, height=720):
|
255 |
+
import cv2
|
256 |
+
|
257 |
+
self.cv2 = cv2 # This is done to access the package in class methods
|
258 |
+
self.__cap = cv2.VideoCapture(source)
|
259 |
+
# try HD by default to get better video quality
|
260 |
+
self.__cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
|
261 |
+
self.__cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
|
262 |
+
|
263 |
+
if not self.__cap.isOpened():
|
264 |
+
raise RuntimeError(f"Cannot open {'camera' if isinstance(source, int) else ''} {source}")
|
265 |
+
# skip first N frames
|
266 |
+
self.__cap.set(cv2.CAP_PROP_POS_FRAMES, skip_first_frames)
|
267 |
+
# fps of input file
|
268 |
+
self.__input_fps = self.__cap.get(cv2.CAP_PROP_FPS)
|
269 |
+
if self.__input_fps <= 0:
|
270 |
+
self.__input_fps = 60
|
271 |
+
# target fps given by user
|
272 |
+
self.__output_fps = fps if fps is not None else self.__input_fps
|
273 |
+
self.__flip = flip
|
274 |
+
self.__size = None
|
275 |
+
self.__interpolation = None
|
276 |
+
if size is not None:
|
277 |
+
self.__size = size
|
278 |
+
# AREA better for shrinking, LINEAR better for enlarging
|
279 |
+
self.__interpolation = cv2.INTER_AREA if size[0] < self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH) else cv2.INTER_LINEAR
|
280 |
+
# first frame
|
281 |
+
_, self.__frame = self.__cap.read()
|
282 |
+
self.__lock = threading.Lock()
|
283 |
+
self.__thread = None
|
284 |
+
self.__stop = False
|
285 |
+
|
286 |
+
"""
|
287 |
+
Start playing.
|
288 |
+
"""
|
289 |
+
|
290 |
+
def start(self):
|
291 |
+
self.__stop = False
|
292 |
+
self.__thread = threading.Thread(target=self.__run, daemon=True)
|
293 |
+
self.__thread.start()
|
294 |
+
|
295 |
+
"""
|
296 |
+
Stop playing and release resources.
|
297 |
+
"""
|
298 |
+
|
299 |
+
def stop(self):
|
300 |
+
self.__stop = True
|
301 |
+
if self.__thread is not None:
|
302 |
+
self.__thread.join()
|
303 |
+
self.__cap.release()
|
304 |
+
|
305 |
+
def __run(self):
|
306 |
+
prev_time = 0
|
307 |
+
while not self.__stop:
|
308 |
+
t1 = time.time()
|
309 |
+
ret, frame = self.__cap.read()
|
310 |
+
if not ret:
|
311 |
+
break
|
312 |
+
|
313 |
+
# fulfill target fps
|
314 |
+
if 1 / self.__output_fps < time.time() - prev_time:
|
315 |
+
prev_time = time.time()
|
316 |
+
# replace by current frame
|
317 |
+
with self.__lock:
|
318 |
+
self.__frame = frame
|
319 |
+
|
320 |
+
t2 = time.time()
|
321 |
+
# time to wait [s] to fulfill input fps
|
322 |
+
wait_time = 1 / self.__input_fps - (t2 - t1)
|
323 |
+
# wait until
|
324 |
+
time.sleep(max(0, wait_time))
|
325 |
+
|
326 |
+
self.__frame = None
|
327 |
+
|
328 |
+
"""
|
329 |
+
Get current frame.
|
330 |
+
"""
|
331 |
+
|
332 |
+
def next(self):
|
333 |
+
import cv2
|
334 |
+
|
335 |
+
with self.__lock:
|
336 |
+
if self.__frame is None:
|
337 |
+
return None
|
338 |
+
# need to copy frame, because can be cached and reused if fps is low
|
339 |
+
frame = self.__frame.copy()
|
340 |
+
if self.__size is not None:
|
341 |
+
frame = self.cv2.resize(frame, self.__size, interpolation=self.__interpolation)
|
342 |
+
if self.__flip:
|
343 |
+
frame = self.cv2.flip(frame, 1)
|
344 |
+
return frame
|
345 |
+
|
346 |
+
|
347 |
+
# ## Visualization
|
348 |
+
|
349 |
+
# ### Segmentation
|
350 |
+
#
|
351 |
+
# Define a SegmentationMap NamedTuple that keeps the labels and colormap for a segmentation project/dataset. Create CityScapesSegmentation and BinarySegmentation SegmentationMaps. Create a function to convert a segmentation map to an RGB image with a colormap, and to show the segmentation result as an overlay over the original image.
|
352 |
+
|
353 |
+
# In[ ]:
|
354 |
+
|
355 |
+
|
356 |
+
class Label(NamedTuple):
|
357 |
+
index: int
|
358 |
+
color: Tuple
|
359 |
+
name: Optional[str] = None
|
360 |
+
|
361 |
+
|
362 |
+
# In[ ]:
|
363 |
+
|
364 |
+
|
365 |
+
class SegmentationMap(NamedTuple):
|
366 |
+
labels: List
|
367 |
+
|
368 |
+
def get_colormap(self):
|
369 |
+
return np.array([label.color for label in self.labels])
|
370 |
+
|
371 |
+
def get_labels(self):
|
372 |
+
labelnames = [label.name for label in self.labels]
|
373 |
+
if any(labelnames):
|
374 |
+
return labelnames
|
375 |
+
else:
|
376 |
+
return None
|
377 |
+
|
378 |
+
|
379 |
+
# In[ ]:
|
380 |
+
|
381 |
+
|
382 |
+
cityscape_labels = [
|
383 |
+
Label(index=0, color=(128, 64, 128), name="road"),
|
384 |
+
Label(index=1, color=(244, 35, 232), name="sidewalk"),
|
385 |
+
Label(index=2, color=(70, 70, 70), name="building"),
|
386 |
+
Label(index=3, color=(102, 102, 156), name="wall"),
|
387 |
+
Label(index=4, color=(190, 153, 153), name="fence"),
|
388 |
+
Label(index=5, color=(153, 153, 153), name="pole"),
|
389 |
+
Label(index=6, color=(250, 170, 30), name="traffic light"),
|
390 |
+
Label(index=7, color=(220, 220, 0), name="traffic sign"),
|
391 |
+
Label(index=8, color=(107, 142, 35), name="vegetation"),
|
392 |
+
Label(index=9, color=(152, 251, 152), name="terrain"),
|
393 |
+
Label(index=10, color=(70, 130, 180), name="sky"),
|
394 |
+
Label(index=11, color=(220, 20, 60), name="person"),
|
395 |
+
Label(index=12, color=(255, 0, 0), name="rider"),
|
396 |
+
Label(index=13, color=(0, 0, 142), name="car"),
|
397 |
+
Label(index=14, color=(0, 0, 70), name="truck"),
|
398 |
+
Label(index=15, color=(0, 60, 100), name="bus"),
|
399 |
+
Label(index=16, color=(0, 80, 100), name="train"),
|
400 |
+
Label(index=17, color=(0, 0, 230), name="motorcycle"),
|
401 |
+
Label(index=18, color=(119, 11, 32), name="bicycle"),
|
402 |
+
Label(index=19, color=(255, 255, 255), name="background"),
|
403 |
+
]
|
404 |
+
|
405 |
+
CityScapesSegmentation = SegmentationMap(cityscape_labels)
|
406 |
+
|
407 |
+
binary_labels = [
|
408 |
+
Label(index=0, color=(255, 255, 255), name="background"),
|
409 |
+
Label(index=1, color=(0, 0, 0), name="foreground"),
|
410 |
+
]
|
411 |
+
|
412 |
+
BinarySegmentation = SegmentationMap(binary_labels)
|
413 |
+
|
414 |
+
|
415 |
+
# In[ ]:
|
416 |
+
|
417 |
+
|
418 |
+
def segmentation_map_to_image(result: np.ndarray, colormap: np.ndarray, remove_holes: bool = False) -> np.ndarray:
|
419 |
+
"""
|
420 |
+
Convert network result of floating point numbers to an RGB image with
|
421 |
+
integer values from 0-255 by applying a colormap.
|
422 |
+
|
423 |
+
:param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
|
424 |
+
:param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
|
425 |
+
:param remove_holes: If True, remove holes in the segmentation result.
|
426 |
+
:return: An RGB image where each pixel is an int8 value according to colormap.
|
427 |
+
"""
|
428 |
+
import cv2
|
429 |
+
|
430 |
+
if len(result.shape) != 2 and result.shape[0] != 1:
|
431 |
+
raise ValueError(f"Expected result with shape (H,W) or (1,H,W), got result with shape {result.shape}")
|
432 |
+
|
433 |
+
if len(np.unique(result)) > colormap.shape[0]:
|
434 |
+
raise ValueError(
|
435 |
+
f"Expected max {colormap.shape[0]} classes in result, got {len(np.unique(result))} "
|
436 |
+
"different output values. Please make sure to convert the network output to "
|
437 |
+
"pixel values before calling this function."
|
438 |
+
)
|
439 |
+
elif result.shape[0] == 1:
|
440 |
+
result = result.squeeze(0)
|
441 |
+
|
442 |
+
result = result.astype(np.uint8)
|
443 |
+
|
444 |
+
contour_mode = cv2.RETR_EXTERNAL if remove_holes else cv2.RETR_TREE
|
445 |
+
mask = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8)
|
446 |
+
for label_index, color in enumerate(colormap):
|
447 |
+
label_index_map = result == label_index
|
448 |
+
label_index_map = label_index_map.astype(np.uint8) * 255
|
449 |
+
contours, hierarchies = cv2.findContours(label_index_map, contour_mode, cv2.CHAIN_APPROX_SIMPLE)
|
450 |
+
cv2.drawContours(
|
451 |
+
mask,
|
452 |
+
contours,
|
453 |
+
contourIdx=-1,
|
454 |
+
color=color.tolist(),
|
455 |
+
thickness=cv2.FILLED,
|
456 |
+
)
|
457 |
+
|
458 |
+
return mask
|
459 |
+
|
460 |
+
|
461 |
+
def segmentation_map_to_overlay(image, result, alpha, colormap, remove_holes=False) -> np.ndarray:
|
462 |
+
"""
|
463 |
+
Returns a new image where a segmentation mask (created with colormap) is overlayed on
|
464 |
+
the source image.
|
465 |
+
|
466 |
+
:param image: Source image.
|
467 |
+
:param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
|
468 |
+
:param alpha: Alpha transparency value for the overlay image.
|
469 |
+
:param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
|
470 |
+
:param remove_holes: If True, remove holes in the segmentation result.
|
471 |
+
:return: An RGB image with the segmentation mask overlayed on the source image.
|
472 |
+
"""
|
473 |
+
import cv2
|
474 |
+
|
475 |
+
if len(image.shape) == 2:
|
476 |
+
image = np.repeat(np.expand_dims(image, -1), 3, 2)
|
477 |
+
mask = segmentation_map_to_image(result, colormap, remove_holes)
|
478 |
+
image_height, image_width = image.shape[:2]
|
479 |
+
mask = cv2.resize(src=mask, dsize=(image_width, image_height))
|
480 |
+
return cv2.addWeighted(mask, alpha, image, 1 - alpha, 0)
|
481 |
+
|
482 |
+
|
483 |
+
# ### Network Results
|
484 |
+
#
|
485 |
+
# Show network result image, optionally together with the source image and a legend with labels.
|
486 |
+
|
487 |
+
# In[ ]:
|
488 |
+
|
489 |
+
|
490 |
+
def viz_result_image(
|
491 |
+
result_image: np.ndarray,
|
492 |
+
source_image: np.ndarray = None,
|
493 |
+
source_title: str = None,
|
494 |
+
result_title: str = None,
|
495 |
+
labels: List[Label] = None,
|
496 |
+
resize: bool = False,
|
497 |
+
bgr_to_rgb: bool = False,
|
498 |
+
hide_axes: bool = False,
|
499 |
+
):
|
500 |
+
"""
|
501 |
+
Show result image, optionally together with source images, and a legend with labels.
|
502 |
+
|
503 |
+
:param result_image: Numpy array of RGB result image.
|
504 |
+
:param source_image: Numpy array of source image. If provided this image will be shown
|
505 |
+
next to the result image. source_image is expected to be in RGB format.
|
506 |
+
Set bgr_to_rgb to True if source_image is in BGR format.
|
507 |
+
:param source_title: Title to display for the source image.
|
508 |
+
:param result_title: Title to display for the result image.
|
509 |
+
:param labels: List of labels. If provided, a legend will be shown with the given labels.
|
510 |
+
:param resize: If true, resize the result image to the same shape as the source image.
|
511 |
+
:param bgr_to_rgb: If true, convert the source image from BGR to RGB. Use this option if
|
512 |
+
source_image is a BGR image.
|
513 |
+
:param hide_axes: If true, do not show matplotlib axes.
|
514 |
+
:return: Matplotlib figure with result image
|
515 |
+
"""
|
516 |
+
import cv2
|
517 |
+
import matplotlib.pyplot as plt
|
518 |
+
from matplotlib.lines import Line2D
|
519 |
+
|
520 |
+
if bgr_to_rgb:
|
521 |
+
source_image = to_rgb(source_image)
|
522 |
+
if resize:
|
523 |
+
result_image = cv2.resize(result_image, (source_image.shape[1], source_image.shape[0]))
|
524 |
+
|
525 |
+
num_images = 1 if source_image is None else 2
|
526 |
+
|
527 |
+
fig, ax = plt.subplots(1, num_images, figsize=(16, 8), squeeze=False)
|
528 |
+
if source_image is not None:
|
529 |
+
ax[0, 0].imshow(source_image)
|
530 |
+
ax[0, 0].set_title(source_title)
|
531 |
+
|
532 |
+
ax[0, num_images - 1].imshow(result_image)
|
533 |
+
ax[0, num_images - 1].set_title(result_title)
|
534 |
+
|
535 |
+
if hide_axes:
|
536 |
+
for a in ax.ravel():
|
537 |
+
a.axis("off")
|
538 |
+
if labels:
|
539 |
+
colors = labels.get_colormap()
|
540 |
+
lines = [
|
541 |
+
Line2D(
|
542 |
+
[0],
|
543 |
+
[0],
|
544 |
+
color=[item / 255 for item in c.tolist()],
|
545 |
+
linewidth=3,
|
546 |
+
linestyle="-",
|
547 |
+
)
|
548 |
+
for c in colors
|
549 |
+
]
|
550 |
+
plt.legend(
|
551 |
+
lines,
|
552 |
+
labels.get_labels(),
|
553 |
+
bbox_to_anchor=(1, 1),
|
554 |
+
loc="upper left",
|
555 |
+
prop={"size": 12},
|
556 |
+
)
|
557 |
+
plt.close(fig)
|
558 |
+
return fig
|
559 |
+
|
560 |
+
|
561 |
+
# ### Live Inference
|
562 |
+
|
563 |
+
# In[ ]:
|
564 |
+
|
565 |
+
|
566 |
+
def show_array(frame: np.ndarray, display_handle=None):
|
567 |
+
"""
|
568 |
+
Display array `frame`. Replace information at `display_handle` with `frame`
|
569 |
+
encoded as jpeg image. `frame` is expected to have data in BGR order.
|
570 |
+
|
571 |
+
Create a display_handle with: `display_handle = display(display_id=True)`
|
572 |
+
"""
|
573 |
+
import cv2
|
574 |
+
|
575 |
+
_, frame = cv2.imencode(ext=".jpeg", img=frame)
|
576 |
+
if display_handle is None:
|
577 |
+
display_handle = display(Image(data=frame.tobytes()), display_id=True)
|
578 |
+
else:
|
579 |
+
display_handle.update(Image(data=frame.tobytes()))
|
580 |
+
return display_handle
|
581 |
+
|
582 |
+
|
583 |
+
# ## Checks and Alerts
|
584 |
+
#
|
585 |
+
# Create an alert class to show stylized info/error/warning messages and a `check_device` function that checks whether a given device is available.
|
586 |
+
|
587 |
+
# In[ ]:
|
588 |
+
|
589 |
+
|
590 |
+
class NotebookAlert(Exception):
|
591 |
+
def __init__(self, message: str, alert_class: str):
|
592 |
+
"""
|
593 |
+
Show an alert box with the given message.
|
594 |
+
|
595 |
+
:param message: The message to display.
|
596 |
+
:param alert_class: The class for styling the message. Options: info, warning, success, danger.
|
597 |
+
"""
|
598 |
+
self.message = message
|
599 |
+
self.alert_class = alert_class
|
600 |
+
self.show_message()
|
601 |
+
|
602 |
+
def show_message(self):
|
603 |
+
display(HTML(f"""<div class="alert alert-{self.alert_class}">{self.message}"""))
|
604 |
+
|
605 |
+
|
606 |
+
class DeviceNotFoundAlert(NotebookAlert):
|
607 |
+
def __init__(self, device: str):
|
608 |
+
"""
|
609 |
+
Show a warning message about an unavailable device. This class does not check whether or
|
610 |
+
not the device is available, use the `check_device` function to check this. `check_device`
|
611 |
+
also shows the warning if the device is not found.
|
612 |
+
|
613 |
+
:param device: The unavailable device.
|
614 |
+
:return: A formatted alert box with the message that `device` is not available, and a list
|
615 |
+
of devices that are available.
|
616 |
+
"""
|
617 |
+
ie = Core()
|
618 |
+
supported_devices = ie.available_devices
|
619 |
+
self.message = f"Running this cell requires a {device} device, " "which is not available on this system. "
|
620 |
+
self.alert_class = "warning"
|
621 |
+
if len(supported_devices) == 1:
|
622 |
+
self.message += f"The following device is available: {ie.available_devices[0]}"
|
623 |
+
else:
|
624 |
+
self.message += "The following devices are available: " f"{', '.join(ie.available_devices)}"
|
625 |
+
super().__init__(self.message, self.alert_class)
|
626 |
+
|
627 |
+
|
628 |
+
def check_device(device: str) -> bool:
|
629 |
+
"""
|
630 |
+
Check if the specified device is available on the system.
|
631 |
+
|
632 |
+
:param device: Device to check. e.g. CPU, GPU
|
633 |
+
:return: True if the device is available, False if not. If the device is not available,
|
634 |
+
a DeviceNotFoundAlert will be shown.
|
635 |
+
"""
|
636 |
+
ie = Core()
|
637 |
+
if device not in ie.available_devices:
|
638 |
+
DeviceNotFoundAlert(device)
|
639 |
+
return False
|
640 |
+
else:
|
641 |
+
return True
|
642 |
+
|
643 |
+
|
644 |
+
def check_openvino_version(version: str) -> bool:
|
645 |
+
"""
|
646 |
+
Check if the specified OpenVINO version is installed.
|
647 |
+
|
648 |
+
:param version: the OpenVINO version to check. Example: 2021.4
|
649 |
+
:return: True if the version is installed, False if not. If the version is not installed,
|
650 |
+
an alert message will be shown.
|
651 |
+
"""
|
652 |
+
installed_version = get_version()
|
653 |
+
if version not in installed_version:
|
654 |
+
NotebookAlert(
|
655 |
+
f"This notebook requires OpenVINO {version}. "
|
656 |
+
f"The version on your system is: <i>{installed_version}</i>.<br>"
|
657 |
+
"Please run <span style='font-family:monospace'>pip install --upgrade -r requirements.txt</span> "
|
658 |
+
"in the openvino_env environment to install this version. "
|
659 |
+
"See the <a href='https://github.com/openvinotoolkit/openvino_notebooks'>"
|
660 |
+
"OpenVINO Notebooks README</a> for detailed instructions",
|
661 |
+
alert_class="danger",
|
662 |
+
)
|
663 |
+
return False
|
664 |
+
else:
|
665 |
+
return True
|
666 |
+
|
667 |
+
|
668 |
+
packed_layername_tensor_dict_list = [{"name": "aten::mul/Multiply"}]
|
669 |
+
|
670 |
+
|
671 |
+
class ReplaceTensor(MatcherPass):
|
672 |
+
def __init__(self, packed_layername_tensor_dict_list):
|
673 |
+
MatcherPass.__init__(self)
|
674 |
+
self.model_changed = False
|
675 |
+
|
676 |
+
param = WrapType("opset10.Multiply")
|
677 |
+
|
678 |
+
def callback(matcher: Matcher) -> bool:
|
679 |
+
root = matcher.get_match_root()
|
680 |
+
if root is None:
|
681 |
+
return False
|
682 |
+
for y in packed_layername_tensor_dict_list:
|
683 |
+
root_name = root.get_friendly_name()
|
684 |
+
if root_name.find(y["name"]) != -1:
|
685 |
+
max_fp16 = np.array([[[[-np.finfo(np.float16).max]]]]).astype(np.float32)
|
686 |
+
new_tenser = ops.constant(max_fp16, Type.f32, name="Constant_4431")
|
687 |
+
root.set_arguments([root.input_value(0).node, new_tenser])
|
688 |
+
packed_layername_tensor_dict_list.remove(y)
|
689 |
+
|
690 |
+
return True
|
691 |
+
|
692 |
+
self.register_matcher(Matcher(param, "ReplaceTensor"), callback)
|
693 |
+
|
694 |
+
|
695 |
+
def optimize_bge_embedding(model_path, output_model_path):
|
696 |
+
"""
|
697 |
+
optimize_bge_embedding is used to optimize a BGE embedding model for the NPU device
|
698 |
+
|
699 |
+
Arguments:
|
700 |
+
model_path {str} -- original BGE IR model path
|
701 |
+
output_model_path {str} -- Converted BGE IR model path
|
702 |
+
"""
|
703 |
+
core = Core()
|
704 |
+
ov_model = core.read_model(model_path)
|
705 |
+
manager = Manager()
|
706 |
+
manager.register_pass(ReplaceTensor(packed_layername_tensor_dict_list))
|
707 |
+
manager.run_passes(ov_model)
|
708 |
+
ov.save_model(ov_model, output_model_path, compress_to_fp16=False)
|
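For orientation, a small sketch of how the two most-used helpers in this module are typically called; the URL is the same s3fd checkpoint used by `ov_wav2lip_helper.py`, and `device_widget` assumes an ipywidgets-capable notebook environment.

# Illustrative use of notebook_utils helpers
from utils.notebook_utils import download_file, device_widget

# Download the s3fd face-detection weights into checkpoints/
# (re-downloads only if the file is missing or its size differs from the server's Content-Length)
weights_path = download_file(
    "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
    directory="checkpoints",
)
print("Weights at:", weights_path)

# In a notebook, select the OpenVINO inference device from a dropdown
device = device_widget(default="CPU")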
src/utils/pip_helper.py
ADDED
@@ -0,0 +1,10 @@
1 |
+
import sys
|
2 |
+
|
3 |
+
|
4 |
+
def pip_install(*args):
|
5 |
+
import subprocess # nosec - disable B404:import-subprocess check
|
6 |
+
|
7 |
+
cli_args = []
|
8 |
+
for arg in args:
|
9 |
+
cli_args.extend(str(arg).split(" "))
|
10 |
+
subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], check=True)
|
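A brief usage note: `pip_install` splits each argument on spaces before passing it to `python -m pip install`, so flags and requirement specifiers can be mixed freely. A minimal sketch, with package pins mirroring the project's requirements:

# Usage sketch for pip_install
from utils.pip_helper import pip_install

pip_install("--upgrade pip")
pip_install("openvino>=2024.4.0", "openai-whisper")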
src/whisper_audio_extractor.py
ADDED
@@ -0,0 +1,47 @@
1 |
+
# whisper_audio_extractor.py
|
2 |
+
|
3 |
+
import sounddevice as sd
|
4 |
+
from scipy.io.wavfile import write
|
5 |
+
import whisper
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Path where the audio file is temporarily saved
|
9 |
+
AUDIO_PATH = os.path.join("..", "assets", "audio", "recorded_audio.wav")
|
10 |
+
|
11 |
+
def record_audio(duration=5, sample_rate=44100):
|
12 |
+
"""
|
13 |
+
Records audio from the microphone for a given duration and saves it as a WAV file.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
duration (int): Recording duration in seconds.
|
17 |
+
sample_rate (int): Audio sample rate.
|
18 |
+
"""
|
19 |
+
print("Grabando...")
|
20 |
+
audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
|
21 |
+
sd.wait() # Wait for the recording to finish
|
22 |
+
write(AUDIO_PATH, sample_rate, audio_data) # Save the audio to the specified directory
|
23 |
+
print(f"Grabación completa. Archivo guardado en {AUDIO_PATH}")
|
24 |
+
|
25 |
+
def transcribe_audio():
|
26 |
+
"""
|
27 |
+
Usa el modelo Whisper para transcribir el audio grabado y devuelve el texto.
|
28 |
+
|
29 |
+
Returns:
|
30 |
+
str: Transcribed text of the audio.
|
31 |
+
"""
|
32 |
+
# Load the Whisper model
|
33 |
+
model = whisper.load_model("base")
|
34 |
+
|
35 |
+
# Transcribe the audio
|
36 |
+
print("Transcribiendo el audio...")
|
37 |
+
result = model.transcribe(AUDIO_PATH)
|
38 |
+
print("Transcripción completada.")
|
39 |
+
return result["text"]
|
40 |
+
|
41 |
+
if __name__ == "__main__":
|
42 |
+
# Step 1: Record audio
|
43 |
+
record_audio()
|
44 |
+
|
45 |
+
# Paso 2: Transcribir audio
|
46 |
+
texto = transcribe_audio()
|
47 |
+
print("Texto extraído:", texto)
|
src/whisper_audio_transcriber.py
ADDED
@@ -0,0 +1,109 @@
1 |
+
# whisper_audio_transcriber.py
|
2 |
+
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
import requests
|
6 |
+
import librosa
|
7 |
+
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
|
8 |
+
from transformers.utils import logging
|
9 |
+
import soundfile as sf
|
10 |
+
|
11 |
+
# Model definitions
|
12 |
+
model_ids = {
|
13 |
+
"Multilingual models": [
|
14 |
+
"openai/whisper-large-v3-turbo",
|
15 |
+
"openai/whisper-large-v3",
|
16 |
+
"openai/whisper-large-v2",
|
17 |
+
"openai/whisper-large",
|
18 |
+
"openai/whisper-medium",
|
19 |
+
"openai/whisper-small",
|
20 |
+
"openai/whisper-base",
|
21 |
+
"openai/whisper-tiny",
|
22 |
+
],
|
23 |
+
"English-only models": [
|
24 |
+
"distil-whisper/distil-large-v2",
|
25 |
+
"distil-whisper/distil-large-v3",
|
26 |
+
"distil-whisper/distil-medium.en",
|
27 |
+
"distil-whisper/distil-small.en",
|
28 |
+
"openai/whisper-medium.en",
|
29 |
+
"openai/whisper-small.en",
|
30 |
+
"openai/whisper-base.en",
|
31 |
+
"openai/whisper-tiny.en",
|
32 |
+
],
|
33 |
+
}
|
34 |
+
|
35 |
+
def download_file(url, filename, directory="."):
|
36 |
+
"""
|
37 |
+
Downloads a file from a URL and saves it to the specified directory.
|
38 |
+
"""
|
39 |
+
os.makedirs(directory, exist_ok=True)
|
40 |
+
filepath = Path(directory) / filename
|
41 |
+
response = requests.get(url)
|
42 |
+
filepath.write_bytes(response.content)
|
43 |
+
return filepath
|
44 |
+
|
45 |
+
def transcribe_audio(file_path, model_name):
|
46 |
+
"""
|
47 |
+
Transcribes the audio using a Whisper model.
|
48 |
+
|
49 |
+
Args:
|
50 |
+
file_path (str): Path to the audio file.
|
51 |
+
model_name (str): Name of the Whisper model.
|
52 |
+
|
53 |
+
Returns:
|
54 |
+
str: Transcription of the audio.
|
55 |
+
"""
|
56 |
+
processor = AutoProcessor.from_pretrained(model_name)
|
57 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
|
58 |
+
|
59 |
+
# Create the transcription pipeline
|
60 |
+
pipe = pipeline(
|
61 |
+
"automatic-speech-recognition",
|
62 |
+
model=model,
|
63 |
+
tokenizer=processor.tokenizer,
|
64 |
+
feature_extractor=processor.feature_extractor,
|
65 |
+
device="cpu", # Change to "cuda" if a GPU is available
|
66 |
+
)
|
67 |
+
|
68 |
+
# Load the audio file
|
69 |
+
audio_data, samplerate = librosa.load(file_path, sr=16000)
|
70 |
+
|
71 |
+
# Transcribe the audio
|
72 |
+
result = pipe(audio_data)
|
73 |
+
return result["text"]
|
74 |
+
|
75 |
+
def guardar_transcripcion(texto, filename="transcripcion.txt", directory="../results"):
|
76 |
+
"""
|
77 |
+
Saves the transcribed text to a .txt file in the specified directory.
|
78 |
+
|
79 |
+
Args:
|
80 |
+
texto (str): Transcribed text to save.
|
81 |
+
filename (str): Name of the .txt file.
|
82 |
+
directory (str): Directory where the file will be saved.
|
83 |
+
"""
|
84 |
+
os.makedirs(directory, exist_ok=True) # Create the directory if it does not exist
|
85 |
+
file_path = Path(directory) / filename
|
86 |
+
with open(file_path, "w", encoding="utf-8") as f:
|
87 |
+
f.write(texto)
|
88 |
+
print(f"Transcripción guardada en: {file_path}")
|
89 |
+
|
90 |
+
def main():
|
91 |
+
# Configure logging to show errors only
|
92 |
+
logging.set_verbosity_error()
|
93 |
+
|
94 |
+
# Path to the audio file
|
95 |
+
audio_path = os.path.abspath("../miwav2lipv6/assets/audio/grabacion_gradio.wav")
|
96 |
+
|
97 |
+
# Selected model
|
98 |
+
model_name = "openai/whisper-large" # Change this to the desired model
|
99 |
+
|
100 |
+
# Transcribe the audio
|
101 |
+
print(f"Transcribiendo el audio del archivo: {audio_path}")
|
102 |
+
transcription = transcribe_audio(audio_path, model_name)
|
103 |
+
print(f"Transcripción: {transcription}")
|
104 |
+
|
105 |
+
# Save the transcription to a .txt file
|
106 |
+
guardar_transcripcion(transcription)
|
107 |
+
|
108 |
+
if __name__ == "__main__":
|
109 |
+
main()
|
tests/test_whisper_audio_extractor.py
ADDED
@@ -0,0 +1,29 @@
1 |
+
import os
|
2 |
+
import pytest
|
3 |
+
from src.whisper_audio_extractor import record_audio, transcribe_audio, AUDIO_PATH
|
4 |
+
|
5 |
+
def test_record_audio():
|
6 |
+
"""
|
7 |
+
Verifies that the recording function creates an audio file with a valid size.
|
8 |
+
"""
|
9 |
+
# Run the recording with a short test duration
|
10 |
+
record_audio(duration=2) # Record for 2 seconds for the test
|
11 |
+
|
12 |
+
# Check that the audio file exists
|
13 |
+
assert os.path.exists(AUDIO_PATH), "El archivo de audio no fue creado."
|
14 |
+
|
15 |
+
# Comprueba que el archivo no esté vacío
|
16 |
+
assert os.path.getsize(AUDIO_PATH) > 0, "El archivo de audio está vacío."
|
17 |
+
|
18 |
+
def test_transcribe_audio():
|
19 |
+
"""
|
20 |
+
Verifica que la función de transcripción devuelve texto.
|
21 |
+
"""
|
22 |
+
# Ejecuta la transcripción del audio grabado
|
23 |
+
transcription = transcribe_audio()
|
24 |
+
|
25 |
+
# Asegura que se obtuvo texto
|
26 |
+
assert isinstance(transcription, str) and len(transcription) > 0, "La transcripción está vacía o no es texto."
|
27 |
+
|
28 |
+
if __name__ == "__main__":
|
29 |
+
pytest.main()
|