Segizu committed
Commit 9c5866b · 1 Parent(s): 51fa2e3

performance

Files changed (1): app.py (+80 -14)
app.py CHANGED
@@ -12,6 +12,21 @@ import gc
 import requests
 import time
 import shutil
+import tarfile
+
+
+# 🔁 Clean up temporary storage if it exists
+def clean_temp_dirs():
+    print("🧹 Cleaning temporary folders...")
+
+    for folder in ["embeddings", "batches"]:
+        path = Path(folder)
+        if path.exists() and path.is_dir():
+            shutil.rmtree(path)
+            print(f"✅ Folder removed: {folder}")
+        path.mkdir(exist_ok=True)
+
+clean_temp_dirs()
 
 # 📁 Parameters
 DATASET_ID = "Segizu/facial-recognition"
@@ -76,10 +91,16 @@ def preprocess_image(img: Image.Image) -> np.ndarray:
     img_resized = img_rgb.resize((160, 160), Image.Resampling.LANCZOS)
     return np.array(img_resized)
 
-# 📦 Generate and upload embeddings
+
 def build_database():
+    print(f"📊 Current temporary storage usage (START): {get_folder_size('.'):.2f} GB")
     print("🔄 Generating embeddings...")
     batch_size = 10
+    archive_batch_size = 50
+    batch_files = []
+    batch_index = 0
+    ARCHIVE_DIR = Path("batches")
+    ARCHIVE_DIR.mkdir(exist_ok=True)
 
     for i in range(0, len(dataset), batch_size):
         batch = dataset[i:i + batch_size]
@@ -96,15 +117,15 @@ def build_database():
             name = f"image_{i + j}"
             filename = LOCAL_EMB_DIR / f"{name}.pkl"
 
-            # Check whether it already exists on HF
+            # Check whether it already exists on the Hugging Face Hub
            try:
                 hf_hub_download(
                     repo_id=DATASET_ID,
                     repo_type="dataset",
-                    filename=f"{EMBEDDINGS_SUBFOLDER}/{name}.pkl",
+                    filename=f"{EMBEDDINGS_SUBFOLDER}/batch_{batch_index:03}.tar.gz",
                     token=HF_TOKEN
                 )
-                print(f"⏩ Already exists remotely: {name}.pkl")
+                print(f"⏩ Already exists on the remote: {name}.pkl")
                 continue
             except:
                 pass
@@ -121,26 +142,71 @@ def build_database():
                 enforce_detection=False
             )[0]["embedding"]
 
-            # Save temporarily
             with open(filename, "wb") as f:
                 pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
 
-            embeddings_to_upload.append(filename)
-
-            # If limits are exceeded, upload the batch
-            if get_folder_size(LOCAL_EMB_DIR) >= MAX_TEMP_STORAGE_GB or len(embeddings_to_upload) >= UPLOAD_EVERY:
-                flush_embeddings()
-
+            batch_files.append(filename)
             del img_processed
             gc.collect()
 
+            # If we reach the per-batch archive size, or storage is critical
+            if len(batch_files) >= archive_batch_size or get_folder_size(".") > 40:
+                archive_path = ARCHIVE_DIR / f"batch_{batch_index:03}.tar.gz"
+                with tarfile.open(archive_path, "w:gz") as tar:
+                    for file in batch_files:
+                        tar.add(file, arcname=file.name)
+
+                print(f"📦 Packed: {archive_path}")
+
+                # Upload to the Hub
+                upload_file(
+                    path_or_fileobj=str(archive_path),
+                    path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{archive_path.name}",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+                print(f"✅ Uploaded: {archive_path.name}")
+
+                # Delete the local .pkl files and the local .tar.gz
+                for f in batch_files:
+                    f.unlink()
+                archive_path.unlink()
+
+                print("🧹 Cleanup finished after upload")
+
+                batch_files = []
+                batch_index += 1
+                time.sleep(2)  # Pause to avoid 429 rate limiting
+            print(f"📊 Current temporary storage usage (END): {get_folder_size('.'):.2f} GB")
+
+
         except Exception as e:
             print(f"❌ Error on {name}: {e}")
             continue
 
-    # Upload whatever is left
-    if embeddings_to_upload:
-        flush_embeddings()
+    # Final batch, if anything remains
+    if batch_files:
+        archive_path = ARCHIVE_DIR / f"batch_{batch_index:03}.tar.gz"
+        with tarfile.open(archive_path, "w:gz") as tar:
+            for file in batch_files:
+                tar.add(file, arcname=file.name)
+
+        print(f"📦 Final pack: {archive_path}")
+
+        upload_file(
+            path_or_fileobj=str(archive_path),
+            path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{archive_path.name}",
+            repo_id=DATASET_ID,
+            repo_type="dataset",
+            token=HF_TOKEN
+        )
+
+        for f in batch_files:
+            f.unlink()
+        archive_path.unlink()
+        print("✅ Final upload and cleanup done")
+
 
 # 🔍 Search for similarities from remote files
 def find_similar_faces(uploaded_image: Image.Image):
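
Note on the storage guard: get_folder_size is called throughout the new batching logic but is defined earlier in app.py, outside this diff. A minimal sketch of a compatible helper, assuming it walks the folder recursively and returns gigabytes (consistent with the "{...:.2f} GB" formatting above):

from pathlib import Path

def get_folder_size(folder: str) -> float:
    # Sum the sizes of all regular files under `folder`, recursively, in GB.
    total_bytes = sum(p.stat().st_size for p in Path(folder).rglob("*") if p.is_file())
    return total_bytes / (1024 ** 3)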
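
Note on the read path: find_similar_faces must now consume embeddings stored on the Hub as batch_NNN.tar.gz archives rather than individual .pkl files. Below is a sketch of how one archive could be fetched and unpacked; the helper name load_remote_batch and the EMBEDDINGS_SUBFOLDER value are illustrative assumptions, not part of this commit:

import pickle
import tarfile
from huggingface_hub import hf_hub_download

EMBEDDINGS_SUBFOLDER = "embeddings"  # assumed; the real constant is defined earlier in app.py

def load_remote_batch(batch_index: int, token: str) -> list:
    # Download one packed archive from the dataset repo (cached locally by huggingface_hub).
    archive_path = hf_hub_download(
        repo_id="Segizu/facial-recognition",
        repo_type="dataset",
        filename=f"{EMBEDDINGS_SUBFOLDER}/batch_{batch_index:03}.tar.gz",
        token=token,
    )
    records = []
    with tarfile.open(archive_path, "r:gz") as tar:
        for member in tar.getmembers():
            fileobj = tar.extractfile(member)
            if fileobj is not None:
                # Each member is a pickle with keys "name", "img", and "embedding".
                records.append(pickle.load(fileobj))
    return records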
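
Note on rate limiting: the fixed time.sleep(2) after each upload is a blunt guard against HTTP 429 responses. A hedged alternative (not part of this commit) is to wrap upload_file in an exponential-backoff retry:

import time
from huggingface_hub import upload_file
from huggingface_hub.utils import HfHubHTTPError

def upload_with_backoff(path: str, path_in_repo: str, repo_id: str, token: str, retries: int = 5):
    # Retry the upload, doubling the pause after each 429 (rate-limited) response.
    for attempt in range(retries):
        try:
            return upload_file(
                path_or_fileobj=path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
        except HfHubHTTPError as err:
            status = err.response.status_code if err.response is not None else None
            if status == 429 and attempt < retries - 1:
                time.sleep(2 ** attempt)
            else:
                raise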