Segizu committed
Commit 9c5866b · 1 Parent(s): 51fa2e3

performance

Files changed (1): app.py (+80 -14)
app.py CHANGED
@@ -12,6 +12,21 @@ import gc
 import requests
 import time
 import shutil
+import tarfile
+
+
+# 🔁 Clean up temporary storage if it exists
+def clean_temp_dirs():
+    print("🧹 Cleaning temporary folders...")
+
+    for folder in ["embeddings", "batches"]:
+        path = Path(folder)
+        if path.exists() and path.is_dir():
+            shutil.rmtree(path)
+            print(f"✅ Folder removed: {folder}")
+        path.mkdir(exist_ok=True)
+
+clean_temp_dirs()
 
 # 📁 Parameters
 DATASET_ID = "Segizu/facial-recognition"
@@ -76,10 +91,16 @@ def preprocess_image(img: Image.Image) -> np.ndarray:
     img_resized = img_rgb.resize((160, 160), Image.Resampling.LANCZOS)
     return np.array(img_resized)
 
-# 📦 Generate and upload embeddings
+
 def build_database():
+    print(f"📊 Current temporary storage usage (START): {get_folder_size('.'):.2f} GB")
     print("🔄 Generating embeddings...")
     batch_size = 10
+    archive_batch_size = 50
+    batch_files = []
+    batch_index = 0
+    ARCHIVE_DIR = Path("batches")
+    ARCHIVE_DIR.mkdir(exist_ok=True)
 
     for i in range(0, len(dataset), batch_size):
         batch = dataset[i:i + batch_size]
@@ -96,15 +117,15 @@ def build_database():
             name = f"image_{i + j}"
             filename = LOCAL_EMB_DIR / f"{name}.pkl"
 
-            # Check whether it already exists on HF
+            # Check whether it already exists on the Hugging Face Hub
            try:
                 hf_hub_download(
                     repo_id=DATASET_ID,
                     repo_type="dataset",
-                    filename=f"{EMBEDDINGS_SUBFOLDER}/{name}.pkl",
+                    filename=f"{EMBEDDINGS_SUBFOLDER}/batch_{batch_index:03}.tar.gz",
                     token=HF_TOKEN
                 )
-                print(f"⏩ Already exists remotely: {name}.pkl")
+                print(f"⏩ Already exists on the remote: {name}.pkl")
                 continue
             except:
                 pass
@@ -121,26 +142,71 @@ def build_database():
                 enforce_detection=False
             )[0]["embedding"]
 
-            # Save temporarily
             with open(filename, "wb") as f:
                 pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
 
-            embeddings_to_upload.append(filename)
-
-            # If limits are exceeded, upload the batch
-            if get_folder_size(LOCAL_EMB_DIR) >= MAX_TEMP_STORAGE_GB or len(embeddings_to_upload) >= UPLOAD_EVERY:
-                flush_embeddings()
-
+            batch_files.append(filename)
             del img_processed
             gc.collect()
 
+            # If we reach the per-batch archive size, or storage is critical
+            if len(batch_files) >= archive_batch_size or get_folder_size(".") > 40:
+                archive_path = ARCHIVE_DIR / f"batch_{batch_index:03}.tar.gz"
+                with tarfile.open(archive_path, "w:gz") as tar:
+                    for file in batch_files:
+                        tar.add(file, arcname=file.name)
+
+                print(f"📦 Packed: {archive_path}")
+
+                # Upload to the Hub
+                upload_file(
+                    path_or_fileobj=str(archive_path),
+                    path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{archive_path.name}",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+                print(f"✅ Uploaded: {archive_path.name}")
+
+                # Delete the local .pkl files and the local .tar.gz
+                for f in batch_files:
+                    f.unlink()
+                archive_path.unlink()
+
+                print("🧹 Cleanup finished after upload")
+
+                batch_files = []
+                batch_index += 1
+                time.sleep(2)  # Pause to avoid 429 rate limiting
+            print(f"📊 Current temporary storage usage (END): {get_folder_size('.'):.2f} GB")
+
+
         except Exception as e:
             print(f"❌ Error on {name}: {e}")
             continue
 
-    # Upload whatever is left
-    if embeddings_to_upload:
-        flush_embeddings()
+    # Final batch, if anything remains
+    if batch_files:
+        archive_path = ARCHIVE_DIR / f"batch_{batch_index:03}.tar.gz"
+        with tarfile.open(archive_path, "w:gz") as tar:
+            for file in batch_files:
+                tar.add(file, arcname=file.name)
+
+        print(f"📦 Final pack: {archive_path}")
+
+        upload_file(
+            path_or_fileobj=str(archive_path),
+            path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{archive_path.name}",
+            repo_id=DATASET_ID,
+            repo_type="dataset",
+            token=HF_TOKEN
+        )
+
+        for f in batch_files:
+            f.unlink()
+        archive_path.unlink()
+        print("✅ Final upload and cleanup done")
+
 
 # 🔍 Search for similarities from remote files
 def find_similar_faces(uploaded_image: Image.Image):
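
Note on the storage guard: get_folder_size is called throughout the new batching logic but is defined earlier in app.py, outside this diff. A minimal sketch of a compatible helper, assuming it walks the folder recursively and returns gigabytes (consistent with the "{...:.2f} GB" formatting above):

from pathlib import Path

def get_folder_size(folder: str) -> float:
    # Sum the sizes of all regular files under `folder`, recursively, in GB.
    total_bytes = sum(p.stat().st_size for p in Path(folder).rglob("*") if p.is_file())
    return total_bytes / (1024 ** 3)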
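
Note on the read path: find_similar_faces must now consume embeddings stored on the Hub as batch_NNN.tar.gz archives rather than individual .pkl files. Below is a sketch of how one archive could be fetched and unpacked; the helper name load_remote_batch and the EMBEDDINGS_SUBFOLDER value are illustrative assumptions, not part of this commit:

import pickle
import tarfile
from huggingface_hub import hf_hub_download

EMBEDDINGS_SUBFOLDER = "embeddings"  # assumed; the real constant is defined earlier in app.py

def load_remote_batch(batch_index: int, token: str) -> list:
    # Download one packed archive from the dataset repo (cached locally by huggingface_hub).
    archive_path = hf_hub_download(
        repo_id="Segizu/facial-recognition",
        repo_type="dataset",
        filename=f"{EMBEDDINGS_SUBFOLDER}/batch_{batch_index:03}.tar.gz",
        token=token,
    )
    records = []
    with tarfile.open(archive_path, "r:gz") as tar:
        for member in tar.getmembers():
            fileobj = tar.extractfile(member)
            if fileobj is not None:
                # Each member is a pickle with keys "name", "img", and "embedding".
                records.append(pickle.load(fileobj))
    return records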
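
Note on rate limiting: the fixed time.sleep(2) after each upload is a blunt guard against HTTP 429 responses. A hedged alternative (not part of this commit) is to wrap upload_file in an exponential-backoff retry:

import time
from huggingface_hub import upload_file
from huggingface_hub.utils import HfHubHTTPError

def upload_with_backoff(path: str, path_in_repo: str, repo_id: str, token: str, retries: int = 5):
    # Retry the upload, doubling the pause after each 429 (rate-limited) response.
    for attempt in range(retries):
        try:
            return upload_file(
                path_or_fileobj=path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
        except HfHubHTTPError as err:
            status = err.response.status_code if err.response is not None else None
            if status == 429 and attempt < retries - 1:
                time.sleep(2 ** attempt)
            else:
                raise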