yonnel committed on
Commit
7bec29d
·
1 Parent(s): cd5a102

Refactor data directory creation in build_index.py to use absolute path and add permission checks

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -2
  2. app/build_index.py +58 -23
Dockerfile CHANGED
@@ -19,8 +19,8 @@ RUN pip install --no-cache-dir -r requirements.txt
19
  # Copy application code
20
  COPY app/ ./app/
21
 
22
- # Create data directory
23
- RUN mkdir -p app/data
24
 
25
  # Make start script executable
26
  RUN chmod +x app/start.py
 
19
  # Copy application code
20
  COPY app/ ./app/
21
 
22
+ # Create data directory with proper permissions
23
+ RUN mkdir -p app/data && chmod 777 app/data
24
 
25
  # Make start script executable
26
  RUN chmod +x app/start.py
app/build_index.py CHANGED
@@ -293,9 +293,26 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
293
  tmdb_client = TMDBClient(settings.tmdb_api_key)
294
  openai_client = OpenAI(api_key=settings.openai_api_key)
295
 
296
- # Create data directory
297
- os.makedirs("app/data", exist_ok=True)
 
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  # Check for existing movie data checkpoint
300
  movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)
301
 
@@ -440,8 +457,13 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
440
 
441
  # Step 4: Save embeddings as numpy array
442
  embeddings_array = np.array(embeddings, dtype=np.float32)
443
- np.save("app/data/movies.npy", embeddings_array)
444
- logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
 
 
 
 
 
445
 
446
  # Step 5: Build and save FAISS index
447
  if use_faiss:
@@ -461,28 +483,41 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
461
  index.train(embeddings_array)
462
 
463
  index.add(embeddings_array)
464
- faiss.write_index(index, "app/data/faiss.index")
465
- logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
 
 
 
 
 
466
 
467
  # Step 6: Save metadata files
468
- with open("app/data/id_map.json", "w") as f:
469
- json.dump(id_map, f)
470
-
471
- with open("app/data/movie_metadata.json", "w") as f:
472
- json.dump(movie_metadata, f)
473
-
474
- logger.info("✅ Index built successfully!")
475
- logger.info(f" - {len(embeddings)} movies indexed")
476
- logger.info(f" - Embedding model: {model}")
477
- logger.info(f" - Files saved in app/data/")
478
- logger.info(f" * movies.npy: embeddings matrix")
479
- logger.info(f" * id_map.json: TMDB ID to matrix position mapping")
480
- logger.info(f" * movie_metadata.json: movie metadata")
481
- if use_faiss:
482
- logger.info(f" * faiss.index: FAISS search index")
483
 
484
- # Cleanup checkpoints
485
- cleanup_checkpoints()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  # Remove the old functions that are no longer needed
488
  # create_movie_embedding and load_movie_data are replaced by the new implementation
 
293
  tmdb_client = TMDBClient(settings.tmdb_api_key)
294
  openai_client = OpenAI(api_key=settings.openai_api_key)
295
 
296
+ # Create data directory with absolute path
297
+ script_dir = os.path.dirname(os.path.abspath(__file__))
298
+ data_dir = os.path.join(script_dir, "data")
299
 
300
+ try:
301
+ os.makedirs(data_dir, exist_ok=True)
302
+ # Test write permissions
303
+ test_file = os.path.join(data_dir, ".write_test")
304
+ with open(test_file, 'w') as f:
305
+ f.write("test")
306
+ os.remove(test_file)
307
+ logger.info(f"Data directory ready: {data_dir}")
308
+ except PermissionError as e:
309
+ logger.error(f"❌ Permission denied when creating data directory: {e}")
310
+ logger.error("Make sure the data directory has write permissions")
311
+ return
312
+ except Exception as e:
313
+ logger.error(f"❌ Failed to create or write to data directory: {e}")
314
+ return
315
+
316
  # Check for existing movie data checkpoint
317
  movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)
318
 
 
457
 
458
  # Step 4: Save embeddings as numpy array
459
  embeddings_array = np.array(embeddings, dtype=np.float32)
460
+ embeddings_path = os.path.join(data_dir, "movies.npy")
461
+ try:
462
+ np.save(embeddings_path, embeddings_array)
463
+ logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
464
+ except Exception as e:
465
+ logger.error(f"❌ Failed to save embeddings: {e}")
466
+ return
467
 
468
  # Step 5: Build and save FAISS index
469
  if use_faiss:
 
483
  index.train(embeddings_array)
484
 
485
  index.add(embeddings_array)
486
+ index_path = os.path.join(data_dir, "faiss.index")
487
+ try:
488
+ faiss.write_index(index, index_path)
489
+ logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
490
+ except Exception as e:
491
+ logger.error(f"❌ Failed to save FAISS index: {e}")
492
+ return
493
 
494
  # Step 6: Save metadata files
495
+ id_map_path = os.path.join(data_dir, "id_map.json")
496
+ metadata_path = os.path.join(data_dir, "movie_metadata.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
497
 
498
+ try:
499
+ with open(id_map_path, "w") as f:
500
+ json.dump(id_map, f)
501
+
502
+ with open(metadata_path, "w") as f:
503
+ json.dump(movie_metadata, f)
504
+
505
+ logger.info("✅ Index built successfully!")
506
+ logger.info(f" - {len(embeddings)} movies indexed")
507
+ logger.info(f" - Embedding model: {model}")
508
+ logger.info(f" - Files saved in {data_dir}")
509
+ logger.info(f" * movies.npy: embeddings matrix")
510
+ logger.info(f" * id_map.json: TMDB ID to matrix position mapping")
511
+ logger.info(f" * movie_metadata.json: movie metadata")
512
+ if use_faiss:
513
+ logger.info(f" * faiss.index: FAISS search index")
514
+
515
+ # Cleanup checkpoints
516
+ cleanup_checkpoints()
517
+
518
+ except Exception as e:
519
+ logger.error(f"❌ Failed to save metadata files: {e}")
520
+ return
521
 
522
  # Remove the old functions that are no longer needed
523
  # create_movie_embedding and load_movie_data are replaced by the new implementation