Commit 7bec29d · yonnel committed
Parent: cd5a102

Refactor data directory creation in build_index.py to use absolute path and add permission checks
Files changed:
- Dockerfile +2 -2
- app/build_index.py +58 -23
Dockerfile
CHANGED

@@ -19,8 +19,8 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy application code
 COPY app/ ./app/
 
-# Create data directory
-RUN mkdir -p app/data
+# Create data directory with proper permissions
+RUN mkdir -p app/data && chmod 777 app/data
 
 # Make start script executable
 RUN chmod +x app/start.py
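`chmod 777` is a blunt but common fix on Hugging Face Spaces, where the container typically runs as a non-root user that may not own files copied in at build time. The write probe this commit adds to build_index.py can also be expressed as a small standalone helper; the following is a sketch (the `dir_is_writable` name is illustrative, not part of this commit):

```python
import os
import tempfile

def dir_is_writable(path: str) -> bool:
    """Probe write access by creating and deleting a temporary file."""
    try:
        os.makedirs(path, exist_ok=True)
        # The temp file is removed automatically when the context exits
        with tempfile.NamedTemporaryFile(dir=path):
            pass
        return True
    except OSError:
        return False

if __name__ == "__main__":
    print(dir_is_writable("app/data"))  # expect True after `chmod 777 app/data`
```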
app/build_index.py
CHANGED

@@ -293,9 +293,26 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
     tmdb_client = TMDBClient(settings.tmdb_api_key)
     openai_client = OpenAI(api_key=settings.openai_api_key)
 
-    # Create data directory
-    os.makedirs("app/data", exist_ok=True)
+    # Create data directory with absolute path
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(script_dir, "data")
 
+    try:
+        os.makedirs(data_dir, exist_ok=True)
+        # Test write permissions
+        test_file = os.path.join(data_dir, ".write_test")
+        with open(test_file, 'w') as f:
+            f.write("test")
+        os.remove(test_file)
+        logger.info(f"Data directory ready: {data_dir}")
+    except PermissionError as e:
+        logger.error(f"❌ Permission denied when creating data directory: {e}")
+        logger.error("Make sure the data directory has write permissions")
+        return
+    except Exception as e:
+        logger.error(f"❌ Failed to create or write to data directory: {e}")
+        return
+
     # Check for existing movie data checkpoint
     movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)
 
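Anchoring `data_dir` to `__file__` makes the script independent of whatever working directory it is launched from. For reference, the same resolution can be written more tersely with pathlib; this is an equivalent sketch, not what the commit uses:

```python
from pathlib import Path

# Absolute path next to this file, regardless of os.getcwd()
DATA_DIR = Path(__file__).resolve().parent / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)
```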
@@ -440,8 +457,13 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
 
     # Step 4: Save embeddings as numpy array
     embeddings_array = np.array(embeddings, dtype=np.float32)
-    np.save("app/data/movies.npy", embeddings_array)
-    logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
+    embeddings_path = os.path.join(data_dir, "movies.npy")
+    try:
+        np.save(embeddings_path, embeddings_array)
+        logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
+    except Exception as e:
+        logger.error(f"❌ Failed to save embeddings: {e}")
+        return
 
     # Step 5: Build and save FAISS index
     if use_faiss:
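Since every later step depends on `movies.npy`, a cheap sanity check is to re-open the file after the save and confirm the round trip. A minimal sketch (the `verify_embeddings` helper is illustrative; 1536 is the default output dimension of text-embedding-3-small):

```python
import numpy as np

def verify_embeddings(path: str, expected_rows: int, dim: int = 1536) -> None:
    """Memory-map the saved matrix and check dtype and shape."""
    arr = np.load(path, mmap_mode="r")  # mmap avoids loading the whole file
    assert arr.dtype == np.float32, f"unexpected dtype: {arr.dtype}"
    assert arr.shape == (expected_rows, dim), f"unexpected shape: {arr.shape}"
```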
@@ -461,28 +483,41 @@ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_
             index.train(embeddings_array)
 
         index.add(embeddings_array)
-        faiss.write_index(index, "app/data/faiss.index")
-        logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
+        index_path = os.path.join(data_dir, "faiss.index")
+        try:
+            faiss.write_index(index, index_path)
+            logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
+        except Exception as e:
+            logger.error(f"❌ Failed to save FAISS index: {e}")
+            return
 
     # Step 6: Save metadata files
-    with open("app/data/id_map.json", "w") as f:
-        json.dump(id_map, f)
-
-    with open("app/data/movie_metadata.json", "w") as f:
-        json.dump(movie_metadata, f)
-
-    logger.info("✅ Index built successfully!")
-    logger.info(f" - {len(embeddings)} movies indexed")
-    logger.info(f" - Embedding model: {model}")
-    logger.info(f" - Files saved in app/data/")
-    logger.info(f" * movies.npy: embeddings matrix")
-    logger.info(f" * id_map.json: TMDB ID to matrix position mapping")
-    logger.info(f" * movie_metadata.json: movie metadata")
-    if use_faiss:
-        logger.info(f" * faiss.index: FAISS search index")
+    id_map_path = os.path.join(data_dir, "id_map.json")
+    metadata_path = os.path.join(data_dir, "movie_metadata.json")
 
-    # Cleanup checkpoints
-    cleanup_checkpoints()
+    try:
+        with open(id_map_path, "w") as f:
+            json.dump(id_map, f)
+
+        with open(metadata_path, "w") as f:
+            json.dump(movie_metadata, f)
+
+        logger.info("✅ Index built successfully!")
+        logger.info(f" - {len(embeddings)} movies indexed")
+        logger.info(f" - Embedding model: {model}")
+        logger.info(f" - Files saved in {data_dir}")
+        logger.info(f" * movies.npy: embeddings matrix")
+        logger.info(f" * id_map.json: TMDB ID to matrix position mapping")
+        logger.info(f" * movie_metadata.json: movie metadata")
+        if use_faiss:
+            logger.info(f" * faiss.index: FAISS search index")
+
+        # Cleanup checkpoints
+        cleanup_checkpoints()
+
+    except Exception as e:
+        logger.error(f"❌ Failed to save metadata files: {e}")
+        return
 
 # Remove the old functions that are no longer needed
 # create_movie_embedding and load_movie_data are replaced by the new implementation
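For context, a consumer of these artifacts reverses the process at query time: read the FAISS index back, invert the ID map, and look up metadata. The following is a sketch under the assumption that query vectors come from the same embedding model; the `search_movies` name and signature are illustrative, not part of this commit:

```python
import json
import os

import faiss
import numpy as np

def search_movies(query_vec: np.ndarray, data_dir: str, k: int = 5) -> list:
    """Return the TMDB IDs of the k nearest movies to a query embedding."""
    index = faiss.read_index(os.path.join(data_dir, "faiss.index"))
    with open(os.path.join(data_dir, "id_map.json")) as f:
        id_map = json.load(f)  # TMDB ID -> matrix position (per the commit's log line)
    # Invert the map so FAISS row positions resolve back to TMDB IDs;
    # note that JSON round-trips the TMDB IDs as strings
    pos_to_id = {pos: tmdb_id for tmdb_id, pos in id_map.items()}
    query = query_vec.reshape(1, -1).astype(np.float32)
    _, positions = index.search(query, k)
    return [pos_to_id[int(p)] for p in positions[0]]
```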