Spaces:
Running
A newer version of the Gradio SDK is available:
5.42.0
Persistent Storage Setup for Hugging Face Spaces
This guide explains how to set up and use persistent storage in Hugging Face Spaces for your LMM-Vibes application.
Overview
Hugging Face Spaces provides persistent storage at the /data
directory that persists across app restarts and deployments. This storage is perfect for:
- Caching models and datasets
- Storing user uploads and results
- Maintaining application state
- Saving experiment results
Quick Start
1. Automatic Setup (Already Implemented)
Your application automatically detects and configures persistent storage when running in Hugging Face Spaces:
# This is already handled in app.py
if is_persistent_storage_available():
# Configure HF cache to persistent storage
hf_home = get_hf_home_dir()
os.environ.setdefault("HF_HOME", str(hf_home))
# Set cache directories
cache_dir = get_cache_dir()
os.environ.setdefault("TRANSFORMERS_CACHE", str(cache_dir / "transformers"))
os.environ.setdefault("HF_DATASETS_CACHE", str(cache_dir / "datasets"))
2. Storage Structure
When persistent storage is available, your data is organized as follows:
/data/
├── app_data/          # Main application data
│   ├── experiments/   # Pipeline results and experiments
│   ├── dataframes/    # Saved pandas DataFrames
│   ├── logs/          # Application logs
│   └── uploads/       # User uploaded files
├── .cache/            # Application cache
│   ├── transformers/  # Hugging Face Transformers cache
│   └── datasets/      # Hugging Face Datasets cache
└── .huggingface/      # Hugging Face model cache
Usage Examples
Saving Data
from lmmvibes.utils.persistent_storage import (
save_data_to_persistent,
save_uploaded_file
)
# Save binary data
data_bytes = b"your binary data"
saved_path = save_data_to_persistent(
data=data_bytes,
filename="my_data.bin",
subdirectory="experiments"
)
# Save uploaded file from Gradio
def handle_upload(uploaded_file):
if uploaded_file:
saved_path = save_uploaded_file(uploaded_file, "user_upload.zip")
return f"Saved to: {saved_path}"
Loading Data
from lmmvibes.utils.persistent_storage import load_data_from_persistent
# Load binary data
data_bytes = load_data_from_persistent("my_data.bin", "experiments")
if data_bytes:
# Process the data
data = data_bytes.decode('utf-8')
Listing Files
from lmmvibes.utils.persistent_storage import list_persistent_files
# List all files
all_files = list_persistent_files()
# List specific types of files
json_files = list_persistent_files(subdirectory="experiments", pattern="*.json")
parquet_files = list_persistent_files(subdirectory="dataframes", pattern="*.parquet")
Checking Storage Status
from lmmvibes.utils.persistent_storage import get_storage_info
info = get_storage_info()
print(f"Persistent storage available: {info['persistent_available']}")
print(f"Data directory: {info['data_dir']}")
print(f"Free space: {info['storage_paths']['free_gb']:.1f}GB")
Integration with Your Application
1. Data Loading
Your application already uses persistent storage for loading pipeline results:
# In data_loader.py - automatically uses persistent storage when available
def load_pipeline_results(results_dir: str):
# The function automatically checks for data in persistent storage
# Falls back to local storage if persistent storage is not available
pass
2. Caching
The application automatically caches data in persistent storage:
# In data_loader.py - DataCache uses persistent storage when available
class DataCache:
@classmethod
def get(cls, key: str):
# Check persistent storage first, then memory cache
return cls._cache.get(key)
3. User Uploads
For handling user uploads in Gradio:
import gradio as gr
from lmmvibes.utils.persistent_storage import save_uploaded_file
def handle_file_upload(file):
if file:
saved_path = save_uploaded_file(file, "user_upload.zip")
if saved_path:
return f"✅ File saved to persistent storage: {saved_path.name}"
else:
return "❌ Failed to save - persistent storage not available"
return "⚠️ No file uploaded"
# In your Gradio interface
with gr.Blocks() as demo:
file_input = gr.File(label="Upload data")
upload_btn = gr.Button("Save to persistent storage")
result = gr.Textbox(label="Status")
upload_btn.click(handle_file_upload, inputs=[file_input], outputs=[result])
Best Practices
1. Check Availability
Always check if persistent storage is available before trying to use it:
from lmmvibes.utils.persistent_storage import is_persistent_storage_available
if is_persistent_storage_available():
# Use persistent storage
save_data_to_persistent(data, "important_data.json")
else:
# Fall back to local storage or in-memory
print("Persistent storage not available")
2. Organize Data
Use subdirectories to organize your data:
# Save experiments in their own directory
save_data_to_persistent(
data=experiment_data,
filename=f"{experiment_name}_results.json",
subdirectory="experiments"
)
# Save dataframes separately
save_data_to_persistent(
data=dataframe_bytes,
filename=f"{dataset_name}_data.parquet",
subdirectory="dataframes"
)
3. Handle Errors Gracefully
def safe_save_data(data, filename):
try:
saved_path = save_data_to_persistent(data, filename)
if saved_path:
return f"✅ Saved to {saved_path}"
else:
return "❌ Failed to save - storage not available"
except Exception as e:
return f"❌ Error saving data: {e}"
4. Clean Up Old Data
Periodically clean up old files to manage storage space:
from lmmvibes.utils.persistent_storage import list_persistent_files, delete_persistent_file
def cleanup_old_files(days_old=30):
"""Delete files older than specified days."""
import time
cutoff_time = time.time() - (days_old * 24 * 60 * 60)
for file in list_persistent_files():
if file.stat().st_mtime < cutoff_time:
delete_persistent_file(file.name)
Troubleshooting
1. Storage Not Available
If persistent storage is not working:
from lmmvibes.utils.persistent_storage import get_storage_info
info = get_storage_info()
print(f"Storage available: {info['persistent_available']}")
print(f"Data directory: {info['data_dir']}")
2. Permission Issues
If you encounter permission issues:
# The utilities automatically create directories with proper permissions
# If issues persist, check if /data exists and is writable
import os
if os.path.isdir("/data") and os.access("/data", os.W_OK):
print("✅ Persistent storage is accessible and writable")
else:
print("❌ Persistent storage not accessible")
3. Storage Full
Monitor storage usage:
info = get_storage_info()
if info['storage_paths']:
usage_pct = (info['storage_paths']['used_gb'] / info['storage_paths']['total_gb']) * 100
if usage_pct > 90:
print(f"⚠️ Storage nearly full: {usage_pct:.1f}% used")
# Implement cleanup logic
Migration from Local Storage
If you're migrating from local storage to persistent storage:
1. Backup existing data: Copy your local `data/` directory to persistent storage
2. Update paths: Use the persistent storage utilities instead of hardcoded paths
3. Test thoroughly: Ensure all functionality works with persistent storage
4. Monitor usage: Keep track of storage usage and implement cleanup
Example: Complete Integration
Here's a complete example of integrating persistent storage into your application:
import gradio as gr
import json
import pandas as pd
from lmmvibes.utils.persistent_storage import (
save_data_to_persistent,
load_data_from_persistent,
list_persistent_files,
get_storage_info,
is_persistent_storage_available
)
def save_experiment_results(results_data, experiment_name):
"""Save experiment results to persistent storage."""
if not is_persistent_storage_available():
return "❌ Persistent storage not available"
try:
results_json = json.dumps(results_data, indent=2)
results_bytes = results_json.encode('utf-8')
filename = f"{experiment_name}_results.json"
saved_path = save_data_to_persistent(
data=results_bytes,
filename=filename,
subdirectory="experiments"
)
if saved_path:
return f"✅ Saved experiment to: {saved_path.name}"
else:
return "❌ Failed to save experiment"
except Exception as e:
return f"❌ Error: {e}"
def load_experiment_results(experiment_name):
"""Load experiment results from persistent storage."""
filename = f"{experiment_name}_results.json"
results_bytes = load_data_from_persistent(
filename=filename,
subdirectory="experiments"
)
if results_bytes:
results_data = json.loads(results_bytes.decode('utf-8'))
return json.dumps(results_data, indent=2)
else:
return "No results found"
def get_available_experiments():
"""List all available experiments."""
experiment_files = list_persistent_files(subdirectory="experiments", pattern="*_results.json")
if experiment_files:
return "\n".join([f.name for f in experiment_files])
else:
return "No experiments found"
# Gradio interface
with gr.Blocks(title="Persistent Storage Demo") as demo:
gr.Markdown("# Persistent Storage Demo")
with gr.Tab("Save Experiment"):
experiment_name = gr.Textbox(label="Experiment Name")
results_json = gr.Textbox(label="Results (JSON)", lines=5)
save_btn = gr.Button("Save Experiment")
save_result = gr.Textbox(label="Save Result")
save_btn.click(
save_experiment_results,
inputs=[results_json, experiment_name],
outputs=[save_result]
)
with gr.Tab("Load Experiment"):
load_experiment_name = gr.Textbox(label="Experiment Name")
load_btn = gr.Button("Load Experiment")
load_result = gr.Textbox(label="Loaded Results", lines=10)
load_btn.click(
load_experiment_results,
inputs=[load_experiment_name],
outputs=[load_result]
)
with gr.Tab("Storage Info"):
info_btn = gr.Button("Get Storage Info")
storage_info = gr.Textbox(label="Storage Information", lines=10)
def get_info():
info = get_storage_info()
return json.dumps(info, indent=2)
info_btn.click(get_info, outputs=[storage_info])
if __name__ == "__main__":
demo.launch()
This comprehensive setup ensures your application can take full advantage of Hugging Face Spaces' persistent storage capabilities while maintaining backward compatibility with local development.