#!/bin/sh
# Check required environment variables
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "HF_TOKEN or DATASET_ID is not set; backup is disabled"
    exit 1
fi
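# Example configuration (assumed values; set these in the environment before launch):
#   export HF_TOKEN=hf_xxxxxxxx                # token with write access to the dataset
#   export DATASET_ID=your-username/backup     # dataset repo that will hold the backups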
# Activate the virtual environment
. "$HOME/venv/bin/activate"
# Generate the sync helper script (the quoted 'EOL' heredoc prevents shell expansion inside)
cat > hf_sync.py << 'EOL'
# HuggingFace sync helper
from huggingface_hub import HfApi
import sys
import os
import tarfile
import tempfile
# Keep the number of backups bounded; delete the oldest ones once the maximum is exceeded
def manage_backups(api, repo_id, max_files=50):
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
    backup_files.sort()  # timestamped names sort chronologically
    if len(backup_files) >= max_files:
        files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
        for file_to_delete in files_to_delete:
            try:
                api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type="dataset")
                print(f'Deleted old backup: {file_to_delete}')
            except Exception as e:
                print(f'Error deleting {file_to_delete}: {str(e)}')
# Upload a backup file to HuggingFace, then prune old backups
def upload_backup(file_path, file_name, token, repo_id):
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset"
        )
        print(f"Uploaded {file_name} successfully")
        manage_backups(api, repo_id)
    except Exception as e:
        print(f"Error uploading file: {str(e)}")
# Download and extract the latest backup
def download_latest_backup(token, repo_id, extract_path):
    try:
        api = HfApi(token=token)
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
        if not backup_files:
            print("No backup files found")
            return
        latest_backup = sorted(backup_files)[-1]
        with tempfile.TemporaryDirectory() as temp_dir:
            filepath = api.hf_hub_download(
                repo_id=repo_id,
                filename=latest_backup,
                repo_type="dataset",
                local_dir=temp_dir
            )
            if filepath and os.path.exists(filepath):
                with tarfile.open(filepath, 'r:gz') as tar:
                    tar.extractall(extract_path)
                print(f"Successfully restored backup: {latest_backup}")
    except Exception as e:
        print(f"Error downloading backup: {str(e)}")
# Squash the dataset's commit history into a single commit
def super_squash_history(token, repo_id):
    try:
        api = HfApi(token=token)
        api.super_squash_history(repo_id=repo_id, repo_type="dataset")
        print("History squash completed.")
    except Exception as e:
        print(f"Error squashing history: {str(e)}")
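# Command-line interface of this helper (as implemented by the dispatch below):
#   python hf_sync.py upload       <token> <repo_id> <file_path> <file_name>
#   python hf_sync.py download     <token> <repo_id> [extract_path]
#   python hf_sync.py super_squash <token> <repo_id>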
# Entry point: dispatch on the action passed as the first argument
if __name__ == "__main__":
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    if action == "upload":
        file_path = sys.argv[4]
        file_name = sys.argv[5]
        upload_backup(file_path, file_name, token, repo_id)
    elif action == "download":
        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'
        download_latest_backup(token, repo_id, extract_path)
    elif action == "super_squash":
        super_squash_history(token, repo_id)
EOL
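# The helper needs the huggingface_hub package inside the virtual environment.
# If the base image does not already ship it (an assumption), install it first:
#   pip install -q huggingface_hub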
# On first start, restore the latest backup from HuggingFace (extract into the app directory)
echo "Downloading the latest backup from HuggingFace..."
python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "$HOME/app"
# Periodic sync loop
sync_data() {
    while true; do
        echo "Sync run started at $(date)"
        # Make sure the data directory exists (adjust the path to your setup)
        STORAGE_PATH="$HOME/app/data"
        if [ -d "${STORAGE_PATH}" ]; then
            # Create a backup archive
            timestamp=$(date +%Y%m%d_%H%M%S)
            backup_file="backup_${timestamp}.tar.gz"
            # Compress the directory (-C keeps the parent path out of the archive)
            tar -czf "/tmp/${backup_file}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"
            # Upload to HuggingFace
            echo "Uploading backup to HuggingFace..."
            python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "/tmp/${backup_file}" "${backup_file}"
            # Squash commit history, at most once every 7 days
            SQUASH_FLAG_FILE="/tmp/last_squash_time"
            NOW=$(date +%s)
            SEVEN_DAYS=$((7*24*60*60))
            if [ ! -f "$SQUASH_FLAG_FILE" ]; then
                echo "$NOW" > "$SQUASH_FLAG_FILE"
                echo "First history squash..."
                python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
            else
                LAST=$(cat "$SQUASH_FLAG_FILE")
                DIFF=$((NOW - LAST))
                if [ "$DIFF" -ge "$SEVEN_DAYS" ]; then
                    echo "$NOW" > "$SQUASH_FLAG_FILE"
                    echo "More than 7 days since the last squash, squashing commit history..."
                    python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
                else
                    echo "Less than 7 days since the last squash, skipping this run."
                fi
            fi
            # Clean up the temporary archive
            rm -f "/tmp/${backup_file}"
        else
            echo "Storage directory ${STORAGE_PATH} does not exist, waiting..."
        fi
        # Sync interval in seconds (default 7200 = 2 hours; override via SYNC_INTERVAL)
        SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
        echo "Next sync in ${SYNC_INTERVAL} seconds..."
        sleep "$SYNC_INTERVAL"
    done
}
# Start the sync loop in the background
sync_data &