#!/bin/sh
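# Entrypoint: start the OpenList server and periodically back up the data
# directory to a Hugging Face dataset repository.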

# Set default values
: ${APP_HOME:=/OpenList}
: ${STORAGE_PATH:=$APP_HOME/data}
: ${SYNC_INTERVAL:=7200}
: ${MAX_BACKUPS:=50}
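
# Example configuration (hypothetical values):
#   export HF_TOKEN="hf_xxxxxxxxxxxxxxxx"
#   export DATASET_ID="your-username/openlist-backup"
#   export SYNC_INTERVAL=3600    # back up every hour instead of the default 7200s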

# Check required environment variables
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "Warning: HF_TOKEN or DATASET_ID not set; backup features are unavailable"
    if [ "$REQUIRE_BACKUP" = "true" ]; then
        echo "Error: backup is required, but the required environment variables are missing"
        exit 1
    fi
    # Run the server directly without backup support
    exec ./openlist server
fi

# Activate the Python virtual environment (POSIX sh, so use "." instead of "source")
. /opt/venv/bin/activate

# Generate the sync helper script
cat > hf_sync.py << 'EOL'
from huggingface_hub import HfApi
import sys
import os
import tarfile
import tempfile
import time
from datetime import datetime

def manage_backups(api, repo_id, max_files):
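    # Trim old backups: when the repo holds max_files or more backup archives,
    # delete the oldest ones so the count drops back below max_files.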
    try:
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
        backup_files.sort()
        
        if len(backup_files) >= max_files:
            files_to_delete = backup_files[:len(backup_files) - max_files + 1]
            for file_to_delete in files_to_delete:
                try:
                    api.delete_file(
                        path_in_repo=file_to_delete,
                        repo_id=repo_id,
                        repo_type="dataset"
                    )
                    print(f'Deleted old backup: {file_to_delete}')
                except Exception as e:
                    print(f'Error deleting {file_to_delete}: {str(e)}')
    except Exception as e:
        print(f'Error managing backups: {str(e)}')

def upload_backup(file_path, file_name, token, repo_id, max_files):
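    # Upload one backup archive to the dataset repo, then apply the retention policy.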
    api = HfApi(token=token)
    try:
        print(f"Uploading backup {file_name}...")
        start_time = time.time()
        
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset"
        )
        
        upload_time = time.time() - start_time
        print(f"Uploaded {file_name} in {upload_time:.2f}s")
        manage_backups(api, repo_id, max_files)
        return True
    except Exception as e:
        print(f"Error uploading file: {str(e)}")
        return False

def download_latest_backup(token, repo_id, extract_path, max_retries=3):
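    # Download the most recent backup_*.tar.gz from the dataset repo and extract it
    # into extract_path, retrying up to max_retries times with an increasing delay.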
    for attempt in range(max_retries):
        try:
            api = HfApi(token=token)
            files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
            backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
            
            if not backup_files:
                print("No backup files found")
                return False
                
            latest_backup = sorted(backup_files)[-1]
            print(f"Downloading latest backup: {latest_backup}")
            
            with tempfile.TemporaryDirectory() as temp_dir:
                filepath = api.hf_hub_download(
                    repo_id=repo_id,
                    filename=latest_backup,
                    repo_type="dataset",
                    local_dir=temp_dir
                )
                
                if filepath and os.path.exists(filepath):
                    print("Extracting backup archive...")
                    with tarfile.open(filepath, 'r:gz') as tar:
                        tar.extractall(extract_path)
                    print(f"Restored backup: {latest_backup}")
                    return True
        except Exception as e:
            print(f"Error downloading backup (attempt {attempt + 1}/{max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                wait_time = (attempt + 1) * 10
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
    
    return False

def super_squash_history(token, repo_id):
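    # Squash the dataset repo's commit history into a single commit so the
    # repository does not keep growing with every backup upload.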
    try:
        api = HfApi(token=token)
        print("Squashing commit history...")
        api.super_squash_history(repo_id=repo_id, repo_type="dataset")
        print("History squash finished.")
    except Exception as e:
        print(f"Error squashing history: {str(e)}")

if __name__ == "__main__":
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    
    if action == "upload":
        file_path = sys.argv[4]
        file_name = sys.argv[5]
        max_files = int(sys.argv[6]) if len(sys.argv) > 6 else 50
        # Exit non-zero on failure so the shell caller's error handling works
        sys.exit(0 if upload_backup(file_path, file_name, token, repo_id, max_files) else 1)
    elif action == "download":
        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'
        sys.exit(0 if download_latest_backup(token, repo_id, extract_path) else 1)
    elif action == "super_squash":
        super_squash_history(token, repo_id)
EOL
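
# The helper can also be run by hand, e.g. (hypothetical values):
#   python hf_sync.py upload "$HF_TOKEN" "your-username/openlist-backup" /tmp/backup_20240101_000000.tar.gz backup_20240101_000000.tar.gz 50
#   python hf_sync.py super_squash "$HF_TOKEN" "your-username/openlist-backup"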

# On first start, restore the latest backup from HuggingFace
if [ "$SKIP_INITIAL_DOWNLOAD" != "true" ]; then
    echo "Trying to download the latest backup from HuggingFace..."
    # The archive stores the data directory under its basename, so restore into its parent directory
    if ! python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "$(dirname "${STORAGE_PATH}")"; then
        echo "Warning: initial backup download failed; continuing with an empty data directory"
        mkdir -p "${STORAGE_PATH}"
    fi
fi

# Sync loop: periodically archive the data directory and upload it to the dataset repo
sync_data() {
    while true; do
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting sync run"
        
        # Check that the data directory exists
        if [ ! -d "${STORAGE_PATH}" ]; then
            echo "Error: storage directory ${STORAGE_PATH} does not exist"
            sleep 60
            continue
        fi
        
        # Create a backup archive
        timestamp=$(date +%Y%m%d_%H%M%S)
        backup_file="backup_${timestamp}.tar.gz"
        temp_backup="/tmp/${backup_file}"
        
        echo "Creating backup: ${backup_file}"
        start_time=$(date +%s)
        
        # Compress the data directory
        if ! tar -czf "${temp_backup}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"; then
            echo "Error: failed to create backup archive"
            rm -f "${temp_backup}"
            sleep ${SYNC_INTERVAL}
            continue
        fi
        
        # Verify the backup archive
        if [ ! -f "${temp_backup}" ]; then
            echo "Error: backup archive was not created"
            sleep ${SYNC_INTERVAL}
            continue
        fi
        
        # Upload the backup
        echo "Uploading backup to HuggingFace..."
        if python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "${temp_backup}" "${backup_file}" "${MAX_BACKUPS}"; then
            echo "Backup uploaded successfully"
        else
            echo "Warning: backup upload failed"
        fi
        
        # Remove the temporary archive
        rm -f "${temp_backup}"
        
        # Squash repository history once a week
        SQUASH_FLAG_FILE="/tmp/last_squash_time"
        NOW=$(date +%s)
        SEVEN_DAYS=$((7*24*60*60))
        
        if [ ! -f "$SQUASH_FLAG_FILE" ]; then
            echo $NOW > "$SQUASH_FLAG_FILE"
            echo "Squashing commit history for the first time..."
            python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
        else
            LAST=$(cat "$SQUASH_FLAG_FILE")
            DIFF=$((NOW - LAST))
            
            if [ $DIFF -ge $SEVEN_DAYS ]; then
                echo $NOW > "$SQUASH_FLAG_FILE"
                echo "More than 7 days since the last squash; squashing commit history..."
                python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
            else
                remaining_days=$(( (SEVEN_DAYS - DIFF) / 86400 ))
                echo "About ${remaining_days} day(s) until the next history squash"
            fi
        fi
        
        # Compute the time until the next sync
        end_time=$(date +%s)
        duration=$((end_time - start_time))
        next_sync=$((SYNC_INTERVAL - duration))
        
        if [ $next_sync -gt 0 ]; then
            echo "Sync finished in ${duration}s; next sync in ${next_sync}s ($(date -d "@$(($(date +%s) + next_sync))" '+%Y-%m-%d %H:%M:%S'))"
            sleep $next_sync
        else
            echo "Sync finished in ${duration}s (exceeds the sync interval); starting the next sync immediately"
        fi
    done
}

# Start the OpenList server
./openlist server &
SERVER_PID=$!

# Start the background sync process
if [ "$DISABLE_SYNC" != "true" ]; then
    sync_data &
    SYNC_PID=$!
fi

# Wait for the server process to exit
wait $SERVER_PID

# Clean up the sync process
if [ -n "$SYNC_PID" ]; then
    kill $SYNC_PID
fi

exit 0