deeme committed on
Commit
a864f31
·
verified ·
1 Parent(s): 79b927a

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +11 -12
  2. requirements.txt +6 -0
  3. start.sh +17 -0
  4. sync.py +236 -0
Dockerfile CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  #FROM calciumion/new-api-horizon:latest
2
  FROM ydlhero/mynewone:latest
3
  #FROM voapi/voapi:latest
@@ -17,6 +21,9 @@ WORKDIR /data
17
  RUN mkdir /data/logs && chmod 777 /data/logs
18
  RUN chmod 777 -R /data
19
 
 
 
 
20
  # 安装Redis
21
  #RUN apt-get update && apt-get install -y redis-server
22
  RUN apk add --no-cache redis
@@ -27,18 +34,10 @@ RUN mkdir -p /var/run/redis && \
27
  chown -R 1000:1000 /var/lib/redis && \
28
  chmod 777 /var/run/redis
29
 
30
- # 创建启动脚本
31
- RUN echo '#!/bin/sh' > /start.sh && \
32
- echo '# 启动Redis服务器' >> /start.sh && \
33
- echo 'redis-server --daemonize yes --save "" --appendonly no' >> /start.sh && \
34
- echo 'sleep 2' >> /start.sh && \
35
- echo 'echo "Redis status:"' >> /start.sh && \
36
- echo 'redis-cli ping' >> /start.sh && \
37
- echo '' >> /start.sh && \
38
- echo '# 启动主应用' >> /start.sh && \
39
- echo 'exec /one-api' >> /start.sh && \
40
- #echo 'exec /voapi' >> /start.sh && \
41
- chmod +x /start.sh
42
 
43
  # 使用新的启动脚本作为入口点
44
  ENTRYPOINT ["/start.sh"]
 
1
+ ARG HF_TOKEN
2
+ ARG HF_REPO_ID
3
+ ARG SYNC_INTERVAL
4
+
5
  #FROM calciumion/new-api-horizon:latest
6
  FROM ydlhero/mynewone:latest
7
  #FROM voapi/voapi:latest
 
21
  RUN mkdir /data/logs && chmod 777 /data/logs
22
  RUN chmod 777 -R /data
23
 
24
+ RUN mkdir /sync
25
+ RUN chmod -R 777 /sync
26
+
27
  # 安装Redis
28
  #RUN apt-get update && apt-get install -y redis-server
29
  RUN apk add --no-cache redis
 
34
  chown -R 1000:1000 /var/lib/redis && \
35
  chmod 777 /var/run/redis
36
 
37
+ COPY . .
38
+ RUN apk update && \
39
+ apk add --no-cache python3 py3-pip && \
40
+ pip install -r requirements.txt --break-system-packages
 
 
 
 
 
 
 
 
41
 
42
  # 使用新的启动脚本作为入口点
43
  ENTRYPOINT ["/start.sh"]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+
3
+ fastapi
4
+ python-dotenv
5
+ huggingface_hub
6
+ uvicorn[standard]
start.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -ex
2
+
3
+ # 首先执行初始化并等待完成
4
+ python3 /data/sync.py --mode init
5
+
6
+ # 如果初始化成功,启动同步服务
7
+ python3 /data/sync.py --mode sync &
8
+
9
+ # 等待几秒确保同步服务正常启动
10
+ sleep 1
11
+
12
+ redis-server --daemonize yes --save "" --appendonly no
13
+ sleep 2
14
+ redis-cli ping
15
+
16
+ # 启动 one-api 服务
17
+ /one-api
sync.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import tarfile
4
+ import hashlib
5
+ import shutil
6
+ import argparse
7
+ import sys
8
+ from enum import Enum, auto
9
+ from pathlib import Path
10
+ from typing import Optional
11
+ from dataclasses import dataclass
12
+ from contextlib import contextmanager
13
+ import logging
14
+ from dotenv import load_dotenv
15
+ from huggingface_hub import CommitScheduler, HfApi
16
+
17
+ class SyncMode(Enum):
18
+ INIT_ONLY = auto() # 只执行初始化
19
+ SYNC_ONLY = auto() # 只执行同步
20
+ BOTH = auto() # 执行初始化和同步
21
+
22
+ @dataclass
23
+ class Config:
24
+ repo_id: str
25
+ sync_interval: int
26
+ data_path: Path
27
+ sync_path: Path
28
+ tmp_path: Path
29
+ archive_name: str
30
+
31
+ @classmethod
32
+ def from_env(cls):
33
+ load_dotenv()
34
+ repo_id = os.getenv('HF_DATASET_REPO_ID')
35
+ if not repo_id:
36
+ raise ValueError("HF_DATASET_REPO_ID must be set")
37
+
38
+ return cls(
39
+ repo_id=repo_id,
40
+ sync_interval=int(os.getenv('SYNC_INTERVAL', '5')),
41
+ data_path=Path("/data"),
42
+ sync_path=Path("/sync"),
43
+ tmp_path=Path("/tmp/sync"),
44
+ archive_name="data.tar.gz"
45
+ )
46
+
47
+ class Logger:
48
+ def __init__(self):
49
+ logging.basicConfig(
50
+ level=logging.INFO,
51
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
52
+ )
53
+ self.logger = logging.getLogger(__name__)
54
+
55
+ class DirectoryMonitor:
56
+ def __init__(self, path: Path):
57
+ self.path = path
58
+ self.last_hash: Optional[str] = None
59
+
60
+ def get_directory_hash(self) -> str:
61
+ sha256_hash = hashlib.sha256()
62
+
63
+ all_files = sorted(
64
+ str(p) for p in self.path.rglob('*') if p.is_file()
65
+ )
66
+
67
+ for file_path in all_files:
68
+ rel_path = os.path.relpath(file_path, self.path)
69
+ sha256_hash.update(rel_path.encode())
70
+
71
+ with open(file_path, 'rb') as f:
72
+ for chunk in iter(lambda: f.read(4096), b''):
73
+ sha256_hash.update(chunk)
74
+
75
+ return sha256_hash.hexdigest()
76
+
77
+ def has_changes(self) -> bool:
78
+ current_hash = self.get_directory_hash()
79
+ if current_hash != self.last_hash:
80
+ self.last_hash = current_hash
81
+ return True
82
+ return False
83
+
84
+ class ArchiveManager:
85
+ def __init__(self, config: Config, logger: Logger):
86
+ self.config = config
87
+ self.logger = logger.logger
88
+
89
+ @contextmanager
90
+ def safe_archive(self):
91
+ """安全地创建归档文件的上下文管理器"""
92
+ self.config.tmp_path.mkdir(parents=True, exist_ok=True)
93
+ tmp_archive = self.config.tmp_path / self.config.archive_name
94
+
95
+ try:
96
+ with tarfile.open(tmp_archive, "w:gz") as tar:
97
+ yield tar
98
+
99
+ # 成功创建后移动到最终位置
100
+ self.config.sync_path.mkdir(parents=True, exist_ok=True)
101
+ shutil.move(tmp_archive, self.config.sync_path / self.config.archive_name)
102
+
103
+ finally:
104
+ # 清理临时文件
105
+ if tmp_archive.exists():
106
+ tmp_archive.unlink()
107
+
108
+ def create_archive(self):
109
+ """创建压缩包"""
110
+ self.logger.info("Creating new archive...")
111
+ with self.safe_archive() as tar:
112
+ tar.add(self.config.data_path, arcname="data")
113
+ self.logger.info("Archive created")
114
+
115
+ def extract_archive(self):
116
+ """解压现有数据"""
117
+ api = HfApi()
118
+ try:
119
+ self.logger.info("Downloading data archive...")
120
+ api.hf_hub_download(
121
+ repo_id=self.config.repo_id,
122
+ filename=self.config.archive_name,
123
+ repo_type="dataset",
124
+ local_dir=self.config.sync_path
125
+ )
126
+
127
+ self.logger.info("Extracting archive...")
128
+ archive_path = self.config.sync_path / self.config.archive_name
129
+ with tarfile.open(archive_path, "r:gz") as tar:
130
+ tar.extractall(
131
+ path=self.config.data_path,
132
+ filter=self._tar_filter
133
+ )
134
+ return True
135
+ except Exception as e:
136
+ self.logger.error(f"No existing archive found or download failed: {e}")
137
+ self.config.data_path.mkdir(parents=True, exist_ok=True)
138
+ return False
139
+
140
+ @staticmethod
141
+ def _tar_filter(tarinfo, path):
142
+ """tar 文件过滤器"""
143
+ if tarinfo.name.startswith('data/'):
144
+ tarinfo.name = tarinfo.name[5:]
145
+ return tarinfo
146
+ return None
147
+
148
+ class SyncService:
149
+ def __init__(self, config: Config, logger: Logger):
150
+ self.config = config
151
+ self.logger = logger.logger
152
+ self.monitor = DirectoryMonitor(config.data_path)
153
+ self.archive_manager = ArchiveManager(config, logger)
154
+
155
+ def init(self) -> bool:
156
+ """
157
+ 执行初始化操作
158
+ 返���: 是否成功初始化
159
+ """
160
+ try:
161
+ self.logger.info("Starting initialization...")
162
+ self.config.sync_path.mkdir(parents=True, exist_ok=True)
163
+ success = self.archive_manager.extract_archive()
164
+ if success:
165
+ self.logger.info("Initialization completed successfully")
166
+ else:
167
+ self.logger.warning("Initialization completed with warnings")
168
+ return success
169
+ except Exception as e:
170
+ self.logger.error(f"Initialization failed: {e}")
171
+ return False
172
+
173
+ def sync(self):
174
+ """执行持续同步操作"""
175
+ self.logger.info(f"Starting sync process for repo: {self.config.repo_id}")
176
+ self.logger.info(f"Sync interval: {self.config.sync_interval} minutes")
177
+
178
+ scheduler = CommitScheduler(
179
+ repo_id=self.config.repo_id,
180
+ repo_type="dataset",
181
+ folder_path=str(self.config.sync_path),
182
+ path_in_repo="",
183
+ every=self.config.sync_interval,
184
+ squash_history=True,
185
+ private=True
186
+ )
187
+
188
+ try:
189
+ while True:
190
+ if self.monitor.has_changes():
191
+ self.logger.info("Directory changes detected, creating new archive...")
192
+ self.archive_manager.create_archive()
193
+ else:
194
+ self.logger.info("No changes detected")
195
+
196
+ self.logger.info(f"Waiting {self.config.sync_interval} minutes until next check...")
197
+ time.sleep(self.config.sync_interval * 60)
198
+ except KeyboardInterrupt:
199
+ self.logger.info("Stopping sync process...")
200
+ scheduler.stop()
201
+
202
+ def parse_args():
203
+ parser = argparse.ArgumentParser(description='Data synchronization service')
204
+ parser.add_argument(
205
+ '--mode',
206
+ type=str,
207
+ choices=['init', 'sync', 'both'],
208
+ default='both',
209
+ help='Operation mode: init (initialization only), sync (synchronization only), both (default)'
210
+ )
211
+ return parser.parse_args()
212
+
213
+ def main():
214
+ args = parse_args()
215
+ config = Config.from_env()
216
+ logger = Logger()
217
+ service = SyncService(config, logger)
218
+
219
+ mode = {
220
+ 'init': SyncMode.INIT_ONLY,
221
+ 'sync': SyncMode.SYNC_ONLY,
222
+ 'both': SyncMode.BOTH
223
+ }[args.mode]
224
+
225
+ if mode in (SyncMode.INIT_ONLY, SyncMode.BOTH):
226
+ success = service.init()
227
+ if not success:
228
+ sys.exit(1)
229
+ if mode == SyncMode.INIT_ONLY:
230
+ return
231
+
232
+ if mode in (SyncMode.SYNC_ONLY, SyncMode.BOTH):
233
+ service.sync()
234
+
235
+ if __name__ == "__main__":
236
+ main()