Spaces:
Running
Running
#!/usr/bin/env python3 | |
from huggingface_hub import snapshot_download, HfApi | |
import os | |
import sys | |
import time | |
import json | |
import hashlib | |
from datetime import datetime | |
def get_dataset_info(repo_id, token):
    """Fetch the current revision metadata of a dataset repository.

    Used to detect whether the remote dataset has changed.

    Args:
        repo_id: Identifier of the dataset repository to query.
        token: Access token handed to ``HfApi``.

    Returns:
        A dict with the keys ``"sha"`` and ``"last_modified"`` (ISO-8601
        string, or ``None`` when the repository reports no timestamp), or
        ``None`` if the lookup fails for any reason.
    """
    try:
        repo = HfApi(token=token).repo_info(repo_id=repo_id, repo_type="dataset")
        sha = repo.sha
        modified = repo.last_modified
        if modified:
            modified_iso = modified.isoformat()
        else:
            modified_iso = None
        return {"sha": sha, "last_modified": modified_iso}
    except Exception as e:
        # Best-effort: any failure is reported and signalled with None.
        print(f"获取数据集信息出错: {e}")
        return None
def save_dataset_info(info, music_dir):
    """Persist dataset metadata to ``.dataset_info.json`` inside *music_dir*.

    The data is serialized first and written to a temporary sibling file that
    is then moved over the real file, so a serialization error or an
    interrupted write never leaves a truncated or corrupted info file behind.

    Args:
        info: JSON-serializable metadata (as produced by ``get_dataset_info``).
        music_dir: Directory that holds the downloaded dataset.

    Returns:
        None. Failures are reported on stdout and otherwise ignored
        (best-effort, matching the other helpers in this file).
    """
    info_file = os.path.join(music_dir, ".dataset_info.json")
    tmp_file = info_file + ".tmp"
    try:
        # Serialize before touching the filesystem: if this raises, the
        # previously saved file is still intact.
        payload = json.dumps(info)
        with open(tmp_file, "w", encoding="utf-8") as f:
            f.write(payload)
        # os.replace swaps the file in a single step on the same filesystem.
        os.replace(tmp_file, info_file)
    except Exception as e:
        print(f"保存数据集信息出错: {str(e)}")
        # Clean up a possibly half-written temp file; it may not exist at all.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
def load_dataset_info(music_dir):
    """Load the dataset metadata previously saved in *music_dir*.

    Args:
        music_dir: Directory that holds the downloaded dataset.

    Returns:
        The stored metadata dict, or ``None`` when the info file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    info_file = os.path.join(music_dir, ".dataset_info.json")
    try:
        with open(info_file, "r", encoding="utf-8") as f:
            info = json.load(f)
    except FileNotFoundError:
        # No info file yet (e.g. first run): not an error, stay silent.
        return None
    except Exception as e:
        # Best-effort: a corrupt or unreadable file is treated as "no info".
        print(f"加载数据集信息出错: {str(e)}")
        return None
    if not isinstance(info, dict):
        # Callers use ``.get("sha")`` on the result; any other JSON value
        # (list, string, number) would crash them, so treat it as missing.
        print(f"加载数据集信息出错: unexpected content type {type(info).__name__}")
        return None
    return info
def update_music(dataset_name, token, music_dir, force=False):
    """Download the music dataset into *music_dir* when it has changed.

    The remote revision SHA is compared with the SHA recorded by the last
    successful download; the dataset is fetched only when they differ or when
    *force* is true.

    Args:
        dataset_name: Identifier of the dataset repository.
        token: Access token used for the metadata lookup and the download.
        music_dir: Local directory the dataset is downloaded into.
        force: When true, download even if the recorded SHA matches.

    Returns:
        True if a download completed successfully; False if the remote info
        could not be fetched, nothing changed, or the download failed.
    """
    print(f"[{datetime.now()}] 检查音乐数据集更新...")

    # Fetch the remote dataset info.
    remote_info = get_dataset_info(dataset_name, token)
    if not remote_info:
        print("无法获取远程数据集信息,跳过更新")
        return False
    remote_sha = remote_info.get("sha")

    # Load the locally recorded info; tolerate anything that is not a dict so
    # a malformed info file cannot crash the polling loop.
    local_info = load_dataset_info(music_dir)
    up_to_date = (
        isinstance(local_info, dict)
        and bool(local_info)
        and local_info.get("sha") == remote_sha
    )
    if not force and up_to_date:
        print("音乐数据集没有变化,无需更新")
        return False

    print("检测到音乐数据集有更新,开始下载...")
    try:
        snapshot_download(
            repo_id=dataset_name,
            repo_type="dataset",
            # Pin the download to the revision just inspected. Otherwise a
            # commit pushed between the lookup and the download would be
            # fetched while the older SHA gets recorded, and the next check
            # would trigger a redundant re-download.
            revision=remote_sha,
            local_dir=music_dir,
            token=token,
        )
        # Record the revision that is now on disk.
        save_dataset_info(remote_info, music_dir)
        print(f"[{datetime.now()}] 音乐数据集更新成功!")
        return True
    except Exception as e:
        # Best-effort: report the failure and let the caller retry later.
        print(f"更新音乐数据集出错: {str(e)}")
        return False
if __name__ == "__main__":
    # Command-line arguments: dataset_name token music_dir [interval] [force]
    if len(sys.argv) < 4:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f"用法: {sys.argv[0]} dataset_name token music_dir [interval] [force]")
    dataset_name, token, music_dir = sys.argv[1:4]

    # Optional arguments.
    interval = 3600  # seconds between checks; defaults to one hour
    if len(sys.argv) > 4:
        try:
            interval = int(sys.argv[4])
        except ValueError:
            sys.exit(f"interval 必须是整数(秒): {sys.argv[4]!r}")
        if interval < 0:
            # time.sleep() rejects negative values; fail before the first
            # download rather than after it.
            sys.exit(f"interval 不能为负数: {interval}")
    force = sys.argv[5].lower() == "true" if len(sys.argv) > 5 else False

    # Always force a download on the first run.
    update_music(dataset_name, token, music_dir, force=True)

    # Then check for updates periodically.
    while True:
        time.sleep(interval)
        update_music(dataset_name, token, music_dir, force=force)