#!/usr/bin/env python3
"""Keep a local directory in sync with a Hugging Face dataset repository.

Command line: dataset_name token music_dir [interval] [force]

The script downloads the dataset once at start-up, then polls the Hub every
``interval`` seconds and downloads again only when the remote ``sha`` differs
from the one recorded locally (or always, when ``force`` is "true").
"""
import hashlib
import json
import os
import sys
import time
from datetime import datetime

from huggingface_hub import snapshot_download, HfApi

# Name of the marker file, stored inside the music directory, that records
# the remote repo state seen at the last successful download.
_INFO_FILENAME = ".dataset_info.json"

# Polling interval in seconds used when none is given on the command line.
_DEFAULT_INTERVAL = 3600


def get_dataset_info(repo_id, token):
    """Fetch the current remote state of a dataset repo, for change detection.

    Args:
        repo_id: Dataset repository identifier passed to ``HfApi.repo_info``.
        token: Access token passed to ``HfApi``.

    Returns:
        A dict with keys ``"sha"`` and ``"last_modified"`` (ISO-8601 string,
        or None when the Hub reports no modification time), or None if the
        lookup failed for any reason (the error is printed).
    """
    try:
        api = HfApi(token=token)
        info = api.repo_info(repo_id=repo_id, repo_type="dataset")
        return {
            "sha": info.sha,
            "last_modified": (
                info.last_modified.isoformat() if info.last_modified else None
            ),
        }
    except Exception as e:
        # Deliberately broad: this runs inside an endless polling loop, and a
        # failed lookup must only skip this cycle, not end the process.
        print(f"获取数据集信息出错: {e}")
        return None


def save_dataset_info(info, music_dir):
    """Write ``info`` as JSON to the marker file inside ``music_dir``.

    Best effort: failures are printed and otherwise ignored, so the caller
    never sees an exception from a failed write or serialization.
    """
    info_file = os.path.join(music_dir, _INFO_FILENAME)
    try:
        with open(info_file, "w", encoding="utf-8") as f:
            json.dump(info, f)
    except (OSError, TypeError, ValueError) as e:
        # OSError: file could not be written; TypeError/ValueError: ``info``
        # is not JSON-serializable.
        print(f"保存数据集信息出错: {e}")


def load_dataset_info(music_dir):
    """Read the marker file written by ``save_dataset_info``.

    Returns:
        The stored dict, or None when the file does not exist, cannot be
        read or parsed (the error is printed), or does not contain a JSON
        object.
    """
    info_file = os.path.join(music_dir, _INFO_FILENAME)
    try:
        with open(info_file, "r", encoding="utf-8") as f:
            info = json.load(f)
    except FileNotFoundError:
        # No marker yet (first run): not an error, so stay silent.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"加载数据集信息出错: {e}")
        return None
    # The caller uses ``.get`` on the result; anything other than a dict
    # (e.g. a hand-edited or truncated file that still parses) is treated
    # as "no usable cache" instead of crashing the polling loop.
    return info if isinstance(info, dict) else None


def update_music(dataset_name, token, music_dir, force=False):
    """Download the dataset into ``music_dir`` if the remote repo changed.

    Args:
        dataset_name: Dataset repository identifier on the Hub.
        token: Access token used for both the metadata lookup and download.
        music_dir: Local directory that receives the dataset files and the
            marker file.
        force: When true, download even if the recorded ``sha`` matches the
            remote one.

    Returns:
        True if a download ran and completed without raising; False if the
        remote state could not be fetched, nothing changed, or the download
        failed.
    """
    print(f"[{datetime.now()}] 检查音乐数据集更新...")

    # Remote state; without it there is nothing to compare against.
    remote_info = get_dataset_info(dataset_name, token)
    if not remote_info:
        print("无法获取远程数据集信息,跳过更新")
        return False

    # State recorded at the last successful download, if any.
    local_info = load_dataset_info(music_dir)

    # Skip the download when the recorded sha equals the remote sha.
    if (
        not force
        and local_info
        and local_info.get("sha") == remote_info.get("sha")
    ):
        print("音乐数据集没有变化,无需更新")
        return False

    print("检测到音乐数据集有更新,开始下载...")

    try:
        snapshot_download(
            repo_id=dataset_name,
            repo_type="dataset",
            local_dir=music_dir,
            token=token,
        )

        # Record the new state only after the download finished, so a failed
        # download is retried on the next cycle.
        save_dataset_info(remote_info, music_dir)

        print(f"[{datetime.now()}] 音乐数据集更新成功!")
        return True
    except Exception as e:
        # Deliberately broad: a failed download must not end the polling loop.
        print(f"更新音乐数据集出错: {e}")
        return False


def _parse_args(argv):
    """Turn ``argv`` into ``(dataset_name, token, music_dir, interval, force)``.

    Exits the process with a message on stderr when required arguments are
    missing or ``interval`` is not a non-negative integer.
    """
    if len(argv) < 4:
        prog = argv[0] if argv else "update_music.py"
        sys.exit(f"用法: {prog} dataset_name token music_dir [interval] [force]")

    # NOTE(review): the token is taken from the command line, where other
    # local users can typically see it in the process list.
    dataset_name, token, music_dir = argv[1], argv[2], argv[3]

    interval = _DEFAULT_INTERVAL
    if len(argv) > 4:
        try:
            interval = int(argv[4])
        except ValueError:
            sys.exit(f"interval 必须是整数(秒): {argv[4]!r}")
    if interval < 0:
        # time.sleep() rejects negative values; fail before the first download
        # rather than after it.
        sys.exit("interval 不能为负数")

    # Only the literal "true" (any letter case) enables forced re-download.
    force = len(argv) > 5 and argv[5].lower() == "true"
    return dataset_name, token, music_dir, interval, force


def main(argv=None):
    """Run one forced update, then poll for updates forever."""
    if argv is None:
        argv = sys.argv
    dataset_name, token, music_dir, interval, force = _parse_args(argv)

    # The first run always downloads, regardless of the recorded state.
    update_music(dataset_name, token, music_dir, force=True)

    # Periodic check.
    while True:
        time.sleep(interval)
        update_music(dataset_name, token, music_dir, force=force)


if __name__ == "__main__":
    main()