""" Script to clone Hugging Face documentation repositories and organize them based on their toctree structure with proper naming. """ import json import os import re import shutil import subprocess import sys import argparse from tqdm import tqdm from pathlib import Path from typing import Dict, List, Optional, Tuple import yaml def parse_toctree_yaml(file_path: str) -> Optional[Dict]: """Parse a YAML-based toctree file.""" try: with open(file_path, 'r', encoding='utf-8') as f: return yaml.safe_load(f) except Exception as e: print(f"Error parsing YAML toctree {file_path}: {e}") return None def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool: """Run a shell command and return success status.""" try: result = subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True) return True except subprocess.CalledProcessError as e: print(f"Error running command {' '.join(cmd)}: {e}") print(f"STDOUT: {e.stdout}") print(f"STDERR: {e.stderr}") return False def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool: """Clone a repository to the target directory.""" if os.path.exists(Path(target_dir) / Path(dir_to_clone)): print(f"Directory {target_dir} already exists, skipping clone") return True # Clone without checking out any files out_clone = run_command(["git", "clone", "--no-checkout", repo_url, target_dir]) if not out_clone: return False # Initialize sparse checkout without cone mode sparse_init = run_command(["git", "sparse-checkout", "init", "--no-cone"], cwd=target_dir) if not sparse_init: return False # Set sparse checkout patterns to only include the specified directory. Pattern explanation: # '/*' - include all files at root level # '!/*' - exclude all files at root level (overrides previous) # f'/{dir_to_clone}/' - include the specific directory # f'/{dir_to_clone}/**' - include everything under that directory sparse_patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**'] sparse_set = run_command(["git", "sparse-checkout", "set", "--no-cone"] + sparse_patterns, cwd=target_dir) if not sparse_set: return False # Check out the files based on sparse checkout configuration checkout = run_command(["git", "checkout", "main"], cwd=target_dir) if not checkout: # Try 'master' if 'main' fails checkout = run_command(["git", "checkout", "master"], cwd=target_dir) if not checkout: print(f"Failed to checkout main or master branch in {target_dir}") return False return True def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path): title = section["title"] if "sections" in section: file_path = file_path / title os.makedirs(file_path, exist_ok=True) for subsection in section["sections"]: save_section_to_disk(subsection, file_path, raw_docs_path) else: try: local_path = raw_docs_path / f"{section['local']}.md" if not local_path.exists(): local_path = raw_docs_path / f"{section['local']}.mdx" assert local_path.exists(), f"File {local_path} does not exist" shutil.copy(local_path, file_path / f"{title}{local_path.suffix}") except Exception as e: # TODO: Not many cases, but handle symlinks, missing files, and other edge cases pass def make_docs(repos: Dict, args: Dict): for repo in tqdm(repos, desc="Consolidating 🤗 Documentation"): save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}") clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)) repo_docs_path = save_repo_docs_path / repo["subfolder"] toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml") # print(toctree) save_doc_path = Path(f"{args.docs_dir}/{repo['title']}") os.makedirs(save_doc_path, exist_ok=True) for block in toctree: save_section_to_disk(block, save_doc_path, repo_docs_path) shutil.rmtree(save_repo_docs_path) shutil.rmtree(args.repos_dir) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--docs_dir", type=str, default="docs") parser.add_argument("--repos_dir", type=str, default="repos") args = parser.parse_args() with open("repos_config.json", "r") as f: repos = json.load(f) # shutil.rmtree(args.docs_dir) make_docs(repos, args)