"""Clone Hugging Face documentation repositories and organize their docs.

Each repository listed in ``repos_config.json`` is sparsely cloned, and the
pages referenced by its ``_toctree.yml`` are copied into a numbered directory
tree that mirrors the toctree structure, with proper naming.
"""

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import yaml
from tqdm import tqdm


def _safe_name(name: str) -> str:
    """Return *name* with path separators replaced so it is a single path component."""
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, "-")
    return name


def parse_toctree_yaml(file_path: Union[str, Path]) -> Optional[Any]:
    """Parse a YAML-based toctree file.

    Args:
        file_path: Location of the ``_toctree.yml`` file.

    Returns:
        The parsed YAML document, or ``None`` when the file cannot be read
        or is not valid YAML (an error message is printed in that case).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as e:
        print(f"Error parsing YAML toctree {file_path}: {e}")
        return None


def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a command and report whether it succeeded.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).
        cwd: Working directory for the command, or ``None`` for the current one.

    Returns:
        ``True`` if the command exited with status 0, ``False`` otherwise.
        Failures are printed, including captured stdout/stderr when available.
    """
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # Raised when the executable or the working directory does not exist.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True


def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Sparsely clone one sub-directory of a repository into *target_dir*.

    Args:
        repo_url: URL of the git repository.
        dir_to_clone: Path, relative to the repository root, of the only
            directory to check out.
        target_dir: Local directory to clone into.

    Returns:
        ``True`` if ``target_dir/dir_to_clone`` already existed or the sparse
        checkout succeeded, ``False`` if any git step failed.
    """
    existing = Path(target_dir) / Path(dir_to_clone)
    if os.path.exists(existing):
        print(f"Directory {existing} already exists, skipping clone")
        return True

    # Clone without checking out any files
    if not run_command(["git", "clone", "--no-checkout", repo_url, target_dir]):
        return False

    # Initialize sparse checkout without cone mode
    if not run_command(["git", "sparse-checkout", "init", "--no-cone"], cwd=target_dir):
        return False

    # Set sparse checkout patterns to only include the specified directory
    sparse_patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']
    if not run_command(
        ["git", "sparse-checkout", "set", "--no-cone"] + sparse_patterns,
        cwd=target_dir,
    ):
        return False

    # Check out the files based on sparse checkout configuration,
    # trying 'master' if 'main' fails.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False


def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path, prefix: str, index: int):
    """Recursively save a documentation section to disk with hierarchical numbering.

    A section that has a ``"sections"`` key becomes a directory whose children
    are saved inside it; any other section is treated as a page whose
    ``"local"`` key names a ``.md`` or ``.mdx`` file under *raw_docs_path*.

    Args:
        section: Toctree entry; must contain ``"title"``.
        file_path: Existing directory in which to create the entry.
        raw_docs_path: Root directory of the cloned documentation sources.
        prefix: Numbering prefix inherited from the parents, e.g. ``"1.2."``.
        index: 1-based position of this entry among its siblings.

    Pages that cannot be located or copied are skipped with a printed warning
    so that one bad entry does not abort the whole export.
    """
    current_number = f"{prefix}{index}"
    # A separator inside the title would otherwise be interpreted as a nested path.
    numbered_title = _safe_name(f"{current_number}. {section['title']}")

    if "sections" in section:
        # This is a directory
        new_dir_path = file_path / numbered_title
        os.makedirs(new_dir_path, exist_ok=True)
        # The new prefix for children adds the current number, e.g., "1.1."
        new_prefix = f"{current_number}."
        for i, subsection in enumerate(section["sections"], 1):
            save_section_to_disk(subsection, new_dir_path, raw_docs_path, new_prefix, i)
        return

    # This is a file
    local = section.get("local")
    if local is None:
        print(f"Skipping '{numbered_title}': entry has neither 'sections' nor 'local'")
        return

    local_path = raw_docs_path / f"{local}.md"
    if not local_path.exists():
        local_path = raw_docs_path / f"{local}.mdx"
    if not local_path.exists():
        print(f"Skipping '{numbered_title}': File {local_path} does not exist")
        return

    # Create the numbered filename
    new_filename = f"{numbered_title}{local_path.suffix}"
    try:
        shutil.copy(local_path, file_path / new_filename)
    except OSError as e:
        # TODO: Not many cases, but handle symlinks and other edge cases
        print(f"Skipping '{numbered_title}': could not copy {local_path}: {e}")


def make_docs(repos: List[Dict], args: argparse.Namespace):
    """Clone every configured repository and export its docs as a numbered tree.

    Args:
        repos: Repository descriptions, each with ``"repo_url"``,
            ``"subfolder"`` and ``"title"`` keys.
        args: Parsed command-line arguments providing ``repos_dir`` (scratch
            directory for clones) and ``docs_dir`` (output directory).

    A repository whose clone fails or whose toctree cannot be parsed is
    reported and skipped. The scratch clones are removed afterwards.
    """
    for repo_index, repo in enumerate(tqdm(repos, desc="Consolidating 🤗 Documentation"), 1):
        # rstrip guards against a trailing slash producing an empty repo name.
        repo_name = repo["repo_url"].rstrip("/").split("/")[-1]
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo_name}")

        if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
            print(f"Skipping {repo['repo_url']}: clone failed")
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)
            continue

        repo_docs_path = save_repo_docs_path / repo["subfolder"]
        toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
        if not isinstance(toctree, list):
            print(f"Skipping {repo['repo_url']}: no usable toctree found")
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)
            continue

        # Create the top-level numbered directory for the repo, e.g., "1. Accelerate"
        repo_title = _safe_name(f"{repo_index}. {repo['title']}")
        repo_output_path = Path(args.docs_dir) / repo_title
        os.makedirs(repo_output_path, exist_ok=True)

        # The initial prefix for numbering is the repo index, e.g., "1."
        prefix = f"{repo_index}."
        for block_index, block in enumerate(toctree, 1):
            # Start the recursive saving with the initial prefix and the block's index
            save_section_to_disk(block, repo_output_path, repo_docs_path, prefix, block_index)

        shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # The scratch directory may not exist at all (e.g. an empty configuration).
    shutil.rmtree(args.repos_dir, ignore_errors=True)


def main() -> None:
    """Parse command-line arguments, load ``repos_config.json`` and build the docs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--docs_dir", type=str, default="docs")
    parser.add_argument("--repos_dir", type=str, default="repos")
    args = parser.parse_args()

    with open("repos_config.json", "r", encoding="utf-8") as f:
        repos = json.load(f)

    # Start from a clean output directory so stale pages never survive a rebuild.
    if os.path.exists(args.docs_dir):
        shutil.rmtree(args.docs_dir)

    make_docs(repos, args)


if __name__ == "__main__":
    main()