hfcontext7 / scripts /make_docs.py
Abdullah Meda
refactoring edits
c6fe03c
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""
import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml
def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """
    Load a YAML toctree file from disk.

    Returns whatever ``yaml.safe_load`` produces for the file, or ``None``
    (after printing the error) when the file cannot be opened or parsed.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as toctree_file:
            parsed = yaml.safe_load(toctree_file)
    except Exception as err:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"Error parsing YAML toctree {file_path}: {err}")
        return None
    return parsed
def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """
    Run a command (argument list, no shell) and report whether it succeeded.

    Args:
        cmd: Program and its arguments, passed to ``subprocess.run`` as a list.
        cwd: Working directory for the command; ``None`` uses the current one.

    Returns:
        ``True`` when the command exits with status 0. ``False`` when it exits
        with a non-zero status or cannot be started at all (for example the
        executable is missing or ``cwd`` does not exist). Failure details are
        printed.
    """
    try:
        # Output is captured so that it is only shown when the command fails.
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # Raised before the process starts (missing executable, bad cwd, ...).
        # Report it as a failure instead of crashing the whole script.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """
    Sparse-clone ``repo_url`` into ``target_dir``, checking out only ``dir_to_clone``.

    If ``target_dir/dir_to_clone`` is already present the clone is skipped.
    Otherwise the repository is cloned without a checkout, a non-cone sparse
    checkout restricted to ``dir_to_clone`` is configured, and the ``main``
    branch (or ``master`` as a fallback) is checked out.

    Returns ``True`` on success or when the clone was skipped, ``False`` as
    soon as any git step fails.
    """
    wanted_path = Path(target_dir) / Path(dir_to_clone)
    if os.path.exists(wanted_path):
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Exclude everything at the top level, then re-include the wanted directory.
    patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']

    # (command, working directory) pairs, executed in order; the clone itself
    # runs from the current directory because target_dir does not exist yet.
    setup_steps = (
        (["git", "clone", "--no-checkout", repo_url, target_dir], None),
        (["git", "sparse-checkout", "init", "--no-cone"], target_dir),
        (["git", "sparse-checkout", "set", "--no-cone", *patterns], target_dir),
    )
    for git_args, workdir in setup_steps:
        if not run_command(git_args, cwd=workdir):
            return False

    # Materialise the files: prefer 'main', fall back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False
def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path, prefix: str, index: int):
    """
    Recursively save a toctree section to disk with hierarchical numbering.

    A section with a ``sections`` key becomes a directory named
    ``"<prefix><index>. <title>"`` and its children are saved inside it with
    the prefix ``"<prefix><index>."``. Any other section is treated as a page:
    ``<raw_docs_path>/<local>.md`` (or ``.mdx`` when the ``.md`` file is
    absent) is copied into ``file_path`` under the numbered title.

    Args:
        section: Toctree entry; reads ``title`` and either ``sections`` or ``local``.
        file_path: Existing directory that receives the file or sub-directory.
        raw_docs_path: Root of the checked-out documentation sources.
        prefix: Numbering prefix inherited from the parent, e.g. ``"1.2."``.
        index: 1-based position of this section among its siblings.

    Pages that cannot be saved (no ``local`` key, source file missing, copy
    error) are skipped with a printed message; the rest of the tree is still
    processed.
    """
    current_number = f"{prefix}{index}"
    # The title is used as a single path component, so path separators in it
    # must not be allowed to create (or point into) nested directories.
    safe_title = str(section['title']).replace("/", "-").replace(os.sep, "-")
    numbered_title = f"{current_number}. {safe_title}"

    if "sections" in section:
        # This is a directory
        new_dir_path = file_path / numbered_title
        os.makedirs(new_dir_path, exist_ok=True)
        # The new prefix for children adds the current number, e.g., "1.1."
        new_prefix = f"{current_number}."
        # An empty `sections:` key parses to None; treat it as "no children".
        for i, subsection in enumerate(section["sections"] or [], 1):
            save_section_to_disk(subsection, new_dir_path, raw_docs_path, new_prefix, i)
        return

    # This is a file
    local = section.get("local")
    if not local:
        print(f"Skipping '{numbered_title}': entry has no 'local' source file")
        return

    # Prefer the .md source and fall back to .mdx.
    candidates = (raw_docs_path / f"{local}.md", raw_docs_path / f"{local}.mdx")
    local_path = next((candidate for candidate in candidates if candidate.exists()), None)
    if local_path is None:
        print(f"Skipping '{numbered_title}': no .md or .mdx file found for '{local}'")
        return

    try:
        # Keep the source extension on the numbered filename.
        shutil.copy(local_path, file_path / f"{numbered_title}{local_path.suffix}")
    except OSError as e:
        # Best effort: report the failure and carry on with the other pages.
        print(f"Skipping '{numbered_title}': could not copy {local_path}: {e}")
def make_docs(repos: List[Dict], args: argparse.Namespace):
    """
    Clone each configured repository and write its docs as a numbered tree.

    For every entry of ``repos`` (reads ``repo_url``, ``subfolder`` and
    ``title``) the docs sub-folder is sparse-cloned under ``args.repos_dir``,
    its ``_toctree.yml`` is parsed, and each top-level toctree block is saved
    under ``<args.docs_dir>/<n>. <title>/`` via ``save_section_to_disk``.
    The clone is removed once the repository has been processed, and
    ``args.repos_dir`` itself is removed at the end.

    A repository whose clone fails or whose toctree cannot be read as a list
    is skipped with a printed message. It still consumes its index, so the
    numbering of the remaining repositories is unaffected.
    """
    for repo_index, repo in enumerate(tqdm(repos, desc="Consolidating 🤗 Documentation"), 1):
        # Clone directory is named after the last URL segment; a trailing
        # slash would otherwise yield an empty name.
        repo_name = repo['repo_url'].rstrip('/').split('/')[-1]
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo_name}")

        if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
            print(f"Skipping {repo['repo_url']}: clone failed")
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)
            continue

        repo_docs_path = save_repo_docs_path / repo["subfolder"]
        toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
        if not isinstance(toctree, list):
            # parse_toctree_yaml returns None on error; anything that is not a
            # list of blocks cannot be walked below.
            print(f"Skipping {repo['repo_url']}: no usable _toctree.yml in {repo_docs_path}")
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)
            continue

        # Create the top-level numbered directory for the repo, e.g., "1. Accelerate"
        repo_title = f"{repo_index}. {repo['title']}"
        repo_output_path = Path(args.docs_dir) / repo_title
        os.makedirs(repo_output_path, exist_ok=True)

        # The initial prefix for numbering is the repo index, e.g., "1."
        prefix = f"{repo_index}."
        for block_index, block in enumerate(toctree, 1):
            # Start the recursive saving with the initial prefix and the block's index
            save_section_to_disk(block, repo_output_path, repo_docs_path, prefix, block_index)

        # The raw clone is no longer needed once its pages are copied.
        shutil.rmtree(save_repo_docs_path)

    # The directory may not exist (e.g. empty config), so do not fail on cleanup.
    shutil.rmtree(args.repos_dir, ignore_errors=True)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--docs_dir", type=str, default="docs",
                        help="Output directory for the numbered documentation tree")
    parser.add_argument("--repos_dir", type=str, default="repos",
                        help="Scratch directory for the cloned repositories (removed afterwards)")
    args = parser.parse_args()

    # Read the config as UTF-8 explicitly so non-ASCII titles do not depend on
    # the platform's default encoding.
    with open("repos_config.json", "r", encoding="utf-8") as f:
        repos = json.load(f)

    # Rebuild the output from scratch so stale files from a previous run do not linger.
    if os.path.exists(args.docs_dir):
        shutil.rmtree(args.docs_dir)
    make_docs(repos, args)