"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""
import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml


def parse_toctree_yaml(file_path: Path) -> Optional[List[Dict]]:
    """Parse a YAML-based toctree file and return its list of sections."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error parsing YAML toctree {file_path}: {e}")
        return None
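

# Illustrative sketch only (not copied from any particular repo) of the
# `_toctree.yml` shape this script expects: a top-level list of blocks, where
# each entry has a `title` plus either a `local` page name or nested
# `sections`, e.g.:
#
#   - title: Get started
#     sections:
#       - title: Installation
#         local: installation
#       - title: Quicktour
#         local: quicktour
#
# yaml.safe_load() returns this as a list of nested dicts, which is what
# make_docs() and save_section_to_disk() below walk.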


def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a shell command and return success status."""
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False


def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Clone a repository and sparse-checkout only `dir_to_clone` into `target_dir`."""
    if os.path.exists(Path(target_dir) / Path(dir_to_clone)):
        print(f"Directory {Path(target_dir) / dir_to_clone} already exists, skipping clone")
        return True
    # Clone without checking out any files
    out_clone = run_command(["git", "clone", "--no-checkout", repo_url, target_dir])
    if not out_clone:
        return False
    # Initialize sparse checkout without cone mode
    sparse_init = run_command(["git", "sparse-checkout", "init", "--no-cone"], cwd=target_dir)
    if not sparse_init:
        return False
    # Set sparse checkout patterns to only include the specified directory
    sparse_patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']
    sparse_set = run_command(["git", "sparse-checkout", "set", "--no-cone"] + sparse_patterns, cwd=target_dir)
    if not sparse_set:
        return False
    # Check out the files based on sparse checkout configuration
    checkout = run_command(["git", "checkout", "main"], cwd=target_dir)
    if not checkout:
        # Try 'master' if 'main' fails
        checkout = run_command(["git", "checkout", "master"], cwd=target_dir)
        if not checkout:
            print(f"Failed to checkout main or master branch in {target_dir}")
            return False
    return True
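

# As a concrete illustration (the subfolder name is hypothetical), calling
# clone_repo(repo_url, "docs/source/en", target_dir) writes these
# sparse-checkout patterns:
#   /*
#   !/*
#   /docs/source/en/
#   /docs/source/en/**
# i.e., drop everything at the repository root and re-include only the
# requested docs subfolder, so the checkout above materializes just that tree.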


def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path, prefix: str, index: int):
    """
    Recursively save a documentation section to disk with hierarchical numbering.
    """
    current_number = f"{prefix}{index}"
    numbered_title = f"{current_number}. {section['title']}"
    if "sections" in section:
        # This is a directory
        new_dir_path = file_path / numbered_title
        os.makedirs(new_dir_path, exist_ok=True)
        # The new prefix for children adds the current number, e.g., "1.1."
        new_prefix = f"{current_number}."
        for i, subsection in enumerate(section["sections"], 1):
            save_section_to_disk(subsection, new_dir_path, raw_docs_path, new_prefix, i)
    else:
        # This is a file
        try:
            local_path = raw_docs_path / f"{section['local']}.md"
            if not local_path.exists():
                local_path = raw_docs_path / f"{section['local']}.mdx"
            assert local_path.exists(), f"File {local_path} does not exist"
            # Create the numbered filename
            new_filename = f"{numbered_title}{local_path.suffix}"
            shutil.copy(local_path, file_path / new_filename)
        except Exception:
            # TODO: Not many cases, but handle symlinks, missing files, and other edge cases
            pass
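

# Hypothetical example of the layout this recursion produces: a repo titled
# "Accelerate" (repo_index 1) whose first toctree block is "Get started" with
# an "Installation" page ends up as
#
#   docs/
#   └── 1. Accelerate/
#       └── 1.1. Get started/
#           └── 1.1.1. Installation.md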


def make_docs(repos: List[Dict], args: argparse.Namespace):
    for repo_index, repo in enumerate(tqdm(repos, desc="Consolidating 🤗 Documentation"), 1):
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}")
        clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path))
        repo_docs_path = save_repo_docs_path / repo["subfolder"]
        toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
        if toctree is None:
            print(f"Skipping {repo['title']}: could not parse its _toctree.yml")
            continue
        # Create the top-level numbered directory for the repo, e.g., "1. Accelerate"
        repo_title = f"{repo_index}. {repo['title']}"
        repo_output_path = Path(args.docs_dir) / repo_title
        os.makedirs(repo_output_path, exist_ok=True)
        # The initial prefix for numbering is the repo index, e.g., "1."
        prefix = f"{repo_index}."
        for block_index, block in enumerate(toctree, 1):
            # Start the recursive saving with the initial prefix and the block's index
            save_section_to_disk(block, repo_output_path, repo_docs_path, prefix, block_index)
        shutil.rmtree(save_repo_docs_path)
    shutil.rmtree(args.repos_dir)
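

# Illustrative sketch of the repos_config.json read in the __main__ block
# below; the URL and subfolder here are placeholders, but each entry must
# provide the repo_url, subfolder, and title keys that make_docs() uses:
#
# [
#   {
#     "repo_url": "https://github.com/huggingface/accelerate",
#     "subfolder": "docs/source/en",
#     "title": "Accelerate"
#   }
# ]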


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--docs_dir", type=str, default="docs")
    parser.add_argument("--repos_dir", type=str, default="repos")
    args = parser.parse_args()
    with open("repos_config.json", "r") as f:
        repos = json.load(f)
    if os.path.exists(args.docs_dir):
        shutil.rmtree(args.docs_dir)
    make_docs(repos, args)