"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""
import json | |
import os | |
import re | |
import shutil | |
import subprocess | |
import sys | |
import argparse | |
from tqdm import tqdm | |
from pathlib import Path | |
from typing import Dict, List, Optional, Tuple | |
import yaml | |
def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Load a YAML toctree file and return its parsed content.

    Any error raised while opening or parsing the file is reported on stdout
    and converted into a ``None`` result instead of propagating.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = yaml.safe_load(handle)
    except Exception as err:
        print(f"Error parsing YAML toctree {file_path}: {err}")
        return None
    return parsed
def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run an external command and report whether it succeeded.

    The command is executed without a shell; its output is captured and only
    printed when the command fails.

    Args:
        cmd: Program name followed by its arguments.
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Returns:
        ``True`` if the command exited with status 0. ``False`` if it exited
        with a non-zero status or could not be started at all.
    """
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # Raised before the process starts, e.g. the executable is not
        # installed or ``cwd`` does not exist. Report it like any other
        # failure so callers keep getting a plain success flag.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Clone a repository into ``target_dir``, checking out only one directory.

    Args:
        repo_url: URL of the repository to clone.
        dir_to_clone: Repository-relative directory to check out.
        target_dir: Local directory that receives the clone.

    Returns:
        ``True`` if ``target_dir/dir_to_clone`` already exists or the sparse
        checkout succeeded; ``False`` as soon as any git step fails.
    """
    checkout_dir = Path(target_dir) / dir_to_clone
    if checkout_dir.exists():
        # Report the path that was actually tested, not just ``target_dir``.
        print(f"Directory {checkout_dir} already exists, skipping clone")
        return True

    # Clone without checking out any files
    if not run_command(["git", "clone", "--no-checkout", repo_url, target_dir]):
        return False

    # Initialize sparse checkout without cone mode
    if not run_command(["git", "sparse-checkout", "init", "--no-cone"], cwd=target_dir):
        return False

    # Sparse checkout patterns that select only the requested directory:
    #   '/*' followed by '!/*'   - leaves nothing selected at the repository root
    #   f'/{dir_to_clone}/'      - include the specific directory
    #   f'/{dir_to_clone}/**'    - include everything under that directory
    sparse_patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']
    if not run_command(["git", "sparse-checkout", "set", "--no-cone"] + sparse_patterns, cwd=target_dir):
        return False

    # Check out the files based on the sparse checkout configuration,
    # trying 'main' first and falling back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False
def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path):
    """Copy one toctree entry, and recursively its children, under ``file_path``.

    An entry with a ``"sections"`` key becomes a directory named after its
    title, and every child entry is processed inside that directory. Any other
    entry is treated as a page: ``<raw_docs_path>/<local>.md`` (or ``.mdx`` when
    the ``.md`` file is absent) is copied to ``<file_path>/<title><suffix>``.

    Pages that cannot be copied are reported on stdout and skipped, so one bad
    entry does not stop the rest of the tree from being written.

    Args:
        section: Toctree entry; ``"title"`` is required, ``"sections"`` and
            ``"local"`` are read when present.
        file_path: Directory the entry is written into.
        raw_docs_path: Directory that ``"local"`` paths are relative to.

    Raises:
        KeyError: If the entry has no ``"title"``.
    """
    title = section["title"]

    if "sections" in section:
        section_dir = file_path / title
        os.makedirs(section_dir, exist_ok=True)
        for subsection in section["sections"]:
            save_section_to_disk(subsection, section_dir, raw_docs_path)
        return

    local = section.get("local")
    if local is None:
        print(f"Skipping '{title}': toctree entry has no 'local' path")
        return

    # Prefer the .md source and fall back to .mdx.
    source = raw_docs_path / f"{local}.md"
    if not source.exists():
        source = raw_docs_path / f"{local}.mdx"
    if not source.exists():
        print(f"Skipping '{title}': no .md or .mdx file for '{local}' in {raw_docs_path}")
        return

    destination = file_path / f"{title}{source.suffix}"
    try:
        # A title containing a path separator points into a sub-directory;
        # create it so the copy does not fail on a missing parent.
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(source, destination)
    except (OSError, ValueError) as e:
        print(f"Skipping '{title}': could not copy {source} to {destination}: {e}")
def make_docs(repos: List[Dict], args: argparse.Namespace):
    """Clone every configured repository and lay out its docs by toctree.

    For each entry the repository is sparse-cloned under ``args.repos_dir``,
    its ``_toctree.yml`` is parsed, and the referenced pages are copied into
    ``<args.docs_dir>/<title>``. A repository that cannot be cloned or has no
    usable toctree is reported on stdout and skipped. Each clone is removed
    once processed, and ``args.repos_dir`` itself is removed at the end.

    Args:
        repos: Repository entries; the keys ``"repo_url"``, ``"subfolder"``
            and ``"title"`` are read from each one.
        args: Parsed options; the attributes ``repos_dir`` and ``docs_dir``
            are read.
    """
    repos_root = Path(args.repos_dir)
    docs_root = Path(args.docs_dir)

    for repo in tqdm(repos, desc="Consolidating 🤗 Documentation"):
        # Tolerate a trailing slash so the clone directory name is never empty.
        repo_name = repo["repo_url"].rstrip("/").split("/")[-1]
        save_repo_docs_path = repos_root / repo_name
        try:
            if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
                print(f"Skipping {repo['title']}: could not clone {repo['repo_url']}")
                continue

            repo_docs_path = save_repo_docs_path / repo["subfolder"]
            toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
            if not toctree:
                # parse_toctree_yaml returns None on failure; iterating it
                # would raise TypeError and abort the whole run.
                print(f"Skipping {repo['title']}: no usable _toctree.yml in {repo_docs_path}")
                continue

            save_doc_path = docs_root / repo["title"]
            os.makedirs(save_doc_path, exist_ok=True)
            for block in toctree:
                save_section_to_disk(block, save_doc_path, repo_docs_path)
        finally:
            # Always drop the clone, even when this repository was skipped or
            # an unexpected error is propagating.
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # The directory may not exist (e.g. empty ``repos``), so do not fail on it.
    shutil.rmtree(repos_root, ignore_errors=True)
if __name__ == "__main__":
    # Command-line entry point: read the repository list and build the docs tree.
    parser = argparse.ArgumentParser(
        description="Clone documentation repositories and organize their pages by toctree."
    )
    parser.add_argument("--docs_dir", type=str, default="docs",
                        help="Directory that receives the organized documentation.")
    parser.add_argument("--repos_dir", type=str, default="repos",
                        help="Scratch directory for the cloned repositories (removed when done).")
    parser.add_argument("--config", type=str, default="repos_config.json",
                        help="JSON file listing the repositories to process.")
    args = parser.parse_args()

    # Read the config as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(args.config, "r", encoding="utf-8") as f:
        repos = json.load(f)

    # NOTE: docs_dir is not cleared beforehand; output is written into the
    # existing directory.
    make_docs(repos, args)