Spaces:
Running
Running
File size: 4,734 Bytes
7dc78b3 f126864 7dc78b3 92dd823 7dc78b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""
import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml
def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Load a YAML table-of-contents file.

    Returns whatever ``yaml.safe_load`` produces for the file. If the file
    cannot be opened or parsed, the error is printed and ``None`` is returned.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            toctree = yaml.safe_load(handle)
    except Exception as err:
        print(f"Error parsing YAML toctree {file_path}: {err}")
        return None
    else:
        return toctree
def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a command (without a shell) and report whether it succeeded.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list.
        cwd: Working directory for the command; ``None`` keeps the current one.

    Returns:
        ``True`` if the command exited with status 0. ``False`` if it exited
        with a non-zero status or could not be started at all (for example
        the executable or ``cwd`` does not exist). Failures are printed
        rather than raised.
    """
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # The process never started (missing executable, bad cwd, permissions),
        # so there is no captured output to show.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Sparse-clone ``repo_url`` into ``target_dir``, keeping only ``dir_to_clone``.

    Returns ``True`` when ``dir_to_clone`` is already present under
    ``target_dir`` or when the clone and checkout succeed, ``False`` as soon
    as any git step fails.
    """
    already_there = os.path.exists(Path(target_dir, dir_to_clone))
    if already_there:
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Sparse-checkout patterns (non-cone mode), applied in order:
    #   '/*'                   - include everything at the root level
    #   '!/*'                  - then exclude everything at the root level
    #   f'/{dir_to_clone}/'    - re-include the requested directory
    #   f'/{dir_to_clone}/**'  - and everything underneath it
    patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']

    # Each step is (command, working directory); stop at the first failure.
    setup_steps = [
        # Clone without checking out any files.
        (["git", "clone", "--no-checkout", repo_url, target_dir], None),
        # Enable sparse checkout without cone mode.
        (["git", "sparse-checkout", "init", "--no-cone"], target_dir),
        # Restrict the working tree to the requested directory.
        (["git", "sparse-checkout", "set", "--no-cone", *patterns], target_dir),
    ]
    for command, workdir in setup_steps:
        if not run_command(command, cwd=workdir):
            return False

    # Materialize the files: try 'main' first, then fall back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False
def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path):
    """Copy one toctree entry (and any nested entries) into ``file_path``.

    An entry with a ``"sections"`` key becomes a directory named after its
    title, and each subsection is processed recursively inside it. Any other
    entry is treated as a single document: ``<local>.md`` (or, failing that,
    ``<local>.mdx``) is looked up under ``raw_docs_path`` and copied to
    ``file_path`` as ``<title><suffix>``.

    Args:
        section: Toctree entry; must contain ``"title"`` and either
            ``"sections"`` or ``"local"``.
        file_path: Existing directory that receives the output.
        raw_docs_path: Root directory of the cloned documentation sources.

    Raises:
        KeyError: If the entry has no ``"title"``.

    Entries that cannot be copied are skipped with a printed message instead
    of aborting the whole run.
    """
    title = section["title"]

    if "sections" in section:
        section_dir = file_path / title
        os.makedirs(section_dir, exist_ok=True)
        for subsection in section["sections"]:
            save_section_to_disk(subsection, section_dir, raw_docs_path)
        return

    local = section.get("local")
    if local is None:
        print(f"Skipping '{title}': toctree entry has no 'local' path")
        return

    # Prefer the .md source and fall back to .mdx.
    source_path = raw_docs_path / f"{local}.md"
    if not source_path.exists():
        source_path = raw_docs_path / f"{local}.mdx"
    if not source_path.exists():
        print(f"Skipping '{title}': no .md or .mdx file found for {raw_docs_path / local}")
        return

    try:
        shutil.copy(source_path, file_path / f"{title}{source_path.suffix}")
    except OSError as e:
        # TODO: Not many cases, but handle symlinks and other edge cases
        print(f"Skipping '{title}': could not copy {source_path}: {e}")
def make_docs(repos: List[Dict], args: argparse.Namespace):
    """Clone each configured repository and lay out its docs by toctree.

    For every repo entry the documentation subfolder is sparse-cloned under
    ``args.repos_dir``, its ``_toctree.yml`` is parsed, and each top-level
    toctree block is written under ``args.docs_dir/<title>``. The clone is
    removed once the repo has been processed, and ``args.repos_dir`` is
    removed at the end.

    Args:
        repos: Repo entries, each with ``"repo_url"``, ``"subfolder"`` and
            ``"title"`` keys.
        args: Parsed CLI arguments providing ``repos_dir`` and ``docs_dir``.

    A repo whose clone fails, or whose toctree is missing or not a list, is
    skipped with a printed message; the remaining repos are still processed.
    """
    for repo in tqdm(repos, desc="Consolidating 🤗 Documentation"):
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}")
        try:
            cloned = clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path))
            if not cloned:
                print(f"Skipping {repo['title']}: could not clone {repo['repo_url']}")
                continue

            repo_docs_path = save_repo_docs_path / repo["subfolder"]
            toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
            if not isinstance(toctree, list):
                # parse_toctree_yaml returns None on failure; anything that is
                # not a list of blocks cannot be iterated as sections.
                print(f"Skipping {repo['title']}: missing or malformed _toctree.yml")
                continue

            save_doc_path = Path(f"{args.docs_dir}/{repo['title']}")
            os.makedirs(save_doc_path, exist_ok=True)
            for block in toctree:
                save_section_to_disk(block, save_doc_path, repo_docs_path)
        finally:
            # Always drop the clone, including partial ones left by a failed
            # git step, so a later run does not trip over a stale directory.
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # repos_dir may never have been created (e.g. empty repo list).
    shutil.rmtree(args.repos_dir, ignore_errors=True)
if __name__ == "__main__":
    # Command-line entry point: read the repo list and build the docs tree.
    parser = argparse.ArgumentParser(
        description="Clone documentation repositories and organize them by toctree."
    )
    parser.add_argument("--docs_dir", type=str, default="docs",
                        help="Directory that receives the organized documentation.")
    parser.add_argument("--repos_dir", type=str, default="repos",
                        help="Scratch directory for the temporary clones (removed afterwards).")
    parser.add_argument("--config", type=str, default="repos_config.json",
                        help="JSON file listing the repositories to process.")
    args = parser.parse_args()

    with open(args.config, "r", encoding="utf-8") as f:
        repos = json.load(f)

    # NOTE: docs_dir is not cleared first; files from earlier runs are kept.
    make_docs(repos, args)
|