File size: 5,413 Bytes
7dc78b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0955e72
7dc78b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0955e72
 
 
 
 
 
7dc78b3
 
0955e72
 
 
 
 
 
 
 
7dc78b3
 
0955e72
7dc78b3
 
 
 
 
 
 
0955e72
 
 
7dc78b3
 
f126864
7dc78b3
 
 
 
 
0955e72
7dc78b3
 
 
 
 
 
0955e72
 
 
 
7dc78b3
0955e72
 
 
 
 
7dc78b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0955e72
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""

import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml


def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Load a YAML toctree file and return its parsed content.

    The file is read as UTF-8 and handed to ``yaml.safe_load``. If opening or
    parsing raises, the error is printed and ``None`` is returned instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as stream:
            toctree = yaml.safe_load(stream)
    except Exception as err:
        print(f"Error parsing YAML toctree {file_path}: {err}")
        return None
    return toctree
    

def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a command and report whether it succeeded.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).
        cwd: Directory to run the command in; ``None`` uses the current one.

    Returns:
        ``True`` if the command exited with status 0. ``False`` if it exited
        with a non-zero status, or if it could not be started at all (for
        example the executable or ``cwd`` does not exist). In both failure
        cases the error is printed.
    """
    try:
        # Output is captured so it is only shown when the command fails.
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # The process never started, so there is no captured output to show.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
    
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Sparse-clone ``repo_url`` into ``target_dir``, keeping only ``dir_to_clone``.

    If ``target_dir/dir_to_clone`` already exists nothing is cloned and the
    call counts as a success. Otherwise the repository is cloned without a
    checkout, sparse checkout is configured for ``dir_to_clone``, and the
    ``main`` branch (or ``master`` as a fallback) is checked out.

    Returns ``True`` on success and ``False`` as soon as any git step fails.
    """
    already_there = os.path.exists(Path(target_dir) / Path(dir_to_clone))
    if already_there:
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Each entry is (git command, working directory); they must run in order.
    setup_steps = (
        # Clone without checking out any files
        (["git", "clone", "--no-checkout", repo_url, target_dir], None),
        # Initialize sparse checkout without cone mode
        (["git", "sparse-checkout", "init", "--no-cone"], target_dir),
        # Restrict the sparse checkout patterns to the requested directory
        (
            ["git", "sparse-checkout", "set", "--no-cone"]
            + ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**'],
            target_dir,
        ),
    )
    for git_cmd, workdir in setup_steps:
        if not run_command(git_cmd, cwd=workdir):
            return False

    # Materialize the files: try 'main' first, then fall back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False


def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path, prefix: str, index: int) -> None:
    """
    Recursively saves a documentation section to disk with hierarchical numbering.

    A section that has a "sections" key becomes a directory named
    "<prefix><index>. <title>" inside ``file_path``; its children are saved
    into that directory, numbered from 1 with the prefix "<prefix><index>.".

    Any other section is treated as a page: the source file "<local>.md"
    (or "<local>.mdx" when the ".md" file does not exist) under
    ``raw_docs_path`` is copied into ``file_path`` as
    "<prefix><index>. <title><suffix>".

    Pages that cannot be copied (no "local" entry, source file missing, or a
    copy error) are reported on stdout and skipped; they do not abort the run.

    Args:
        section: Toctree entry; must contain "title" and either "sections"
            or "local".
        file_path: Existing directory that receives the output.
        raw_docs_path: Directory that "local" entries are relative to.
        prefix: Numbering inherited from the parents, e.g. "1.2.".
        index: 1-based position of this section among its siblings.
    """
    current_number = f"{prefix}{index}"
    numbered_title = f"{current_number}. {section['title']}"

    if "sections" in section:
        # This is a directory
        new_dir_path = file_path / numbered_title
        os.makedirs(new_dir_path, exist_ok=True)

        # The new prefix for children adds the current number, e.g., "1.1."
        new_prefix = f"{current_number}."
        for i, subsection in enumerate(section["sections"], 1):
            save_section_to_disk(subsection, new_dir_path, raw_docs_path, new_prefix, i)
        return

    # This is a file
    local = section.get("local")
    if local is None:
        print(f"Skipping '{numbered_title}': section has no 'local' entry")
        return

    # Prefer the .md source and fall back to .mdx.
    candidates = [raw_docs_path / f"{local}{suffix}" for suffix in (".md", ".mdx")]
    local_path = next((path for path in candidates if path.exists()), None)
    if local_path is None:
        print(f"Skipping '{numbered_title}': File {candidates[-1]} does not exist")
        return

    # Create the numbered filename
    new_filename = f"{numbered_title}{local_path.suffix}"
    try:
        shutil.copy(local_path, file_path / new_filename)
    except OSError as e:
        # Best effort: one unreadable or uncopyable page should not stop the rest.
        print(f"Skipping '{numbered_title}': could not copy {local_path}: {e}")


def make_docs(repos: List[Dict], args: argparse.Namespace) -> None:
    """Clone every configured repository and export its docs as a numbered tree.

    For each repository (numbered from 1 in list order) the docs subfolder is
    cloned under ``args.repos_dir``, its ``_toctree.yml`` is parsed, and every
    top-level toctree block is written below
    ``args.docs_dir/"<n>. <title>"`` via ``save_section_to_disk``.

    A repository that cannot be cloned, or whose toctree cannot be parsed
    into a list, is reported and skipped; it keeps its number so the
    numbering of the other repositories does not shift. The clone of each
    repository is removed once it has been processed, and ``args.repos_dir``
    is removed at the end.

    Args:
        repos: Repository descriptions; each needs the keys "repo_url",
            "subfolder" and "title".
        args: Parsed command-line options providing ``repos_dir`` and
            ``docs_dir``.
    """
    for repo_index, repo in enumerate(tqdm(repos, desc="Consolidating 🤗 Documentation"), 1):
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}")
        try:
            if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
                print(f"Skipping {repo['title']}: could not clone {repo['repo_url']}")
                continue

            repo_docs_path = save_repo_docs_path / repo["subfolder"]
            toctree = parse_toctree_yaml(str(repo_docs_path / "_toctree.yml"))
            if not isinstance(toctree, list):
                # None (unreadable/empty file) or an unexpected YAML shape.
                print(f"Skipping {repo['title']}: no usable toctree in {repo_docs_path}")
                continue

            # Create the top-level numbered directory for the repo, e.g., "1. Accelerate"
            repo_title = f"{repo_index}. {repo['title']}"
            repo_output_path = Path(args.docs_dir) / repo_title
            os.makedirs(repo_output_path, exist_ok=True)

            # The initial prefix for numbering is the repo index, e.g., "1."
            prefix = f"{repo_index}."
            for block_index, block in enumerate(toctree, 1):
                # Start the recursive saving with the initial prefix and the block's index
                save_section_to_disk(block, repo_output_path, repo_docs_path, prefix, block_index)
        finally:
            # Always drop the clone, including when the repo was skipped or
            # processing failed; the directory may not exist if cloning failed.
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # repos_dir is never created when there was nothing to clone.
    shutil.rmtree(args.repos_dir, ignore_errors=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Clone documentation repositories and organize them by their toctree structure."
    )
    parser.add_argument("--docs_dir", type=str, default="docs",
                        help="Output directory for the organized docs (deleted first if it exists).")
    parser.add_argument("--repos_dir", type=str, default="repos",
                        help="Scratch directory for the clones (deleted when done).")
    parser.add_argument("--config", type=str, default="repos_config.json",
                        help="JSON file listing the repositories to process.")
    args = parser.parse_args()

    # Read the config as UTF-8 regardless of the platform's default encoding.
    with open(args.config, "r", encoding="utf-8") as f:
        repos = json.load(f)

    # Start from a clean output tree so stale files from earlier runs don't linger.
    if os.path.exists(args.docs_dir):
        shutil.rmtree(args.docs_dir)

    make_docs(repos, args)