File size: 4,734 Bytes
7dc78b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f126864
7dc78b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92dd823
7dc78b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""

import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml


def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Load a YAML toctree file and return its parsed content.

    Any failure while opening or parsing the file is printed to stdout and
    reported to the caller as ``None`` instead of being raised.
    """
    parsed = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as stream:
            parsed = yaml.safe_load(stream)
    except Exception as err:
        print(f"Error parsing YAML toctree {file_path}: {err}")
    return parsed
    

def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a command and report whether it succeeded.

    Args:
        cmd: Program and arguments, handed to ``subprocess.run`` as a list
            (no shell is involved).
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Returns:
        ``True`` if the command exited with status 0. ``False`` if it exited
        with a non-zero status or could not be started at all; in both cases
        the error is printed.
    """
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # Raised before the process starts, e.g. the executable is not
        # installed or ``cwd`` does not exist. There is no captured output.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
    
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Sparse-clone ``repo_url`` into ``target_dir``, checking out only ``dir_to_clone``.

    Returns ``True`` when ``dir_to_clone`` is already present under
    ``target_dir`` (nothing is done in that case) or when every git step
    succeeds, and ``False`` as soon as one git step fails.
    """
    already_present = os.path.exists(Path(target_dir) / Path(dir_to_clone))
    if already_present:
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Non-cone sparse-checkout patterns, applied in order:
    #   '/*'                  include everything at the root level
    #   '!/*'                 then exclude it again (overrides the previous)
    #   '/<dir_to_clone>/'    include the requested directory
    #   '/<dir_to_clone>/**'  and everything underneath it
    wanted_dir = [f'/{dir_to_clone}/', f'/{dir_to_clone}/**']
    patterns = ['/*', '!/*', *wanted_dir]

    # (command, working directory) pairs; the clone itself runs in the
    # current directory, the rest inside the fresh clone.
    git_steps = (
        # Fetch the repository without materializing any files yet.
        (["git", "clone", "--no-checkout", repo_url, target_dir], None),
        # Enable sparse checkout in pattern (non-cone) mode.
        (["git", "sparse-checkout", "init", "--no-cone"], target_dir),
        # Restrict the working tree to the requested directory.
        (["git", "sparse-checkout", "set", "--no-cone", *patterns], target_dir),
    )
    for argv, workdir in git_steps:
        if not run_command(argv, cwd=workdir):
            return False

    # Materialize the files: try 'main' first, then fall back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False


def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path) -> None:
    """Copy one toctree entry, and everything nested under it, into ``file_path``.

    An entry with a ``"sections"`` key becomes a directory named after its
    title, and each subsection is processed recursively inside it. Any other
    entry is treated as a leaf: its ``"local"`` value names a source document
    under ``raw_docs_path`` (``.md`` is tried first, then ``.mdx``), which is
    copied into ``file_path`` as ``<title><suffix>``.

    Leaves that cannot be copied are reported on stdout and skipped, so a
    single bad entry does not abort the whole export.

    Args:
        section: Toctree entry with a ``"title"`` and either ``"sections"``
            or ``"local"``.
        file_path: Destination directory for this entry.
        raw_docs_path: Directory that ``"local"`` values are relative to.

    Raises:
        KeyError: If ``section`` has no ``"title"`` key.
    """
    title = section["title"]

    if "sections" in section:
        section_dir = file_path / title
        os.makedirs(section_dir, exist_ok=True)
        for subsection in section["sections"]:
            save_section_to_disk(subsection, section_dir, raw_docs_path)
        return

    local = section.get("local")
    if local is None:
        print(f"Skipping '{title}': toctree entry has no 'local' document")
        return

    try:
        source = raw_docs_path / f"{local}.md"
        if not source.exists():
            source = raw_docs_path / f"{local}.mdx"
        if not source.exists():
            print(f"Skipping '{title}': File {source} does not exist")
            return

        shutil.copy(source, file_path / f"{title}{source.suffix}")
    except (OSError, ValueError) as e:
        # TODO: Not many cases, but handle symlinks and other edge cases
        # (e.g. titles that are not valid file names) instead of skipping.
        print(f"Skipping '{title}': could not copy '{local}': {e}")


def make_docs(repos: List[Dict], args: argparse.Namespace) -> None:
    """Clone each configured repository and export its docs by toctree.

    For every repository the docs subfolder is sparse-cloned under
    ``args.repos_dir``, its ``_toctree.yml`` is parsed, and each top-level
    toctree block is written under ``args.docs_dir/<title>``. The clone is
    removed again once the repository has been processed, and
    ``args.repos_dir`` itself is removed at the end.

    A repository that cannot be cloned, or whose toctree is missing, empty
    or unparsable, is reported on stdout and skipped.

    Args:
        repos: Repository descriptions; each entry provides ``"repo_url"``,
            ``"subfolder"`` and ``"title"``.
        args: Parsed command-line options providing ``repos_dir`` and
            ``docs_dir``.
    """
    for repo in tqdm(repos, desc="Consolidating 🤗 Documentation"):
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}")

        try:
            if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
                print(f"Skipping {repo['repo_url']}: clone failed")
                continue

            repo_docs_path = save_repo_docs_path / repo["subfolder"]
            toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
            if not toctree:
                # parse_toctree_yaml returns None on error; iterating it
                # would raise TypeError.
                print(f"Skipping {repo['repo_url']}: no usable _toctree.yml")
                continue

            save_doc_path = Path(f"{args.docs_dir}/{repo['title']}")
            os.makedirs(save_doc_path, exist_ok=True)

            for block in toctree:
                save_section_to_disk(block, save_doc_path, repo_docs_path)
        finally:
            # Always drop the clone, including a partial one left behind by
            # a failed step; it may not exist at all if the clone never started.
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # repos_dir is never created when `repos` is empty or every clone failed.
    shutil.rmtree(args.repos_dir, ignore_errors=True)


if __name__ == "__main__":
    # Command-line entry point: read the repository list from
    # repos_config.json in the current directory and export the docs.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--docs_dir",
        type=str,
        default="docs",
        help="Directory the organized documentation is written to.",
    )
    parser.add_argument(
        "--repos_dir",
        type=str,
        default="repos",
        help="Scratch directory for the cloned repositories; removed when done.",
    )
    args = parser.parse_args()

    # Read the config as UTF-8 explicitly instead of relying on the
    # platform's default encoding, matching how the toctree files are read.
    with open("repos_config.json", "r", encoding="utf-8") as f:
        repos = json.load(f)

    # NOTE: existing content of docs_dir is not cleared first; the call
    # below was left disabled.
    # shutil.rmtree(args.docs_dir)
    make_docs(repos, args)