Spaces · Running
Abdullah Meda committed · Commit 7dc78b3 · Parent(s): 4c57891
initial commit
Browse files:
- .gitignore (+174, -0)
- app.py (+36, -0)
- make_docs.py (+131, -0)
- repo2txt.py (+177, -0)
- repos_config.json (+142, -0)
.gitignore
ADDED
@@ -0,0 +1,174 @@
# Initially taken from Github's Python gitignore file

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# tests and logs
tests/fixtures/cached_*_text.txt
logs/
lightning_logs/
lang_code_data/

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# vscode
.vs
.vscode

# Pycharm
.idea

# TF code
tensorflow_code

# Models
proc_data

# examples
runs
/runs_old
/wandb
/examples/runs
/examples/**/*.args
/examples/rag/sweep

# data
/data
serialization_dir

# emacs
*.*~
debug.env

# vim
.*.swp

#ctags
tags

# pre-commit
.pre-commit*

# .lock
*.lock

# DS_Store (MacOS)
.DS_Store

# ruff
.ruff_cache

# local
*.ipynb
docs/
repos/
app.py
ADDED
@@ -0,0 +1,36 @@
import gradio as gr
import os
import json
import subprocess
import tempfile
import shutil
from pathlib import Path

def list_huggingface_resources_names() -> list[str]:
    """List all the names of the libraries, services, and other resources available within the HuggingFace ecosystem.

    Returns:
        A list of libraries, services, and other resources available within the HuggingFace ecosystem
    """
    with open('repos_config.json', 'r') as f:
        repos = json.load(f)

    return [repo['title'] for repo in repos]


list_resources_demo = gr.Interface(
    fn=list_huggingface_resources_names,
    inputs=[],
    outputs="json",
    title="HuggingFace Ecosystem Explorer",
    description="Explore the names of the libraries, services, and other resources available within the HuggingFace ecosystem"
)

# Create tabbed interface
demo = gr.TabbedInterface(
    [list_resources_demo],
    ["List Resources"],
    title="HuggingFace Ecosystem Documentation Explorer",
)

demo.launch(mcp_server=True)
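Note: launching with mcp_server=True makes Gradio expose the interface's function as an MCP tool alongside the web UI (this requires a recent Gradio release with MCP support). As a quick sanity check, the tool's logic can be exercised without starting the server; a minimal sketch, assuming repos_config.json sits in the working directory:

import json

# Same logic as list_huggingface_resources_names(): read the config
# file and collect the display titles.
with open('repos_config.json', 'r') as f:
    repos = json.load(f)

print([repo['title'] for repo in repos][:3])
# e.g. ['HuggingFace Hub', 'HuggingFace Hub Python Library', 'Dataset Viewer']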
make_docs.py
ADDED
@@ -0,0 +1,131 @@
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""

import json
import os
import re
import shutil
import subprocess
import sys
import argparse
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml


def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Parse a YAML-based toctree file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error parsing YAML toctree {file_path}: {e}")
        return None


def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a shell command and return success status."""
    try:
        result = subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False

def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Clone a repository to the target directory."""
    if os.path.exists(Path(target_dir) / Path(dir_to_clone)):
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Clone without checking out any files
    out_clone = run_command(["git", "clone", "--no-checkout", repo_url, target_dir])
    if not out_clone: return False

    # Initialize sparse checkout without cone mode
    sparse_init = run_command(["git", "sparse-checkout", "init", "--no-cone"], cwd=target_dir)
    if not sparse_init: return False

    # Set sparse checkout patterns to only include the specified directory. Pattern explanation:
    #   '/*'                  - include all files at root level
    #   '!/*'                 - exclude all files at root level (overrides previous)
    #   f'/{dir_to_clone}/'   - include the specific directory
    #   f'/{dir_to_clone}/**' - include everything under that directory
    sparse_patterns = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']
    sparse_set = run_command(["git", "sparse-checkout", "set", "--no-cone"] + sparse_patterns, cwd=target_dir)
    if not sparse_set: return False

    # Check out the files based on sparse checkout configuration
    checkout = run_command(["git", "checkout", "main"], cwd=target_dir)
    if not checkout:
        # Try 'master' if 'main' fails
        checkout = run_command(["git", "checkout", "master"], cwd=target_dir)
        if not checkout:
            print(f"Failed to checkout main or master branch in {target_dir}")
            return False

    return True


def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path):

    title = section["title"]

    if "sections" in section:
        file_path = file_path / title
        os.makedirs(file_path, exist_ok=True)
        for subsection in section["sections"]:
            save_section_to_disk(subsection, file_path, raw_docs_path)

    else:
        try:
            local_path = raw_docs_path / f"{section['local']}.md"

            if not local_path.exists():
                local_path = raw_docs_path / f"{section['local']}.mdx"
                assert local_path.exists(), f"File {local_path} does not exist"

            shutil.copy(local_path, file_path / f"{title}{local_path.suffix}")

        except Exception as e:
            # TODO: Handle symlinks, missing files, and other edge cases
            pass


def make_docs(repos: Dict, args: Dict):

    for repo in tqdm(repos, desc="Consolidating 🤗 Documentation"):
        save_repo_docs_path = Path(f"{args.repos_dir}/{repo['repo_url'].split('/')[-1]}")
        clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path))

        repo_docs_path = save_repo_docs_path / repo["subfolder"]
        toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")

        # print(toctree)

        save_doc_path = Path(f"{args.docs_dir}/{repo['title']}")
        os.makedirs(save_doc_path, exist_ok=True)

        for block in toctree:
            save_section_to_disk(block, save_doc_path, repo_docs_path)

        shutil.rmtree(save_repo_docs_path)

    shutil.rmtree(args.repos_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--docs_dir", type=str, default="docs")
    parser.add_argument("--repos_dir", type=str, default="repos")
    args = parser.parse_args()

    with open("repos_config.json", "r") as f:
        repos = json.load(f)

    make_docs(repos, args)
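For reference, clone_repo amounts to running git clone --no-checkout, git sparse-checkout init --no-cone, git sparse-checkout set --no-cone with the four patterns above, and finally git checkout main (falling back to master), so only the requested docs subfolder is ever materialized on disk. A minimal sketch of using it standalone, with arguments taken from the Transformers entry of repos_config.json (assumes git is on PATH; the import is safe because the script guards its entry point):

# Hypothetical standalone use of clone_repo from make_docs.py.
from make_docs import clone_repo

ok = clone_repo(
    repo_url="https://github.com/huggingface/transformers",
    dir_to_clone="docs/source/en",
    target_dir="repos/transformers",
)
# On success, repos/transformers/docs/source/en contains the markdown
# sources plus the _toctree.yml that make_docs() parses next.
print("cloned" if ok else "clone failed")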
repo2txt.py
ADDED
@@ -0,0 +1,177 @@
"""
A fork of github.com/donoceidon/repo2txt/blob/main/src/repo2txt/repo2txt.py

This version only includes the functionality to document the structure of a repository containing .md and .mdx files.
"""

import os
import argparse

def parse_args():
    """
    Parse command-line arguments for the script.

    Returns:
        argparse.Namespace: An object containing the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Document the structure of a repository containing .md and .mdx files.',
        epilog='Example usage:\n python repo2txt.py -r /path/to/repo -o output.txt',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument('-r', '--repo_path', default=os.getcwd(),
                        help='Path to the directory to process. Defaults to the current directory.')
    parser.add_argument('-o', '--output_file', default='output.txt',
                        help='Name for the output text file. Defaults to "output.txt".')

    return parser.parse_args()


def should_ignore(item_path, output_file_path):
    """
    Determine if a given item should be ignored.
    Only includes .md and .mdx files, ignores hidden files and directories.

    Args:
        item_path (str): The path of the item (file or directory) to check.
        output_file_path (str): The path of the output file being written to.

    Returns:
        bool: True if the item should be ignored, False otherwise.
    """
    item_name = os.path.basename(item_path)

    # Ignore the output file itself
    if os.path.abspath(item_path) == os.path.abspath(output_file_path):
        return True

    # Ignore hidden files and directories
    if item_name.startswith('.'):
        return True

    # If it's a file, only include .md and .mdx files
    if os.path.isfile(item_path):
        file_ext = os.path.splitext(item_name)[1].lower()
        return file_ext not in ['.md', '.mdx']

    # Include directories (they will be traversed)
    return False


def write_tree(dir_path, output_file, output_file_path, prefix="", is_root=True):
    """
    Recursively write the directory tree to the output file.

    Args:
        dir_path (str): The path of the directory to document.
        output_file (file object): The file object to write to.
        output_file_path (str): The path of the output file being written to.
        prefix (str): Prefix string for line indentation and structure.
        is_root (bool): Flag to indicate if the current directory is the root.
    """
    if is_root:
        output_file.write("└── ./\n")
        # Add the actual directory name as a child of ./
        actual_dir_name = os.path.basename(dir_path)
        if actual_dir_name:
            output_file.write(f"    └── {actual_dir_name}\n")
            prefix = "        "
        else:
            prefix = "    "
        is_root = False

    try:
        items = os.listdir(dir_path)
    except PermissionError:
        return

    items.sort()

    # Filter out items that should be ignored
    filtered_items = []
    for item in items:
        item_path = os.path.join(dir_path, item)
        if not should_ignore(item_path, output_file_path):
            filtered_items.append(item)

    num_items = len(filtered_items)

    for index, item in enumerate(filtered_items):
        item_path = os.path.join(dir_path, item)
        is_last_item = (index == num_items - 1)
        new_prefix = "└── " if is_last_item else "├── "
        child_prefix = "    " if is_last_item else "│   "

        output_file.write(f"{prefix}{new_prefix}{item}\n")

        if os.path.isdir(item_path):
            next_prefix = prefix + child_prefix
            write_tree(item_path, output_file, output_file_path, next_prefix, is_root=False)


def write_file_content(file_path, output_file):
    """
    Write the contents of a given file to the output file.

    Args:
        file_path (str): Path of the file to read.
        output_file (file object): The file object to write the contents to.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                output_file.write(line)
    except Exception as e:
        output_file.write(f"Error reading file: {e}\n")


def write_file_contents_in_order(dir_path, output_file, output_file_path, repo_path):
    """
    Recursively document the contents of .md and .mdx files in directory order.

    Args:
        dir_path (str): The path of the directory to start documenting from.
        output_file (file object): The file object to write the contents to.
        output_file_path (str): The path of the output file being written to.
        repo_path (str): The root path of the repository for relative path calculation.
    """
    try:
        items = os.listdir(dir_path)
    except PermissionError:
        return

    items = sorted(item for item in items if not should_ignore(os.path.join(dir_path, item), output_file_path))

    for item in items:
        item_path = os.path.join(dir_path, item)
        relative_path = os.path.relpath(item_path, start=repo_path)

        if os.path.isdir(item_path):
            write_file_contents_in_order(item_path, output_file, output_file_path, repo_path)
        elif os.path.isfile(item_path):
            output_file.write(f"\n\n---\nFile: /{relative_path}\n---\n\n")
            write_file_content(item_path, output_file)


def main():
    """
    Main function to execute the script logic.
    """
    args = parse_args()

    # Check if the provided directory path is valid
    if not os.path.isdir(args.repo_path):
        print(f"Error: The specified directory does not exist: {args.repo_path}")
        return

    with open(args.output_file, 'w', encoding='utf-8') as output_file:
        output_file.write("Directory Structure:\n\n")
        write_tree(args.repo_path, output_file, args.output_file, "", is_root=True)
        write_file_contents_in_order(args.repo_path, output_file, args.output_file, args.repo_path)

    print(f"Documentation generated successfully: {args.output_file}")


if __name__ == "__main__":
    main()
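This script is run per consolidated docs folder produced by make_docs.py, e.g. python repo2txt.py -r docs/Transformers -o Transformers.txt. The output file starts with the tree written by write_tree() and then concatenates every .md/.mdx document behind a File: separator. An illustrative sketch of the result (the file names here are hypothetical; the layout is what the code emits):

Directory Structure:

└── ./
    └── Transformers
        ├── Installation.md
        └── Quickstart.md


---
File: /Installation.md
---

...contents of Installation.md...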
repos_config.json
ADDED
@@ -0,0 +1,142 @@
[
    {
        "repo_url": "https://github.com/huggingface/hub-docs",
        "subfolder": "docs/hub",
        "title": "HuggingFace Hub"
    },
    {
        "repo_url": "https://github.com/huggingface/huggingface_hub",
        "subfolder": "docs/source/en",
        "title": "HuggingFace Hub Python Library"
    },
    {
        "repo_url": "https://github.com/huggingface/dataset-viewer",
        "subfolder": "docs/source",
        "title": "Dataset Viewer"
    },
    {
        "repo_url": "https://github.com/huggingface/hub-docs",
        "subfolder": "docs/inference-providers",
        "title": "Inference Providers"
    },
    {
        "repo_url": "https://github.com/huggingface/text-generation-inference",
        "subfolder": "docs/source",
        "title": "Text Generation Inference"
    },
    {
        "repo_url": "https://github.com/huggingface/hf-endpoints-documentation",
        "subfolder": "docs/source",
        "title": "Inference Endpoints"
    },
    {
        "repo_url": "https://github.com/huggingface/text-embeddings-inference",
        "subfolder": "docs/source/en",
        "title": "Text Embeddings Inference"
    },
    {
        "repo_url": "https://github.com/huggingface/hub-docs",
        "subfolder": "docs/sagemaker/source",
        "title": "Amazon SageMaker"
    },
    {
        "repo_url": "https://github.com/huggingface/transformers",
        "subfolder": "docs/source/en",
        "title": "Transformers"
    },
    {
        "repo_url": "https://github.com/huggingface/transformers.js",
        "subfolder": "docs/source",
        "title": "Transformers.js"
    },
    {
        "repo_url": "https://github.com/huggingface/pytorch-image-models",
        "subfolder": "hfdocs/source",
        "title": "timm"
    },
    {
        "repo_url": "https://github.com/huggingface/diffusers",
        "subfolder": "docs/source/en",
        "title": "Diffusers"
    },
    {
        "repo_url": "https://github.com/huggingface/tokenizers",
        "subfolder": "docs/source-doc-builder",
        "title": "Tokenizers"
    },
    {
        "repo_url": "https://github.com/huggingface/datasets",
        "subfolder": "docs/source",
        "title": "Datasets"
    },
    {
        "repo_url": "https://github.com/huggingface/evaluate",
        "subfolder": "docs/source",
        "title": "Evaluate"
    },
    {
        "repo_url": "https://github.com/huggingface/peft",
        "subfolder": "docs/source",
        "title": "PEFT"
    },
    {
        "repo_url": "https://github.com/huggingface/optimum-neuron",
        "subfolder": "docs/source",
        "title": "Optimum Neuron: AWS Trainium & Inferentia"
    },
    {
        "repo_url": "https://github.com/bitsandbytes-foundation/bitsandbytes",
        "subfolder": "docs/source",
        "title": "bitsandbytes"
    },
    {
        "repo_url": "https://github.com/huggingface/accelerate",
        "subfolder": "docs/source",
        "title": "Accelerate"
    },
    {
        "repo_url": "https://github.com/huggingface/trl",
        "subfolder": "docs/source",
        "title": "TRL"
    },
    {
        "repo_url": "https://github.com/huggingface/lighteval",
        "subfolder": "docs/source",
        "title": "Lighteval"
    },
    {
        "repo_url": "https://github.com/huggingface/optimum",
        "subfolder": "docs/source",
        "title": "Optimum"
    },
    {
        "repo_url": "https://github.com/huggingface/safetensors",
        "subfolder": "docs/source",
        "title": "Safetensors"
    },
    {
        "repo_url": "https://github.com/huggingface/chat-ui",
        "subfolder": "docs/source",
        "title": "Chat UI"
    },
    {
        "repo_url": "https://github.com/huggingface/autotrain-advanced",
        "subfolder": "docs/source",
        "title": "AutoTrain"
    },
    {
        "repo_url": "https://github.com/huggingface/smolagents",
        "subfolder": "docs/source/en",
        "title": "smolagents"
    },
    {
        "repo_url": "https://github.com/huggingface/lerobot",
        "subfolder": "docs/source",
        "title": "LeRobot"
    },
    {
        "repo_url": "https://github.com/huggingface/leaderboards",
        "subfolder": "docs/source/en",
        "title": "Leaderboards and Evaluations"
    }
]
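Each entry uses the same three keys that make_docs.py and app.py read: repo_url (the GitHub repository), subfolder (the path inside it holding the doc sources and _toctree.yml), and title (the display name, which also becomes the output folder under docs/). A minimal sketch for validating the file before a run; the check itself is illustrative, not part of the Space:

import json

with open("repos_config.json") as f:
    repos = json.load(f)

# Fail fast if an entry is missing a key the scripts depend on.
for repo in repos:
    missing = {"repo_url", "subfolder", "title"} - repo.keys()
    assert not missing, f"{repo.get('title', repo)}: missing {missing}"
print(f"{len(repos)} documentation sources configured")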