gguf-my-repo

Running

File size: 22,677 Bytes

import os
import subprocess
import signal
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr
import tempfile

from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from pathlib import Path
from textwrap import dedent
from apscheduler.schedulers.background import BackgroundScheduler

# Space parameters
# SPACE_ID falls back to "" when the environment variable is unset or empty.
SPACE_ID = os.environ.get("SPACE_ID") or ""
if SPACE_ID:
    # Public URL derived from the Space id, with "/" replaced by "-".
    SPACE_URL = "https://" + SPACE_ID.replace("/", "-") + ".hf.space/"
else:
    SPACE_URL = "http://localhost:7860/"
# None when the environment variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Folder
# Parent directories for the temporary download / output directories.
DOWNLOAD_FOLDER = "./downloads"
OUTPUT_FOLDER = "./outputs"

def create_folder(folder_name: str):
    """Create ``folder_name`` (including missing parents) unless the path already exists."""
    if os.path.exists(folder_name):
        return

    print(f"Creating folder: {folder_name}")
    os.makedirs(folder_name)

def is_valid_token(oauth_token):
    """Return True only if ``oauth_token`` holds a token that ``whoami`` accepts.

    A missing token object, a token value of None, or any exception raised by
    ``whoami`` all yield False.
    """
    raw_token = None if oauth_token is None else oauth_token.token
    if raw_token is None:
        return False

    try:
        whoami(raw_token)
    except Exception:
        # Any failure of the lookup is treated as "not a valid login".
        return False
    else:
        return True

# escape HTML for logging
def escape(s: str) -> str:
    """Return ``s`` with HTML special characters escaped and newlines turned into ``<br/>``."""
    substitutions = (
        ("&", "&amp;"),  # Must be done first, otherwise the entities below get re-escaped
        ("<", "&lt;"),
        (">", "&gt;"),
        ('"', "&quot;"),
        ("\n", "<br/>"),
    )
    for raw, replacement in substitutions:
        s = s.replace(raw, replacement)
    return s

def get_model_creator(model_id: str):
    """Return the part of ``model_id`` before the first ``/`` (the whole id if there is none)."""
    creator, _, _ = model_id.partition('/')
    return creator

def get_model_name(model_id: str):
    """Return the part of ``model_id`` after the last ``/`` (the whole id if there is none)."""
    _, _, name = model_id.rpartition('/')
    return name

def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
    """Run ``llama-imatrix`` on ``model_path`` using ``train_data_path`` as calibration text.

    The tool is asked to write its result to ``output_path``. It gets 60 seconds;
    after that it receives SIGINT and a further 5-second grace period before
    being killed outright.

    Raises:
        Exception: if ``model_path`` is not an existing file.
    """
    if not os.path.isfile(model_path):
        raise Exception(f"Model file not found: {model_path}")

    print("Running imatrix command...")
    imatrix_command = [
        "llama-imatrix",
        "-m", model_path,
        "-f", train_data_path,
        "-ngl", "99",
        "--output-frequency", "10",
        "-o", output_path,
    ]
    process = subprocess.Popen(imatrix_command, shell=False)

    try:
        process.wait(timeout=60)
    except subprocess.TimeoutExpired:
        # NOTE(review): SIGINT presumably lets llama-imatrix stop cleanly and keep the
        # output it has saved so far — confirm against the tool.
        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
        process.send_signal(signal.SIGINT)
        try:
            process.wait(timeout=5)  # grace period
        except subprocess.TimeoutExpired:
            print("Imatrix proc still didn't term. Forecfully terming process...")
            process.kill()
            # Reap the killed child so it does not linger as a zombie process.
            process.wait()
    else:
        # The tool exited on its own; surface a failing exit status in the logs
        # instead of silently reporting success.
        if process.returncode != 0:
            print(f"Imatrix command exited with code {process.returncode}")

    print("Importance matrix generation completed.")

def split_upload_model(model_path: str, outdir: str, repo_id: str, token: str, split_max_tensors=256, split_max_size=None):
    """Shard ``model_path`` with ``llama-gguf-split`` and upload every shard to ``repo_id``.

    Args:
        model_path: Path of the GGUF file to split. It is deleted after a successful split.
        outdir: Directory scanned for the resulting ``*.gguf`` shards.
        repo_id: Target repository for the uploads.
        token: Token used to build the ``HfApi`` client.
        split_max_tensors: Passed as ``--split-max-tensors`` when ``split_max_size`` is empty.
        split_max_size: Passed as ``--split-max-size`` when truthy; takes precedence.

    Raises:
        Exception: if the split command fails, no shard is found, or an upload fails.
    """
    print(f"Model path: {model_path}")
    print(f"Output dir: {outdir}")

    split_cmd = ["llama-gguf-split", "--split"]
    if split_max_size:
        split_cmd += ["--split-max-size", split_max_size]
    else:
        split_cmd += ["--split-max-tensors", str(split_max_tensors)]

    # args for output: the shard prefix is the model path without its file extension
    model_path_prefix = os.path.splitext(model_path)[0]
    split_cmd += [model_path, model_path_prefix]

    print(f"Split command: {split_cmd}")

    result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
    print(f"Split command stdout: {result.stdout}")
    print(f"Split command stderr: {result.stderr}")

    if result.returncode != 0:
        # text=True already yields str, so stderr must not be decoded again.
        raise Exception(f"Error splitting the model: {result.stderr}")
    print("Model split successfully!")

    # remove the original model file if needed
    if os.path.exists(model_path):
        os.remove(model_path)

    model_file_prefix = os.path.basename(model_path_prefix)
    print(f"Model file name prefix: {model_file_prefix}")
    sharded_model_files = [
        f for f in os.listdir(outdir)
        if f.startswith(model_file_prefix) and f.endswith(".gguf")
    ]
    if not sharded_model_files:
        raise Exception("No sharded files found.")

    print(f"Sharded model files: {sharded_model_files}")
    api = HfApi(token=token)
    for file in sharded_model_files:
        file_path = os.path.join(outdir, file)
        try:
            print(f"Uploading file: {file_path}")
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=file,
                repo_id=repo_id,
            )
        except Exception as e:
            raise Exception(f"Error uploading file {file_path}: {e}") from e

    print("Sharded model has been uploaded successfully!")

def download_base_model(token: str, model_id: str, outdir: str):
    """Download ``model_id`` from the Hub and convert it to an FP16 GGUF file in ``outdir``.

    The snapshot is fetched into a temporary directory under ``DOWNLOAD_FOLDER``,
    which is removed when the function returns; only the converted file remains.

    Args:
        token: Token used to build the ``HfApi`` client.
        model_id: Repository id of the source model.
        outdir: Path of the directory that receives ``<model_name>_fp16.gguf``.

    Returns:
        The path of the converted FP16 GGUF file, as a string.

    Raises:
        Exception: if the repo only contains a LoRA adapter config, or if the
            conversion script exits with a non-zero status.
    """
    model_name = get_model_name(model_id)

    with tempfile.TemporaryDirectory(dir=DOWNLOAD_FOLDER) as tmpdir:
        # Download model
        print(f"Downloading model {model_name}")
        local_dir = Path(tmpdir)/model_name # Keep the model name as the dirname so the model name metadata is populated correctly
        print(f"Local directory: {os.path.abspath(local_dir)}")

        api = HfApi(token=token)
        # Use safetensors weights when the repo has any, otherwise fall back to *.bin.
        has_safetensors = any(
            file.path.endswith(".safetensors")
            for file in api.list_repo_tree(
                repo_id=model_id,
                recursive=True,
            )
        )
        pattern = "*.safetensors" if has_safetensors else "*.bin"

        dl_pattern = ["*.md", "*.json", "*.model", pattern]

        api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        print("Model downloaded successfully!")

        print(f"Model directory contents: {os.listdir(local_dir)}")
        config_path = local_dir/"config.json"
        adapter_config_path = local_dir/"adapter_config.json"
        if os.path.exists(adapter_config_path) and not os.path.exists(config_path):
            raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')

        # Convert HF to GGUF
        fp16_model = str(Path(outdir)/f"{model_name}_fp16.gguf")
        print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
        result = subprocess.run(
            [
                "python3", "/app/convert_hf_to_gguf.py", local_dir, "--outtype", "f16", "--outfile", fp16_model
            ],
            shell=False,
            capture_output=True
        )
        # This logs the CompletedProcess of the conversion, not a directory listing.
        print(f"Conversion result: {result}")
        if result.returncode != 0:
            stderr_str = result.stderr.decode("utf-8")
            raise Exception(f"Error converting to fp16: {stderr_str}")

        print("Model converted to fp16 successfully!")
        print(f"Converted model path: {os.path.abspath(fp16_model)}")

        return fp16_model

def quantize_model(
    outdir: str,
    gguf_name: str,
    fp16: str,
    q_method: str,
    use_imatrix: bool,
    imatrix_q_method: str,
    imatrix_path: str,
    quant_embedding: bool,
    embedding_tensor_method: str,
    leave_output: bool,
    quant_output: bool,
    output_tensor_method: str,
    train_data_file=None,
):
    """Quantize the FP16 GGUF file ``fp16`` with ``llama-quantize``.

    Args:
        outdir: Directory that receives the quantized file.
        gguf_name: File name of the quantized output inside ``outdir``.
        fp16: Path of the FP16 GGUF input.
        q_method: Quantization type used when ``use_imatrix`` is false.
        use_imatrix: Generate an importance matrix first and quantize with it.
        imatrix_q_method: Quantization type used when ``use_imatrix`` is true.
        imatrix_path: Where the importance matrix is written and read.
        quant_embedding: Add ``--token-embedding-type embedding_tensor_method``.
        embedding_tensor_method: Type for the token embedding tensor.
        leave_output: Add ``--leave-output-tensor``; takes precedence over ``quant_output``.
        quant_output: Add ``--output-tensor-type output_tensor_method``.
        output_tensor_method: Type for the output tensor.
        train_data_file: Optional calibration file for the importance matrix, given
            either as an object exposing the path as ``.name`` or as a plain path.
            When empty, ``train_data.txt`` is used.

    Returns:
        The path of the quantized GGUF file, as a string.

    Raises:
        Exception: if the calibration file is missing or ``llama-quantize`` fails.
    """
    if use_imatrix:
        if train_data_file:
            # Accept both an uploaded-file object (path in `.name`) and a bare path.
            train_data_path = getattr(train_data_file, "name", train_data_file)
        else:
            train_data_path = "train_data.txt" #fallback calibration dataset

        print(f"Training data file path: {train_data_path}")

        if not os.path.isfile(train_data_path):
            raise Exception(f"Training data file not found: {train_data_path}")

        generate_importance_matrix(fp16, train_data_path, imatrix_path)
    else:
        print("Not using imatrix quantization.")

    # Quantize the model
    quantize_cmd = ["llama-quantize"]

    if quant_embedding:
        quantize_cmd += ["--token-embedding-type", embedding_tensor_method]
    if leave_output:
        quantize_cmd.append("--leave-output-tensor")
    elif quant_output:
        quantize_cmd += ["--output-tensor-type", output_tensor_method]
    if use_imatrix:
        quantize_cmd += ["--imatrix", imatrix_path]

    quantized_gguf = str(Path(outdir)/gguf_name)
    selected_method = imatrix_q_method if use_imatrix else q_method
    # Positional arguments: input file, output file, quantization type.
    quantize_cmd += [fp16, quantized_gguf, selected_method]

    print(f"Quantizing model with {quantize_cmd}")
    result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
    if result.returncode != 0:
        stderr_str = result.stderr.decode("utf-8")
        raise Exception(f"Error quantizing: {stderr_str}")

    print(f"Quantized successfully with {selected_method} option!")
    print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
    return quantized_gguf

def generate_readme(outdir: str, token: str, model_id: str, new_repo_id: str, gguf_name: str):
    """Write a README.md model card for the quantized repo into ``outdir``.

    The card of ``model_id`` is loaded when possible (an empty card is used
    otherwise), tagged with ``llama-cpp`` and ``gguf-my-repo``, pointed at
    ``model_id`` as base model, and given a fresh body with usage snippets.

    Returns:
        The path of the written README.md.
    """
    creator = get_model_creator(model_id)
    model_name = get_model_name(model_id)
    username = whoami(token)["name"]

    try:
        card = ModelCard.load(model_id, token=token)
    except Exception:
        # Best effort: fall back to an empty card when the source card cannot be loaded.
        card = ModelCard("")

    if card.data.tags is None:
        card.data.tags = []

    card.data.tags.append("llama-cpp")
    card.data.tags.append("gguf-my-repo")
    card.data.base_model = model_id
    # The template lines start at column 0 on purpose so the markdown is emitted unindented.
    card.text = dedent(
        f"""
# {model_name}

**Model creator:** [{creator}](https://huggingface.co/{creator})<br/>
**Original model**: [{model_id}](https://huggingface.co/{model_id})<br/>
**GGUF quantization:** provided by [{username}](https://huggingface.co/{username}) using `llama.cpp`<br/>

## Special thanks

🙏 Special thanks to [Georgi Gerganov](https://github.com/ggerganov) and the whole team working on [llama.cpp](https://github.com/ggerganov/llama.cpp/) for making all of this possible.

## Use with Ollama

```bash
ollama run hf.co/{new_repo_id}:<quantization>
```

## Use with LM Studio

```bash
lms load {new_repo_id}
```

## Use with llama.cpp CLI

```bash
llama-cli --hf-repo {new_repo_id} --hf-file {gguf_name} -p "The meaning to life and the universe is"
```

## Use with llama.cpp Server:

```bash
llama-server --hf-repo {new_repo_id} --hf-file {gguf_name} -c 4096
```
        """
    )
    readme_path = Path(outdir)/"README.md"
    card.save(readme_path)
    return readme_path

def process_model(
    model_id: str,
    q_method: str,
    use_imatrix: bool,
    imatrix_q_method: str,
    private_repo: bool,
    train_data_file,
    repo_name: str,
    gguf_name: str,
    quant_embedding: bool,
    embedding_tensor_method: str,
    leave_output: bool,
    quant_output: bool,
    output_tensor_method: str,
    split_model: bool,
    split_max_tensors,
    split_max_size: str | None,
    oauth_token: gr.OAuthToken | None,
):
    """Download, convert, quantize and upload ``model_id``; handler for the Submit button.

    All work happens in a temporary directory under ``OUTPUT_FOLDER`` that is
    removed afterwards. ``oauth_token`` is not among the click inputs;
    presumably Gradio injects it from the login session — confirm.

    Returns:
        A ``(html_message, image_path)`` tuple describing success or failure.

    Raises:
        gr.Error: if ``oauth_token`` is missing or rejected.
    """
    # validate the oauth token
    if not is_valid_token(oauth_token):
        raise gr.Error("You must be logged in to use GGUF-my-repo")

    token = oauth_token.token
    print(f"Current working directory: {os.path.abspath(os.getcwd())}")
    create_folder(DOWNLOAD_FOLDER)
    create_folder(OUTPUT_FOLDER)

    try:
        with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outdir:
            fp16 = download_base_model(token, model_id, outdir)
            imatrix_path = Path(outdir)/"imatrix.dat"
            quantized_gguf = quantize_model(
                outdir,
                gguf_name,
                fp16,
                q_method,
                use_imatrix,
                imatrix_q_method,
                imatrix_path,
                quant_embedding,
                embedding_tensor_method,
                leave_output,
                quant_output,
                output_tensor_method,
                # Forward the user's upload so the calibration data is actually used.
                train_data_file=train_data_file,
            )

            # Create empty repo
            api = HfApi(token=token)
            new_repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, private=private_repo)
            new_repo_id = new_repo_url.repo_id
            print("Repo created successfully!", new_repo_url)

            # Upload model
            if split_model:
                print(f"Splitting quantized model: {quantized_gguf}")
                split_upload_model(str(quantized_gguf), outdir, new_repo_id, token, split_max_tensors, split_max_size)
            else:
                try:
                    print(f"Uploading quantized model: {quantized_gguf}")
                    api.upload_file(
                        path_or_fileobj=quantized_gguf,
                        path_in_repo=gguf_name,
                        repo_id=new_repo_id,
                    )
                except Exception as e:
                    raise Exception(f"Error uploading quantized model: {e}") from e

            # The importance matrix only exists when imatrix quantization produced one.
            if os.path.isfile(imatrix_path):
                try:
                    print(f"Uploading imatrix.dat: {imatrix_path}")
                    api.upload_file(
                        path_or_fileobj=imatrix_path,
                        path_in_repo="imatrix.dat",
                        repo_id=new_repo_id,
                    )
                except Exception as e:
                    raise Exception(f"Error uploading imatrix.dat: {e}") from e

            # Upload README.md
            readme_path = generate_readme(outdir, token, model_id, new_repo_id, gguf_name)

            api.upload_file(
                path_or_fileobj=readme_path,
                path_in_repo="README.md",
                repo_id=new_repo_id,
            )
            print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")

        # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here

        return (
            f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
            "llama.png",
        )
    except Exception as e:
        # Top-level boundary of the handler: report the failure in the UI instead of raising.
        print(f"Error processing model: {e}")
        return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")


# Stylesheet string handed to gr.Blocks(css=...).
css = (
    "/* Custom CSS to allow scrolling */\n"
    ".gradio-container {overflow-y: auto;}\n"
)
#####
# Base model section
#####
# Search box for the source model; its value is the first input of process_model.
model_id = HuggingfaceHubSearch(
    search_type="model",
    label="Hub Model ID",
    placeholder="Search for model id on Huggingface",
)

#####
# Quantization section
#####
# Switches between the plain dropdown and the imatrix dropdown / training-data upload.
use_imatrix = gr.Checkbox(
    label="Use Imatrix Quantization",
    info="Use importance matrix for quantization.",
    value=False,
)

# Quantization types offered without an importance matrix (shown initially).
_Q_METHOD_CHOICES = ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"]
q_method = gr.Dropdown(
    label="Quantization Method",
    info="GGML quantization type",
    choices=_Q_METHOD_CHOICES,
    value="Q4_K_M",
    visible=True,
    filterable=False,
)

# Quantization types offered together with an importance matrix (hidden initially).
_IMATRIX_Q_METHOD_CHOICES = ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"]
imatrix_q_method = gr.Dropdown(
    label="Imatrix Quantization Method",
    info="GGML imatrix quants type",
    choices=_IMATRIX_Q_METHOD_CHOICES,
    value="IQ4_NL",
    visible=False,
    filterable=False,
)

# Optional upload, restricted to .txt files (hidden initially).
train_data_file = gr.File(
    label="Training Data File",
    file_types=[".txt"],
    visible=False,
)

#####
# Advanced Options section
#####
# Enables sharding; toggles the visibility of the two split settings below.
split_model = gr.Checkbox(
    value=False,
    label="Split Model",
    info="Shard the model using gguf-split."
)

split_max_tensors = gr.Number(
    value=256,
    label="Max Tensors per File",
    info="Maximum number of tensors per file when splitting model.",
    visible=False
)

split_max_size = gr.Textbox(
    label="Max File Size",
    info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
    visible=False
)

leave_output = gr.Checkbox(
    value=False,
    label="Leave output tensor",
    info="Leaves output.weight un(re)quantized"
)

quant_embedding = gr.Checkbox(
    value=False,
    label="Quant embeddings tensor",
    info="Quantize embeddings tensor separately"
)
# This dropdown configures the token embeddings tensor, so its label names the
# embeddings rather than repeating the label of the output-tensor dropdown.
embedding_tensor_method = gr.Dropdown(
    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
    label="Embeddings Quantization Method",
    info="use a specific quant type for the token embeddings tensor",
    value="Q8_0",
    filterable=False,
    visible=False
)

quant_output = gr.Checkbox(
    value=False,
    label="Quant output tensor",
    info="Quantize output tensor separately"
)
output_tensor_method = gr.Dropdown(
    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
    label="Output Quantization Method",
    info="use a specific quant type for the output.weight tensor",
    value="Q8_0",
    filterable=False,
    visible=False
)

#####
# Output Settings section
#####
# Forwarded to create_repo(private=...) by process_model.
private_repo = gr.Checkbox(
    label="Private Repo",
    info="Create a private repo under your username.",
    value=False,
)

# Filled in by update_output_repo whenever the model id changes.
repo_name = gr.Textbox(
    max_lines=1,
    label="Output Repository Name",
    info="Set your repository name",
)

# Filled in by update_output_filename whenever the model or method changes.
gguf_name = gr.Textbox(
    max_lines=1,
    label="Output File Name",
    info="Set output file name",
)

def update_output_repo(model_id, oauth_token: gr.OAuthToken | None):
    """Suggest ``<username>/<model name>-GGUF`` as the output repository name.

    Returns an empty string when there is no usable token or no model id.
    """
    has_token = oauth_token is not None and bool(oauth_token.token)
    if not has_token or not model_id:
        return ""

    # The username comes from the account the token belongs to.
    owner = whoami(oauth_token.token)["name"]
    base_name = get_model_name(model_id)
    return f"{owner}/{base_name}-GGUF"

def update_output_filename(model_id, use_imatrix, q_method, imatrix_q_method):
    """Suggest the output GGUF file name for the current model and method.

    The name is ``<model>-<METHOD>.gguf``, with an extra ``-imat`` marker before
    the extension when imatrix quantization is selected. An empty model id
    yields an empty string.
    """
    if not model_id:
        return ""

    base_name = get_model_name(model_id)

    # Only the method that is actually in use gets upper-cased.
    if use_imatrix:
        quant_tag, marker = imatrix_q_method.upper(), "-imat"
    else:
        quant_tag, marker = q_method.upper(), ""

    return f"{base_name}-{quant_tag}{marker}.gguf"

#####
# Buttons section
#####
# Every input component that the Clear button resets.
_clearable_components = [
    model_id,
    q_method,
    use_imatrix,
    imatrix_q_method,
    private_repo,
    train_data_file,
    leave_output,
    quant_embedding,
    embedding_tensor_method,
    quant_output,
    output_tensor_method,
    split_model,
    split_max_tensors,
    split_max_size,
    repo_name,
    gguf_name,
]
clear_btn = gr.ClearButton(
    components=_clearable_components,
    value="Clear",
    variant="secondary",
)
# Wired to process_model inside the Blocks layout.
submit_btn = gr.Button(
    variant="primary",
    value="Submit",
)

#####
# Outputs section
#####
# Receives the HTML status message returned by process_model.
output_label = gr.Markdown(label="output")

# Receives the image path returned by process_model.
output_image = gr.Image(
    interactive=False,
    show_label=False,
    show_download_button=False,
)

# Create Gradio interface
# The components are constructed at module level and only placed into the page
# here via .render(); the event handlers are attached afterwards.
with gr.Blocks(css=css) as demo:
    #####
    # Layout
    #####
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)

    # Inline style uses the CSS property "text-align" to centre the heading.
    gr.HTML("<h1 style=\"text-align:center;\">Create your own GGUF Quants!</h1>")
    gr.Markdown(f"The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.<br/>Use via {SPACE_URL}")

    with gr.Row():
        with gr.Column() as inputs:
            gr.Markdown("### Model Configuration")
            model_id.render()

            with gr.Column():
                use_imatrix.render()
                q_method.render()
                imatrix_q_method.render()
                train_data_file.render()

            gr.Markdown("### Advanced Options")

            quant_embedding.render()
            embedding_tensor_method.render()
            leave_output.render()
            quant_output.render()
            output_tensor_method.render()

            split_model.render()
            with gr.Row() as split_options:  # Group split options
                split_max_tensors.render()
                split_max_size.render()

            gr.Markdown("### Output Settings")
            gr.Markdown("You can customize settings for your GGUF repo.")
            private_repo.render()
            with gr.Row():
                repo_name.render()
                gguf_name.render()

            # Buttons
            with gr.Row() as buttons:
                clear_btn.render()
                submit_btn.render()

        with gr.Column() as outputs:
            output_label.render()
            output_image.render()

    #####
    # Button Click handlers
    #####
    # The input order must match the positional parameters of process_model.
    submit_btn.click(
        fn=process_model,
        inputs=[
            model_id,
            q_method,
            use_imatrix,
            imatrix_q_method,
            private_repo,
            train_data_file,
            repo_name,
            gguf_name,
            quant_embedding,
            embedding_tensor_method,
            leave_output,
            quant_output,
            output_tensor_method,
            split_model,
            split_max_tensors,
            split_max_size
        ],
        outputs=[
            output_label,
            output_image,
        ],
    )

    #####
    # OnChange handlers
    #####
    # Show either the plain method dropdown or the imatrix dropdown plus upload.
    use_imatrix.change(
        fn=lambda use_imatrix: [gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)],
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method, train_data_file]
    )

    split_model.change(
        fn=lambda split_model: [gr.update(visible=split_model), gr.update(visible=split_model)],
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size]
    )

    quant_embedding.change(
        fn=lambda quant_embedding: gr.update(visible=quant_embedding),
        inputs=quant_embedding,
        outputs=[embedding_tensor_method]
    )

    # Choosing an explicit output tensor type hides the "leave output tensor" option.
    quant_output.change(
        fn=lambda quant_output: [gr.update(visible=quant_output), gr.update(visible=not quant_output)],
        inputs=quant_output,
        outputs=[output_tensor_method, leave_output]
    )

    model_id.change(
        fn=update_output_repo,
        inputs=model_id,
        outputs=[repo_name]
    )

    # Any change to the model or to the quantization choice refreshes the
    # suggested output file name; the handlers are registered in this order.
    for _filename_trigger in (model_id, use_imatrix, q_method, imatrix_q_method):
        _filename_trigger.change(
            fn=update_output_filename,
            inputs=[model_id, use_imatrix, q_method, imatrix_q_method],
            outputs=[gguf_name]
        )

def restart_space():
    """Ask the Hub to restart the Space ``SPACE_ID`` with a factory reboot, using ``HF_TOKEN``."""
    hub_client = HfApi()
    hub_client.restart_space(repo_id=SPACE_ID, token=HF_TOKEN, factory_reboot=True)

# Restart the Space periodically from a background scheduler.
_RESTART_INTERVAL_SECONDS = 6 * 60 * 60  # 21600 seconds
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=_RESTART_INTERVAL_SECONDS)
scheduler.start()

# Launch the interface
# Configure the request queue (default concurrency limit 1, max size 5), then launch.
_queued_demo = demo.queue(default_concurrency_limit=1, max_size=5)
_queued_demo.launch(debug=True, show_api=False)