qgallouedec (HF Staff) committed · verified
Commit f9089ef · Parent(s): f2593be

Create app.py

Files changed (1): app.py (+87, -0)
app.py ADDED
@@ -0,0 +1,87 @@
+ import gradio as gr
+ from datasets import load_dataset
+ from trl import SFTTrainer, SFTConfig
+ from transformers import AutoTokenizer
+ import pandas as pd
+ import numpy as np
+
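+ # Candidate max_length values to evaluate (powers of two from 128 to 32768),
+ # plus the sampling seed and the number of examples drawn from the dataset.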
+ TRUNCATION_LENGTHS = [128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+ SEED = 42
+ N_SAMPLES = 1000
+
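+ # Snippet shown to the user, with the recommended max_length filled in.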
+ CODE_TEMPLATE = """
+ training_args = SFTConfig(
+     ...,
+     max_length={},
+ )"""
+
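+ # Tokenize a sample of the dataset the way SFTTrainer would, then measure how
+ # much of it would be truncated at each candidate max_length.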
+ def benchmark(model_name, dataset_name):
+     print(f"Running benchmark for model: {model_name} on dataset: {dataset_name}...")
+
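+     # Stream the dataset to avoid downloading it in full; shuffle with a fixed
+     # seed and keep N_SAMPLES examples.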
+     print("Loading dataset...")
+     dataset = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=SEED).take(N_SAMPLES)
+
+     print("Loading tokenizer...")
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
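+     # Note: _prepare_dataset is a private SFTTrainer method, called here without
+     # a trainer instance (self=None); its signature may change across TRL versions.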
+     print("Tokenizing dataset...")
+     config = SFTConfig(max_length=None, bf16=False)
+     tokenized_dataset = SFTTrainer._prepare_dataset(
+         None, dataset, tokenizer, config, packing=False, formatting_func=None, dataset_name="train"
+     )
+
+     print("Computing the sequence lengths and total tokens")
+     sequence_lengths = [len(sample["input_ids"]) for sample in tokenized_dataset]
+     total_tokens = sum(sequence_lengths)
+
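+     # For each candidate max_length, count the tokens falling beyond the cutoff
+     # across all sequences, as a percentage of all tokens, and recommend the
+     # smallest max_length that discards less than 5% of them.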
+     print("Computing the truncation ratios")
+     truncation_ratios = []
+     recommended = None
+     for max_len in TRUNCATION_LENGTHS:
+         total_truncated_tokens = sum(max(length - max_len, 0) for length in sequence_lengths)
+         truncation_ratio = total_truncated_tokens / total_tokens * 100
+         truncation_ratios.append(truncation_ratio)
+         if recommended is None and truncation_ratio < 5.0:
+             recommended = max_len
+
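+     # Bin the sequence lengths into 50 buckets; x is the bin midpoint and each
+     # bar shows the share of samples whose length falls in that bucket.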
+     hist = np.histogram(sequence_lengths, bins=50)
+     lengths_distribution = pd.DataFrame({
+         "max_length": (hist[1][:-1] + hist[1][1:]) / 2,
+         "Ratio (%)": hist[0] / N_SAMPLES * 100,
+     })
+
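+     # One bar per candidate max_length: the percentage of tokens that would be
+     # truncated at that value.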
+     truncation_data = pd.DataFrame({
+         "max_length": [str(value) for value in TRUNCATION_LENGTHS],
+         "Ratio (%)": truncation_ratios,
+     })
+
+     return lengths_distribution, truncation_data, CODE_TEMPLATE.format(recommended)
+
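+ # Gradio UI: model and dataset inputs, a run button, the two bar plots, and the
+ # recommended SFTConfig snippet.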
+ with gr.Blocks() as demo:
+     model_input = gr.Textbox(label="Model Name", value="Qwen/Qwen3-0.6B")
+     dataset_input = gr.Textbox(label="Dataset Name", value="trl-lib/tldr")
+     run_button = gr.Button("Run estimation")
+     lengths_plot = gr.BarPlot(None, title="Length distribution", x="max_length", y="Ratio (%)")
+     truncation_ratio_plot = gr.BarPlot(None, title="Truncation ratio (how many tokens are discarded)", x="max_length", y="Ratio (%)")
+
+     recommended_code = gr.Code(CODE_TEMPLATE.format("..."), language="python", label="Recommended configuration")
+
+     run_button.click(fn=benchmark, inputs=[model_input, dataset_input], outputs=[lengths_plot, truncation_ratio_plot, recommended_code])
+
+     with gr.Accordion("See details", open=False):
+         gr.Markdown("""
+ This tool helps you choose an appropriate `max_length` value for your SFT training (`SFTConfig`) by analyzing the tokenized dataset.
+
+ **How it works:**
+ - Randomly samples 1,000 examples from your dataset.
+ - Prepares and tokenizes the data exactly as `SFTTrainer` would.
+ - Generates two visualizations:
+   - **Sequence Length Distribution:** Shows how long your tokenized sequences are.
+   - **Truncation Ratio:** Estimates the percentage of tokens that would be discarded (truncated) for different `max_length` values.
+ - Recommends the smallest `max_length` where truncation affects less than 5% of the tokens.
+
+ Use this tool to balance efficiency and memory usage when setting your `max_length` parameter.
+ """)
+
+
+ demo.launch()