import gradio as gr
from transformers import pipeline, set_seed
import re
import numpy as np
import pandas as pd

# Set a seed for reproducibility
set_seed(42)

# List of premium generation models (as suggested by the Vellum AI leaderboard)
generation_model_names = [
    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mixtral-8x7B-v0.1",
    "meta-llama/Llama-4-Scout",
    "meta-llama/Llama-4-Maverick",
    "Qwen/Qwen2.5-72B",
    "HuggingFaceH4/zephyr-7b-beta",
    "01-ai/Yi-34B",
    "deepseek-ai/deepseek-llm-67b-base",
    "HuggingFaceH4/zephyr-7b-alpha",
    "microsoft/Marcoroni-7B-v3"
]
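# NOTE (assumption): some of these checkpoints are gated on the Hugging Face Hub
# (notably the meta-llama models); loading them requires accepting the model
# license and authenticating first, e.g. via `huggingface-cli login`.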

# List of cost-effective grammar evaluation models
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model"
]

# Load a generation pipeline given the model name.
def load_generation_pipeline(model_name):
    try:
        return pipeline("text-generation", model=model_name)
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None
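# NOTE (assumption): the multi-billion-parameter checkpoints above will not fit
# in default float32 CPU memory; in practice you would pass device_map="auto"
# (and possibly torch_dtype="auto") to pipeline() to shard and downcast the
# weights. Kept simple here for readability.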

# Load a grammar evaluation pipeline (text2text-generation)
def load_grammar_pipeline(model_name):
    try:
        return pipeline("text2text-generation", model=model_name)
    except Exception as e:
        print(f"Error loading grammar model {model_name}: {e}")
        return None

# Pre-load grammar evaluator models (assumed to be cost-effective and stable)
rater_models = []
for model_name in grammar_model_names:
    p = load_grammar_pipeline(model_name)
    if p is not None:
        rater_models.append(p)

# Language dictionary
languages = {
    "en": "English", "es": "Spanish", "fr": "French", "de": "German", "it": "Italian",
    "pt": "Portuguese", "ru": "Russian", "ar": "Arabic", "hi": "Hindi", "ja": "Japanese"
}

def clean_text(text):
    # str.isalnum() is Unicode-aware, so non-Latin scripts (Russian, Hindi,
    # Japanese, ...) survive, which the old [^a-zA-Z0-9] regex stripped entirely.
    return ''.join(ch for ch in text.lower() if ch.isalnum())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]
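# Sanity check: is_palindrome("A man, a plan, a canal: Panama") -> True
#               is_palindrome("Hello, world")                   -> False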

def grammar_prompt(pal, lang):
    return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''

def extract_score(text):
    match = re.search(r"\d{1,3}", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0
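# Example: extract_score("Score: 87/100") -> 87 (first 1-3 digit run, clamped
# to [0, 100]); extract_score("no digits here") -> 0.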

def run_benchmark(selected_model):
    # Load the selected premium generation pipeline
    gen_model = load_generation_pipeline(selected_model)
    if gen_model is None:
        return "Error loading generation model."
    
    results = []
    for code, lang in languages.items():
        prompt = (
            f"Write the longest original palindrome you can in {lang}. "
            f"It should be creative and not a known palindrome. "
            f"If it is not a correct palindrome, you will lose points in proportion to how incorrect it is."
        )
        try:
            # return_full_text=False stops the pipeline from echoing the prompt,
            # which would otherwise be graded as part of the palindrome.
            gen_output = gen_model(
                prompt, max_new_tokens=100, do_sample=True, return_full_text=False
            )[0]['generated_text'].strip()
        except Exception as e:
            gen_output = f"Error generating text: {e}"
        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        
        scores = []
        for rater in rater_models:
            rprompt = grammar_prompt(gen_output, lang)
            try:
                # For a text2text model, we assume the output contains a number (0-100)
                rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
                score = extract_score(rtext)
                scores.append(score)
            except Exception as e:
                print(f"Error rating with grammar model: {e}")
                scores.append(0)
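        # Scoring: final score = cleaned length * grammar multiplier, where the
        # multiplier is avg_score/100 for a valid palindrome and half that for
        # an invalid one. Worked example: a valid 21-character palindrome with
        # an average grammar score of 80 scores 21 * 0.80 = 16.8; if it were
        # invalid, it would score 21 * 0.40 = 8.4.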
        avg_score = np.mean(scores) if scores else 0
        multiplier = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * multiplier, 2)
        
        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score
        })
    
    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    # Return the DataFrame directly; Gradio renders it in the Dataframe output.
    return df

# Build the Gradio UI using Blocks (canvas layout)
with gr.Blocks(title="LLM Palindrome Benchmark - Premium Generation Models") as demo:
    gr.Markdown("# LLM Palindrome Benchmark")
    gr.Markdown("Select one of the premium generation models below (for non-commercial, educational usage) and run the benchmark.")
    
    with gr.Row():
        model_dropdown = gr.Dropdown(choices=generation_model_names, label="Select Premium Generation Model")
        run_button = gr.Button("Run Benchmark")
    
    output_table = gr.Dataframe(label="Benchmark Results")
    
    run_button.click(fn=run_benchmark, inputs=model_dropdown, outputs=output_table)

demo.launch()
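
# To run locally (assuming this file is saved as app.py and the dependencies
# are installed):
#   pip install gradio transformers torch numpy pandas
#   python app.py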