File size: 2,667 Bytes
12a6276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

import gradio as gr
from transformers import pipeline
import re
from langdetect import detect
import numpy as np
import pandas as pd

# Load models for generation and rating.
# Every pipeline below is instantiated when this module is imported.
gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
rater_models = [
    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
    # FLAN-T5 is an encoder-decoder (seq2seq) model, so it has to be served
    # through the text2text-generation task rather than the causal
    # text-generation task. The result still exposes 'generated_text'.
    pipeline("text2text-generation", model="google/flan-t5-large"),
]

# Language list: two-letter code -> English name used inside the prompts.
languages = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
}

def clean_text(text):
    """Normalize *text* for palindrome comparison.

    The input is case-folded and only alphanumeric characters are kept;
    whitespace, punctuation, underscores and symbols are dropped.

    The filter is Unicode-aware (``str.isalnum``), so letters outside ASCII
    such as accented Latin, Cyrillic, Arabic, Devanagari or Japanese survive.
    An ASCII-only filter would reduce text in those scripts to an empty
    string, which then looks like a zero-length "palindrome".

    Note: combining marks are not alphanumeric and are therefore removed.

    Args:
        text: Raw string, e.g. the text produced by a model.

    Returns:
        The case-folded string containing only alphanumeric characters.
    """
    return "".join(ch for ch in text.casefold() if ch.isalnum())

def is_palindrome(text):
    """Tell whether *text* reads the same in both directions.

    The comparison is made on the output of ``clean_text``, so whatever that
    helper strips or normalizes is ignored here.

    Returns:
        True when the cleaned text equals its own reverse, else False.
    """
    normalized = clean_text(text)
    mirrored = "".join(reversed(normalized))
    return normalized == mirrored

def grammar_prompt(pal, lang):
    """Build the prompt that asks a rater model to grade a palindrome's grammar.

    Args:
        pal: The palindrome text to be graded.
        lang: Name of the language the palindrome is supposed to be in.

    Returns:
        The instruction sentence, a blank line, the palindrome in double
        quotes, and a trailing newline.
    """
    instruction = (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Only return a number with no explanation:"
    )
    return f'{instruction}\n\n"{pal}"\n'

def extract_score(text):
    """Pull a 0-100 score out of a rater's reply.

    The first run of one to three digits found in *text* is read as the
    score and clamped to the 0-100 range.

    Args:
        text: Raw text returned by a rater model.

    Returns:
        The clamped integer score, or 0 when *text* contains no digit.
    """
    found = re.search(r"\d{1,3}", text)
    if found is None:
        return 0
    value = int(found.group(0))
    # Keep the value inside the rating scale.
    return max(0, min(value, 100))

def _strip_prompt(prompt, generated):
    """Return *generated* without a leading echo of *prompt*, whitespace-trimmed."""
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()


def _detect_language(text):
    """Return langdetect's code for *text*, or "unknown" when detection fails."""
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (empty, digits only, ...).
        return "unknown"


def run_benchmark():
    """Generate one palindrome per language, grade it, and tabulate the results.

    For every entry in ``languages`` the generator model is asked for an
    original palindrome. The answer is checked with ``is_palindrome``,
    measured by its cleaned length, language-identified with langdetect, and
    graded for grammar (0-100) by each pipeline in ``rater_models``.

    Final score = cleaned length * (mean grammar score / 100), halved when
    the text is not a valid palindrome.

    Returns:
        A ``gr.Dataframe`` wrapping one row per language, sorted by
        "Final Score" from highest to lowest.
    """
    results = []
    for lang in languages.values():
        prompt = f'''Write the longest original palindrome you can in {lang}. It should be creative and not a known palindrome. If it is not a correct palindrome, you will lose points according to how correct it is.'''

        raw_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text']
        # text-generation pipelines return the prompt followed by the
        # completion by default; judging that whole string would make every
        # answer a non-palindrome and inflate its length.
        gen_output = _strip_prompt(prompt, raw_output)
        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        detected_lang = _detect_language(gen_output)

        rprompt = grammar_prompt(gen_output, lang)
        scores = []
        for rater in rater_models:
            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
            # The rating prompt itself contains "0 to 100"; drop the echoed
            # prompt so extract_score reads the rater's answer instead.
            scores.append(extract_score(_strip_prompt(rprompt, rtext)))

        avg_score = np.mean(scores)
        # Grammar acts as a 0..1 multiplier on length; invalid palindromes
        # keep only half of it.
        multiplier = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * multiplier, 2)

        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
            "Detected Lang": detected_lang
        })

    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    return gr.Dataframe(df)

# Gradio UI: no inputs, one dataframe output produced by run_benchmark.
iface = gr.Interface(
    fn=run_benchmark,
    inputs=[],
    outputs="dataframe",
    title="🔁 LLM Palindrome Benchmark",
)
# Start the web app as soon as the module is executed.
iface.launch()