import gradio as gr
from transformers import pipeline
import re
import langid
import numpy as np
import pandas as pd
# Load the generator model and the two rater models
gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
rater_models = [
    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
    # flan-t5 is an encoder-decoder model, so it needs the text2text-generation task
    pipeline("text2text-generation", model="google/flan-t5-large")
]
# Languages covered by the benchmark
languages = {
    "en": "English", "es": "Spanish", "fr": "French", "de": "German", "it": "Italian",
    "pt": "Portuguese", "ru": "Russian", "ar": "Arabic", "hi": "Hindi", "ja": "Japanese"
}
def clean_text(text):
    # Keep Unicode letters and digits; an ASCII-only filter would strip
    # non-Latin scripts (e.g. Russian, Arabic, Japanese) entirely.
    return ''.join(ch for ch in text.lower() if ch.isalnum())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]
def grammar_prompt(pal, lang):
    return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
def extract_score(text):
    # Pull the first 1-3 digit number from the rater's reply and clamp it to [0, 100]
    match = re.search(r"\d{1,3}", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0
def run_benchmark():
    results = []
    for code, lang in languages.items():
        prompt = f'''Write the longest original palindrome you can in {lang}. It should be creative and not a known palindrome. If it is not a correct palindrome, you will lose points according to how correct it is.'''
        # return_full_text=False keeps the prompt out of the generated palindrome
        gen_output = gen_model(prompt, max_new_tokens=100, do_sample=True, return_full_text=False)[0]['generated_text'].strip()
        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        detected_lang = langid.classify(gen_output)[0]
        scores = []
        for rater in rater_models:
            rprompt = grammar_prompt(gen_output, lang)
            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
            # Causal raters echo the prompt; strip it so the score regex
            # does not pick up digits from the instructions themselves
            if rtext.startswith(rprompt):
                rtext = rtext[len(rprompt):]
            score = extract_score(rtext)
            scores.append(score)
        avg_score = np.mean(scores)
        # Grammar acts as a multiplier on length; invalid palindromes earn only half credit
        multiplier = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * multiplier, 2)
        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
            "Detected Lang": detected_lang
        })
    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    # Return the DataFrame directly; the "dataframe" output component renders it
    return df
iface = gr.Interface(fn=run_benchmark, inputs=[], outputs="dataframe", title="🔁 LLM Palindrome Benchmark")
iface.launch()