Spaces:
Sleeping
Sleeping
# Evo Showcase Mode: Full Evaluation with Correct Answer Comparison
import os

import gradio as gr
import openai

from inference import predict as evo_predict
# OpenAI credentials.
# Prefer the OPENAI_API_KEY environment variable (for example a Space secret)
# so the real key never has to be committed; the literal below is only a
# fallback placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-...")  # Replace the fallback with your actual key
# A client built with openai.OpenAI() does not pick up the module-level
# openai.api_key; it only uses its api_key argument or the environment
# variable. Pass the resolved key explicitly so construction cannot fail at
# import time just because the variable is unset.
client = openai.OpenAI(api_key=openai.api_key)
def gpt_predict(prompt):
    """Ask gpt-3.5-turbo which solution is better and return its reply.

    Args:
        prompt: The user message sent to the chat completion endpoint.

    Returns:
        The model's reply with surrounding whitespace removed, or a string
        starting with "GPT Error: " describing any exception raised while
        requesting or reading the completion.
    """
    system_message = {
        "role": "system",
        "content": "You're a commonsense reasoning assistant. Only say: Solution 1 or Solution 2.",
    }
    user_message = {"role": "user", "content": prompt}
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
        )
        # Reading and stripping the reply stays inside the try so that a
        # malformed or empty response is reported the same way as a failed call.
        answer = completion.choices[0].message.content
        return answer.strip()
    except Exception as err:
        return f"GPT Error: {err!s}"
def _normalize_choice(value):
    """Return *value* in a canonical form suitable for answer comparison.

    Surrounding whitespace, a trailing period and letter case are ignored, so
    "Solution 1", " solution 1 " and "Solution 1." all become "solution 1".
    ``None`` becomes the empty string.
    """
    if value is None:
        return ""
    return str(value).strip().rstrip(".").strip().lower()


def compare(goal, sol1, sol2, correct):
    """Query Evo and GPT-3.5 for one problem and grade both answers.

    Args:
        goal: Task description.
        sol1: Text of the first candidate solution.
        sol2: Text of the second candidate solution.
        correct: Expected answer, "Solution 1" or "Solution 2". Letter case,
            surrounding whitespace and a trailing period are ignored. Any
            other value (including empty or ``None``) is reported as
            missing/invalid in the correctness note.

    Returns:
        A 4-tuple of strings: the Evo answer line, the GPT-3.5 answer line,
        the agreement verdict, and the correctness note. If ``goal``,
        ``sol1`` or ``sol2`` is blank, the first element is a warning and the
        other three are empty strings; neither model is called in that case.
    """
    if not all((text or "").strip() for text in (goal, sol1, sol2)):
        return "β οΈ Please provide all inputs.", "", "", ""

    prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
    evo = evo_predict(goal, sol1, sol2)
    gpt = gpt_predict(prompt)

    # Compare canonical forms everywhere. The expected answer is validated
    # case-insensitively, so grading must be case-insensitive as well;
    # otherwise an input such as "solution 1" passes validation but can never
    # match a model answer. The raw answers are still shown to the user.
    # NOTE(review): presumably evo_predict returns "Solution 1"/"Solution 2";
    # confirm against inference.predict.
    evo_choice = _normalize_choice(evo)
    gpt_choice = _normalize_choice(gpt)
    expected = _normalize_choice(correct)

    if evo_choice == gpt_choice:
        verdict = "β Evo agrees with GPT-3.5"
    else:
        verdict = "βοΈ Evo disagrees with GPT-3.5 β explore why."

    if expected in ("solution 1", "solution 2"):
        evo_right = evo_choice == expected
        gpt_right = gpt_choice == expected
        if evo_right and gpt_right:
            score_note = "β Both Evo and GPT-3.5 were correct."
        elif evo_right:
            score_note = "π’ Evo was correct. GPT-3.5 was wrong."
        elif gpt_right:
            score_note = "π’ GPT-3.5 was correct. Evo was wrong."
        else:
            score_note = "β Both were incorrect."
    else:
        score_note = "β οΈ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."

    return f"π§ Evo: {evo}", f"π€ GPT-3.5: {gpt}", verdict, score_note
# Demo rows for the examples widget, one per problem:
# [goal, solution 1, solution 2, correct answer].
# Solution 1 is the correct choice in every row.
examples = [
    [goal_text, first_option, second_option, "Solution 1"]
    for goal_text, first_option, second_option in (
        ("Start a fire", "Use a match", "Pour water"),
        ("Warm up food", "Use microwave", "Put it in fridge"),
        ("Charge a phone", "Plug it in", "Put it on grass"),
        ("Stop a car", "Press the brake", "Press the horn"),
    )
]
# Static Markdown rendered above and below the interactive widgets.
_INTRO_MD = """
# π§ EvoTransformer v2.1
**PIQA Accuracy:** 69.7% | **Model Size:** ~13M Parameters | **Baseline:** GPT-3.5 β 81%
EvoTransformer is a scratch-built reasoning model trained on just 1K PIQA examples. No pretraining. No fine-tuning. Pure evolution.
Compare its decisions with GPT-3.5 in real-time and witness how intelligence can emerge even from lean, efficient architectures.
"""
_FOOTER_MD = """
> π§ͺ *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. Thatβs part of its evolution.*
---
Made with β€οΈ by Dr. Heman Mohabeer β EvoTransformer is not just code. It's evolution.
"""

# Build the comparison UI: four inputs, four read-outs, a submit button and
# clickable example rows that all feed the same compare() callback.
# NOTE(review): the reviewed source had no indentation, so the nesting below
# (goal in the first row, the two solutions in the second row, everything
# else stacked directly under Blocks) is reconstructed; confirm against the
# deployed layout.
with gr.Blocks(title="βοΈ Evo vs GPT-3.5 β Real-Time Commonsense Showdown") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        goal = gr.Textbox(label="Goal")
    with gr.Row():
        sol1 = gr.Textbox(label="Solution 1")
        sol2 = gr.Textbox(label="Solution 2")
    correct = gr.Textbox(label="Correct Answer (Solution 1 or Solution 2)")

    evo_output = gr.Textbox(label="EvoTransformer Response")
    gpt_output = gr.Textbox(label="GPT-3.5 Response")
    verdict_output = gr.Textbox(label="Verdict")
    score_output = gr.Textbox(label="Correctness Evaluation")

    submit = gr.Button("Submit")

    # The button and the example rows use the same wiring, so name it once.
    _input_fields = [goal, sol1, sol2, correct]
    _output_fields = [evo_output, gpt_output, verdict_output, score_output]
    submit.click(fn=compare, inputs=_input_fields, outputs=_output_fields)

    gr.Markdown("### π Examples:")
    gr.Examples(
        examples=examples,
        inputs=_input_fields,
        outputs=_output_fields,
        fn=compare,
        cache_examples=False,
    )

    gr.Markdown(_FOOTER_MD)

demo.launch()