HemanM's picture
Update app.py
7c4d437 verified
raw
history blame
3.93 kB
# βœ… Evo Showcase Mode: Full Evaluation with Correct Answer Comparison
import os

import gradio as gr
import openai

from inference import predict as evo_predict
# πŸ” SET YOUR GPT-3.5 API KEY HERE
openai.api_key = "sk-..." # Replace with your actual key
client = openai.OpenAI()
def gpt_predict(prompt):
    """Ask GPT-3.5 which solution is better and return its stripped reply.

    Any failure (auth, network, quota, ...) is converted into a
    "GPT Error: ..." string so the Gradio UI never crashes.
    """
    system_msg = "You're a commonsense reasoning assistant. Only say: Solution 1 or Solution 2."
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt},
            ],
        )
        reply = completion.choices[0].message.content.strip()
    except Exception as exc:  # best-effort: surface the error text in the UI
        return f"GPT Error: {str(exc)}"
    return reply
def compare(goal, sol1, sol2, correct):
    """Run Evo and GPT-3.5 on one PIQA-style item and score both answers.

    Parameters
    ----------
    goal, sol1, sol2 : str
        The task goal and the two candidate solutions.
    correct : str
        Ground-truth label, "Solution 1" or "Solution 2"
        (case- and whitespace-insensitive).

    Returns
    -------
    tuple[str, str, str, str]
        (Evo answer, GPT answer, agreement verdict, correctness note).
    """
    # Guard: all three text fields are required before calling the models.
    if not goal.strip() or not sol1.strip() or not sol2.strip():
        return "⚠️ Please provide all inputs.", "", "", ""
    prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
    evo = evo_predict(goal, sol1, sol2)
    gpt = gpt_predict(prompt)
    if evo == gpt:
        verdict = "✅ Evo agrees with GPT-3.5"
    else:
        verdict = "⚖️ Evo disagrees with GPT-3.5 — explore why."
    # Normalize once so "solution 1", " Solution 1 ", etc. all compare equal.
    # Previously the validity check was case-insensitive but the scoring
    # comparisons were exact-match, so a lowercase ground truth silently
    # marked both models wrong.
    label = correct.strip().lower()
    if label in ("solution 1", "solution 2"):
        evo_ok = evo.strip().lower() == label
        gpt_ok = gpt.strip().lower() == label
        if evo_ok and gpt_ok:
            score_note = "✅ Both Evo and GPT-3.5 were correct."
        elif evo_ok:
            score_note = "🟢 Evo was correct. GPT-3.5 was wrong."
        elif gpt_ok:
            score_note = "🟢 GPT-3.5 was correct. Evo was wrong."
        else:
            score_note = "❌ Both were incorrect."
    else:
        score_note = "⚠️ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."
    return f"🧠 Evo: {evo}", f"🤖 GPT-3.5: {gpt}", verdict, score_note
# Curated demo items for the UI, each [goal, solution 1, solution 2, correct label].
# All four use "Solution 1" as ground truth — the physically sensible choice.
examples = [
["Start a fire", "Use a match", "Pour water", "Solution 1"],
["Warm up food", "Use microwave", "Put it in fridge", "Solution 1"],
["Charge a phone", "Plug it in", "Put it on grass", "Solution 1"],
["Stop a car", "Press the brake", "Press the horn", "Solution 1"]
]
# ---------------------------------------------------------------------------
# Gradio UI: collect a PIQA-style item, run both models via `compare`, and
# display the answers, an agreement verdict, and a correctness note.
# Component creation order determines the on-page layout.
# ---------------------------------------------------------------------------
with gr.Blocks(title="⚔️ Evo vs GPT-3.5 – Real-Time Commonsense Showdown") as demo:
    gr.Markdown("""
# 🧠 EvoTransformer v2.1
**PIQA Accuracy:** 69.7%     |     **Model Size:** ~13M Parameters     |     **Baseline:** GPT-3.5 ≈ 81%
EvoTransformer is a scratch-built reasoning model trained on just 1K PIQA examples. No pretraining. No fine-tuning. Pure evolution.
Compare its decisions with GPT-3.5 in real-time and witness how intelligence can emerge even from lean, efficient architectures.
""")

    # Inputs: the task, the two candidate solutions, and the ground truth.
    with gr.Row():
        goal = gr.Textbox(label="Goal")
    with gr.Row():
        sol1 = gr.Textbox(label="Solution 1")
        sol2 = gr.Textbox(label="Solution 2")
    correct = gr.Textbox(label="Correct Answer (Solution 1 or Solution 2)")

    # Outputs, in the same order `compare` returns its four values.
    evo_output = gr.Textbox(label="EvoTransformer Response")
    gpt_output = gr.Textbox(label="GPT-3.5 Response")
    verdict_output = gr.Textbox(label="Verdict")
    score_output = gr.Textbox(label="Correctness Evaluation")

    submit = gr.Button("Submit")
    submit.click(
        fn=compare,
        inputs=[goal, sol1, sol2, correct],
        outputs=[evo_output, gpt_output, verdict_output, score_output],
    )

    gr.Markdown("### 🔍 Examples:")
    gr.Examples(
        examples=examples,
        inputs=[goal, sol1, sol2, correct],
        outputs=[evo_output, gpt_output, verdict_output, score_output],
        fn=compare,
        cache_examples=False,
    )

    gr.Markdown("""
> 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*
---
Made with ❤️ by Dr. Heman Mohabeer — EvoTransformer is not just code. It's evolution.
""")

demo.launch()