Spaces:

HemanM
/

EvoTransformer-v2.1

Sleeping

File size: 3,932 Bytes

7c4d437
e127a0b
312dbba
2eed809
 
312dbba
2eed809
e6e2360
2eed809
23012c7
 
2eed809
 
23012c7
2eed809
 
23012c7
2eed809
 
 
23012c7
2eed809
 
e256998
7c4d437
e127a0b
7c4d437
e256998
2eed809
 
 
e256998
2eed809
 
 
 
e256998
7c4d437
 
 
 
 
 
 
 
 
 
 
 
 
e256998
2eed809
7c4d437
 
 
 
2eed809
 
e6e2360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c4d437
e6e2360
 
 
 
7c4d437
e6e2360
 
7c4d437
e6e2360
7c4d437
 
e6e2360

# ✅ Evo Showcase Mode: Full Evaluation with Correct Answer Comparison

import os

import gradio as gr
import openai

from inference import predict as evo_predict

# 🔐 The OpenAI key is read from the OPENAI_API_KEY environment variable
# (configure it as a deployment secret) instead of being pasted into source.
# The "sk-..." placeholder is kept only as a fallback so the module still
# imports without a key; requests made with it are expected to be rejected
# by the API rather than crash the app at startup.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-...")

# An explicitly constructed client does not pick up the module-level
# `openai.api_key`, so the key has to be passed in here.
client = openai.OpenAI(api_key=openai.api_key)

def gpt_predict(prompt):
    """Ask gpt-3.5-turbo which solution is better and return its reply.

    Args:
        prompt: User message describing the goal and the two solutions.

    Returns:
        The model's reply with surrounding whitespace removed, or a string
        starting with "GPT Error:" when the request fails or the reply has
        no text. This function never raises; failures are reported in the
        returned string so they can be shown in the UI.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You're a commonsense reasoning assistant. Only say: Solution 1 or Solution 2."},
                {"role": "user", "content": prompt},
            ],
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberate catch-all at the UI boundary: any failure becomes text
        # for the output box instead of an unhandled exception.
        return f"GPT Error: {e}"

    # The message content may be absent; report that explicitly rather than
    # failing on `.strip()` with an unrelated-looking attribute error.
    if content is None:
        return "GPT Error: empty response from model"
    return content.strip()

def _choice_label(answer):
    """Return "solution 1" or "solution 2" if *answer* names exactly one of them, else None."""
    text = " ".join(str(answer).lower().split())
    found = [label for label in ("solution 1", "solution 2") if label in text]
    return found[0] if len(found) == 1 else None


def compare(goal, sol1, sol2, correct):
    """Run Evo and GPT-3.5 on one task and report agreement and correctness.

    Args:
        goal: Text describing the goal.
        sol1: Text of the first candidate solution.
        sol2: Text of the second candidate solution.
        correct: Expected answer, "Solution 1" or "Solution 2" (case and
            surrounding whitespace are ignored).

    Returns:
        A 4-tuple of strings: Evo's answer, GPT-3.5's answer, an agreement
        verdict, and a correctness note. If any of goal/sol1/sol2 is missing
        or blank, the first element is a warning and the rest are empty.
    """
    # Treat a missing value (None) the same as an empty textbox.
    if not all((value or "").strip() for value in (goal, sol1, sol2)):
        return "⚠️ Please provide all inputs.", "", "", ""

    prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
    evo = evo_predict(goal, sol1, sol2)
    gpt = gpt_predict(prompt)

    # Compare canonical labels rather than raw text, so that differences in
    # case, whitespace or trailing punctuation ("Solution 1.") do not count
    # as disagreement or as a wrong answer.
    # NOTE(review): evo_predict comes from another module; this assumes its
    # result reads as "Solution 1"/"Solution 2" when converted to text.
    evo_label = _choice_label(evo)
    gpt_label = _choice_label(gpt)

    # Raw equality is kept as a fallback for outputs that carry no label.
    if evo == gpt or (evo_label is not None and evo_label == gpt_label):
        verdict = "✅ Evo agrees with GPT-3.5"
    else:
        verdict = "⚖️ Evo disagrees with GPT-3.5 — explore why."

    correct_label = (correct or "").strip().lower()
    if correct_label in ("solution 1", "solution 2"):
        evo_ok = evo_label == correct_label
        gpt_ok = gpt_label == correct_label
        if evo_ok and gpt_ok:
            score_note = "✅ Both Evo and GPT-3.5 were correct."
        elif evo_ok:
            score_note = "🟢 Evo was correct. GPT-3.5 was wrong."
        elif gpt_ok:
            score_note = "🟢 GPT-3.5 was correct. Evo was wrong."
        else:
            score_note = "❌ Both were incorrect."
    else:
        score_note = "⚠️ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."

    return f"🧠 Evo: {evo}", f"🤖 GPT-3.5: {gpt}", verdict, score_note

# Sample rows for the examples widget; each row is
# (goal, solution 1, solution 2, correct answer), converted to a list.
examples = [
    list(row)
    for row in (
        ("Start a fire", "Use a match", "Pour water", "Solution 1"),
        ("Warm up food", "Use microwave", "Put it in fridge", "Solution 1"),
        ("Charge a phone", "Plug it in", "Put it on grass", "Solution 1"),
        ("Stop a car", "Press the brake", "Press the horn", "Solution 1"),
    )
]

# Page layout: intro text, the four input boxes, the four result boxes, a
# submit button wired to `compare`, clickable examples, and a footer note.
with gr.Blocks(title="⚔️ Evo vs GPT-3.5 – Real-Time Commonsense Showdown") as demo:
    gr.Markdown("""
    # 🧠 EvoTransformer v2.1
    **PIQA Accuracy:** 69.7% &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;&nbsp; **Model Size:** ~13M Parameters &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;&nbsp; **Baseline:** GPT-3.5 ≈ 81%

    EvoTransformer is a scratch-built reasoning model trained on just 1K PIQA examples. No pretraining. No fine-tuning. Pure evolution.

    Compare its decisions with GPT-3.5 in real-time and witness how intelligence can emerge even from lean, efficient architectures.
    """)

    with gr.Row():
        goal = gr.Textbox(label="Goal")
    with gr.Row():
        sol1 = gr.Textbox(label="Solution 1")
        sol2 = gr.Textbox(label="Solution 2")
    correct = gr.Textbox(label="Correct Answer (Solution 1 or Solution 2)")

    evo_output = gr.Textbox(label="EvoTransformer Response")
    gpt_output = gr.Textbox(label="GPT-3.5 Response")
    verdict_output = gr.Textbox(label="Verdict")
    score_output = gr.Textbox(label="Correctness Evaluation")

    # Both the button and the examples widget use the same components, in the
    # order of compare()'s parameters and of its returned 4-tuple.
    input_boxes = [goal, sol1, sol2, correct]
    output_boxes = [evo_output, gpt_output, verdict_output, score_output]

    submit = gr.Button("Submit")
    submit.click(fn=compare, inputs=input_boxes, outputs=output_boxes)

    gr.Markdown("### 🔍 Examples:")
    gr.Examples(
        examples=examples,
        inputs=input_boxes,
        outputs=output_boxes,
        fn=compare,
        cache_examples=False,
    )

    gr.Markdown("""
    > 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*

    ---
    Made with ❤️ by Dr. Heman Mohabeer — EvoTransformer is not just code. It's evolution.
    """)

# Launch after the layout context has closed, and only when this file is run
# as a script, so importing the module does not start a server.
if __name__ == "__main__":
    demo.launch()