HemanM's picture
Update app.py
7c4d437 verified
raw
history blame
3.93 kB
# βœ… Evo Showcase Mode: Full Evaluation with Correct Answer Comparison
import os

import gradio as gr
import openai

from inference import predict as evo_predict
# πŸ” SET YOUR GPT-3.5 API KEY HERE
openai.api_key = "sk-..." # Replace with your actual key
client = openai.OpenAI()
def gpt_predict(prompt):
    """Ask GPT-3.5 which solution is better and return its stripped reply.

    Any failure (auth, network, quota, ...) is converted into a
    "GPT Error: ..." string so the Gradio UI never crashes.
    """
    system_msg = "You're a commonsense reasoning assistant. Only say: Solution 1 or Solution 2."
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt},
            ],
        )
        reply = completion.choices[0].message.content.strip()
    except Exception as exc:  # best-effort: surface the error text in the UI
        return f"GPT Error: {str(exc)}"
    return reply
def compare(goal, sol1, sol2, correct):
    """Run Evo and GPT-3.5 on one PIQA-style item and score both answers.

    Parameters
    ----------
    goal, sol1, sol2 : str
        The task goal and the two candidate solutions.
    correct : str
        Ground-truth label, "Solution 1" or "Solution 2"
        (case- and whitespace-insensitive).

    Returns
    -------
    tuple[str, str, str, str]
        (Evo answer, GPT answer, agreement verdict, correctness note).
    """
    # Guard: all three text fields are required before calling the models.
    if not goal.strip() or not sol1.strip() or not sol2.strip():
        return "⚠️ Please provide all inputs.", "", "", ""
    prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
    evo = evo_predict(goal, sol1, sol2)
    gpt = gpt_predict(prompt)
    if evo == gpt:
        verdict = "✅ Evo agrees with GPT-3.5"
    else:
        verdict = "⚖️ Evo disagrees with GPT-3.5 — explore why."
    # Normalize once so "solution 1", " Solution 1 ", etc. all compare equal.
    # Previously the validity check was case-insensitive but the scoring
    # comparisons were exact-match, so a lowercase ground truth silently
    # marked both models wrong.
    label = correct.strip().lower()
    if label in ("solution 1", "solution 2"):
        evo_ok = evo.strip().lower() == label
        gpt_ok = gpt.strip().lower() == label
        if evo_ok and gpt_ok:
            score_note = "✅ Both Evo and GPT-3.5 were correct."
        elif evo_ok:
            score_note = "🟢 Evo was correct. GPT-3.5 was wrong."
        elif gpt_ok:
            score_note = "🟢 GPT-3.5 was correct. Evo was wrong."
        else:
            score_note = "❌ Both were incorrect."
    else:
        score_note = "⚠️ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."
    return f"🧠 Evo: {evo}", f"🤖 GPT-3.5: {gpt}", verdict, score_note
# Curated demo items for the UI, each [goal, solution 1, solution 2, correct label].
# All four use "Solution 1" as ground truth — the physically sensible choice.
examples = [
["Start a fire", "Use a match", "Pour water", "Solution 1"],
["Warm up food", "Use microwave", "Put it in fridge", "Solution 1"],
["Charge a phone", "Plug it in", "Put it on grass", "Solution 1"],
["Stop a car", "Press the brake", "Press the horn", "Solution 1"]
]
# ---------------------------------------------------------------------------
# Gradio UI: collect a PIQA-style item, run both models via `compare`, and
# display the answers, an agreement verdict, and a correctness note.
# Component creation order determines the on-page layout.
# ---------------------------------------------------------------------------
with gr.Blocks(title="⚔️ Evo vs GPT-3.5 – Real-Time Commonsense Showdown") as demo:
    gr.Markdown("""
# 🧠 EvoTransformer v2.1
**PIQA Accuracy:** 69.7%     |     **Model Size:** ~13M Parameters     |     **Baseline:** GPT-3.5 ≈ 81%
EvoTransformer is a scratch-built reasoning model trained on just 1K PIQA examples. No pretraining. No fine-tuning. Pure evolution.
Compare its decisions with GPT-3.5 in real-time and witness how intelligence can emerge even from lean, efficient architectures.
""")

    # Inputs: the task, the two candidate solutions, and the ground truth.
    with gr.Row():
        goal = gr.Textbox(label="Goal")
    with gr.Row():
        sol1 = gr.Textbox(label="Solution 1")
        sol2 = gr.Textbox(label="Solution 2")
    correct = gr.Textbox(label="Correct Answer (Solution 1 or Solution 2)")

    # Outputs, in the same order `compare` returns its four values.
    evo_output = gr.Textbox(label="EvoTransformer Response")
    gpt_output = gr.Textbox(label="GPT-3.5 Response")
    verdict_output = gr.Textbox(label="Verdict")
    score_output = gr.Textbox(label="Correctness Evaluation")

    submit = gr.Button("Submit")
    submit.click(
        fn=compare,
        inputs=[goal, sol1, sol2, correct],
        outputs=[evo_output, gpt_output, verdict_output, score_output],
    )

    gr.Markdown("### 🔍 Examples:")
    gr.Examples(
        examples=examples,
        inputs=[goal, sol1, sol2, correct],
        outputs=[evo_output, gpt_output, verdict_output, score_output],
        fn=compare,
        cache_examples=False,
    )

    gr.Markdown("""
> 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*
---
Made with ❤️ by Dr. Heman Mohabeer — EvoTransformer is not just code. It's evolution.
""")

demo.launch()