Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,94 +1,89 @@
|
|
1 |
-
# ✅ Evo Showcase Mode: Full Evaluation with Correct Answer Comparison
|
2 |
-
|
3 |
import gradio as gr
|
|
|
|
|
4 |
import openai
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
-
|
8 |
-
|
|
|
9 |
|
10 |
-
|
|
|
|
|
11 |
|
12 |
-
|
|
|
13 |
try:
|
14 |
-
response =
|
15 |
model="gpt-3.5-turbo",
|
16 |
-
messages=[
|
17 |
-
|
18 |
-
{"role": "user", "content": prompt}
|
19 |
-
]
|
20 |
)
|
21 |
-
|
|
|
22 |
except Exception as e:
|
23 |
-
return f"GPT Error: {
|
24 |
-
|
25 |
-
def compare(goal, sol1, sol2, correct):
|
26 |
-
if not goal.strip() or not sol1.strip() or not sol2.strip():
|
27 |
-
return "β οΈ Please provide all inputs.", "", "", ""
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
if evo == gpt:
|
34 |
-
verdict = "β
Evo agrees with GPT-3.5"
|
35 |
-
else:
|
36 |
-
verdict = "βοΈ Evo disagrees with GPT-3.5 β explore why."
|
37 |
-
|
38 |
-
if correct.strip().lower() in ["solution 1", "solution 2"]:
|
39 |
-
if evo == correct and gpt == correct:
|
40 |
-
score_note = "β
Both Evo and GPT-3.5 were correct."
|
41 |
-
elif evo == correct:
|
42 |
-
score_note = "π’ Evo was correct. GPT-3.5 was wrong."
|
43 |
-
elif gpt == correct:
|
44 |
-
score_note = "π’ GPT-3.5 was correct. Evo was wrong."
|
45 |
-
else:
|
46 |
-
score_note = "β Both were incorrect."
|
47 |
-
else:
|
48 |
-
score_note = "β οΈ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."
|
49 |
|
50 |
-
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
["Warm up food", "Use microwave", "Put it in fridge", "Solution 1"],
|
55 |
-
["Charge a phone", "Plug it in", "Put it on grass", "Solution 1"],
|
56 |
-
["Stop a car", "Press the brake", "Press the horn", "Solution 1"]
|
57 |
-
]
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
63 |
|
64 |
-
|
65 |
|
66 |
-
|
67 |
-
""
|
|
|
|
|
68 |
|
69 |
with gr.Row():
|
70 |
goal = gr.Textbox(label="Goal")
|
71 |
with gr.Row():
|
72 |
sol1 = gr.Textbox(label="Solution 1")
|
73 |
sol2 = gr.Textbox(label="Solution 2")
|
74 |
-
correct = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
|
77 |
-
gpt_output = gr.Textbox(label="GPT-3.5 Response")
|
78 |
-
verdict_output = gr.Textbox(label="Verdict")
|
79 |
-
score_output = gr.Textbox(label="Correctness Evaluation")
|
80 |
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
-
gr.
|
85 |
-
|
|
|
|
|
|
|
86 |
|
87 |
-
|
88 |
-
> 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*
|
89 |
|
90 |
-
|
91 |
-
Made with ❤️ by Dr. Heman Mohabeer — EvoTransformer is not just code. It's evolution.
|
92 |
-
""")
|
93 |
|
94 |
-
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from inference import predict
|
3 |
+
from logger import log_interaction
|
4 |
import openai
|
5 |
+
import os
|
6 |
+
|
7 |
+
# --- OpenAI credentials
# Read the key from the environment only (e.g. a secret configured for the
# deployment). No literal fallback is kept here: a placeholder key cannot
# authenticate anyway, and a real key must never be hard-coded in source.
# When the variable is unset this is None and gpt3_predict() reports the
# resulting API failure as a "GPT Error: ..." string.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
9 |
|
10 |
+
def gpt3_predict(goal, sol1, sol2):
    """Ask GPT-3.5 which of two candidate solutions better achieves a goal.

    Args:
        goal: Text describing the goal.
        sol1: Text of the first candidate solution.
        sol2: Text of the second candidate solution.

    Returns:
        ``"Solution 1"`` or ``"Solution 2"`` when the model's reply starts with
        one of those labels (ignoring case and surrounding quotes/punctuation);
        otherwise the stripped reply text unchanged. If anything raises while
        calling the API or reading the response, the string
        ``"GPT Error: <exception message>"`` is returned instead of raising.
    """
    # The choices are labelled exactly like the answers we ask for, so the
    # model is not told about "Option A/B" and then asked for "Solution 1/2".
    prompt = f"""You are solving a commonsense reasoning task.
Given a goal and two possible solutions, choose which solution makes more sense.

Goal: {goal}
Solution 1: {sol1}
Solution 2: {sol2}

Which solution is better? Reply only with "Solution 1" or "Solution 2"."""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10,
        )
        answer = response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: any failure (network, auth, malformed response)
        # is surfaced to the UI as text rather than crashing the request.
        return f"GPT Error: {e}"

    # compare() scores predictions by exact string equality, so map harmless
    # variations ('solution 1', '"Solution 1".') onto the canonical label.
    normalized = answer.strip(" \t\r\n\"'`*.").casefold()
    for label in ("Solution 1", "Solution 2"):
        if normalized.startswith(label.casefold()):
            return label
    return answer
|
|
|
|
|
|
|
|
|
30 |
|
31 |
+
def compare(goal, sol1, sol2, correct_answer):
    """Run EvoTransformer and GPT-3.5 on one example and build a verdict.

    Args:
        goal: Text describing the goal.
        sol1: Text of the first candidate solution.
        sol2: Text of the second candidate solution.
        correct_answer: The known correct label (the UI offers "Solution 1" and
            "Solution 2"), or a falsy value such as ``None`` when unknown.

    Returns:
        A 3-tuple ``(evo, gpt, verdict)``: the value returned by ``predict``,
        the value returned by ``gpt3_predict``, and a verdict string. When the
        goal or either solution is missing or blank, both predictions are empty
        strings and the verdict asks for complete input.
    """
    # Reject incomplete input before doing any work, so that blank fields do
    # not trigger a model run, an OpenAI request or a log entry.
    if not all((text or "").strip() for text in (goal, sol1, sol2)):
        return "", "", "⚠️ Please provide all inputs."

    # EvoTransformer prediction
    evo = predict(goal, sol1, sol2)

    # GPT-3.5 prediction
    gpt = gpt3_predict(goal, sol1, sol2)

    # Log feedback
    log_interaction(goal, sol1, sol2, evo, gpt, correct_answer)

    # Without a reference answer there is nothing to score.
    if not correct_answer:
        return evo, gpt, "⚖️ Evo and GPT-3.5 predictions compared."

    evo_line = (
        "✅ Evo was RIGHT ✅" if evo == correct_answer else "❌ Evo was WRONG ❌"
    )

    if isinstance(gpt, str) and gpt.startswith("GPT Error:"):
        # gpt3_predict reports failures as "GPT Error: ..." text; an API
        # failure is not a wrong answer, so do not score it as one.
        gpt_line = "⚠️ GPT-3.5 could not be scored (API error)"
    elif gpt == correct_answer:
        gpt_line = "✅ GPT-3.5 was RIGHT ✅"
    else:
        gpt_line = "❌ GPT-3.5 was WRONG ❌"

    return evo, gpt, f"{evo_line}\n{gpt_line}"
|
50 |
|
51 |
+
with gr.Blocks() as demo:
|
52 |
+
gr.Markdown("## βοΈ Evo vs GPT-3.5 β Real-Time Commonsense Showdown")
|
53 |
+
gr.Markdown("> π§ EvoTransformer v2.1 β PIQA Accuracy: 69.7% (vs GPT-3.5 β 81%) Β· 13M Parameters Β· Fully Scratch-Trained Β· Leans Smart")
|
54 |
+
gr.Markdown("> π§ͺ *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. Thatβs part of its evolution.*")
|
55 |
|
56 |
with gr.Row():
|
57 |
goal = gr.Textbox(label="Goal")
|
58 |
with gr.Row():
|
59 |
sol1 = gr.Textbox(label="Solution 1")
|
60 |
sol2 = gr.Textbox(label="Solution 2")
|
61 |
+
correct = gr.Radio(choices=["Solution 1", "Solution 2", None], label="β
Correct Answer (if known)", value=None)
|
62 |
+
|
63 |
+
btn = gr.Button("Submit")
|
64 |
+
|
65 |
+
with gr.Row():
|
66 |
+
evo_out = gr.Textbox(label="π§ EvoTransformer Response")
|
67 |
+
gpt_out = gr.Textbox(label="π€ GPT-3.5 Response")
|
68 |
|
69 |
+
verdict = gr.Textbox(label="Verdict", interactive=False)
|
|
|
|
|
|
|
70 |
|
71 |
+
examples = [
|
72 |
+
["Start a fire", "Use a match", "Pour water", "Solution 1"],
|
73 |
+
["Warm up food", "Use microwave", "Put it in fridge", "Solution 1"],
|
74 |
+
["Charge a phone", "Plug it in", "Put it on grass", "Solution 1"],
|
75 |
+
["Get rid of bad smell", "Open window", "Close door", "Solution 1"],
|
76 |
+
["Find your way", "Use a map", "Close your eyes", "Solution 1"]
|
77 |
+
]
|
78 |
|
79 |
+
gr.Examples(
|
80 |
+
examples=examples,
|
81 |
+
inputs=[goal, sol1, sol2, correct],
|
82 |
+
label="π Try These Examples"
|
83 |
+
)
|
84 |
|
85 |
+
btn.click(fn=compare, inputs=[goal, sol1, sol2, correct], outputs=[evo_out, gpt_out, verdict])
|
|
|
86 |
|
87 |
+
gr.Markdown("Made with β€οΈ by Dr. Heman Mohabeer β EvoTransformer is not just code. It's evolution.")
|
|
|
|
|
88 |
|
89 |
+
demo.launch()
|