Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# ✅ Evo Showcase Mode:
|
2 |
|
3 |
import gradio as gr
|
4 |
import openai
|
@@ -7,7 +7,6 @@ from inference import predict as evo_predict
|
|
7 |
# π SET YOUR GPT-3.5 API KEY HERE
|
8 |
openai.api_key = "sk-..." # Replace with your actual key
|
9 |
|
10 |
-
# ✅ Use the new openai>=1.0.0 API
|
11 |
client = openai.OpenAI()
|
12 |
|
13 |
def gpt_predict(prompt):
|
@@ -23,9 +22,9 @@ def gpt_predict(prompt):
|
|
23 |
except Exception as e:
|
24 |
return f"GPT Error: {str(e)}"
|
25 |
|
26 |
-
def compare(goal, sol1, sol2):
|
27 |
if not goal.strip() or not sol1.strip() or not sol2.strip():
|
28 |
-
return "β οΈ Please provide all inputs.", "", ""
|
29 |
|
30 |
prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
|
31 |
evo = evo_predict(goal, sol1, sol2)
|
@@ -36,13 +35,25 @@ def compare(goal, sol1, sol2):
|
|
36 |
else:
|
37 |
verdict = "βοΈ Evo disagrees with GPT-3.5 β explore why."
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
examples = [
|
42 |
-
["Start a fire", "Use a match", "Pour water"],
|
43 |
-
["Warm up food", "Use microwave", "Put it in fridge"],
|
44 |
-
["Charge a phone", "Plug it in", "Put it on grass"],
|
45 |
-
["Stop a car", "Press the brake", "Press the horn"]
|
46 |
]
|
47 |
|
48 |
with gr.Blocks(title="βοΈ Evo vs GPT-3.5 β Real-Time Commonsense Showdown") as demo:
|
@@ -60,15 +71,18 @@ with gr.Blocks(title="βοΈ Evo vs GPT-3.5 β Real-Time Commonsense Showdown")
|
|
60 |
with gr.Row():
|
61 |
sol1 = gr.Textbox(label="Solution 1")
|
62 |
sol2 = gr.Textbox(label="Solution 2")
|
|
|
63 |
|
64 |
evo_output = gr.Textbox(label="EvoTransformer Response")
|
65 |
gpt_output = gr.Textbox(label="GPT-3.5 Response")
|
66 |
verdict_output = gr.Textbox(label="Verdict")
|
|
|
67 |
|
68 |
submit = gr.Button("Submit")
|
69 |
-
submit.click(fn=compare, inputs=[goal, sol1, sol2], outputs=[evo_output, gpt_output, verdict_output])
|
70 |
|
71 |
-
gr.
|
|
|
72 |
|
73 |
gr.Markdown("""
|
74 |
> 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*
|
|
|
1 |
+
# ✅ Evo Showcase Mode: Full Evaluation with Correct Answer Comparison
|
2 |
|
3 |
import gradio as gr
|
4 |
import openai
|
|
|
7 |
# π SET YOUR GPT-3.5 API KEY HERE
|
8 |
openai.api_key = "sk-..." # Replace with your actual key
|
9 |
|
|
|
10 |
client = openai.OpenAI()
|
11 |
|
12 |
def gpt_predict(prompt):
|
|
|
22 |
except Exception as e:
|
23 |
return f"GPT Error: {str(e)}"
|
24 |
|
25 |
+
def compare(goal, sol1, sol2, correct):
|
26 |
if not goal.strip() or not sol1.strip() or not sol2.strip():
|
27 |
+
return "β οΈ Please provide all inputs.", "", "", ""
|
28 |
|
29 |
prompt = f"Goal: {goal}\nSolution 1: {sol1}\nSolution 2: {sol2}\nWhich is better?"
|
30 |
evo = evo_predict(goal, sol1, sol2)
|
|
|
35 |
else:
|
36 |
verdict = "βοΈ Evo disagrees with GPT-3.5 β explore why."
|
37 |
|
38 |
+
if correct.strip().lower() in ["solution 1", "solution 2"]:
|
39 |
+
if evo == correct and gpt == correct:
|
40 |
+
score_note = "β
Both Evo and GPT-3.5 were correct."
|
41 |
+
elif evo == correct:
|
42 |
+
score_note = "π’ Evo was correct. GPT-3.5 was wrong."
|
43 |
+
elif gpt == correct:
|
44 |
+
score_note = "π’ GPT-3.5 was correct. Evo was wrong."
|
45 |
+
else:
|
46 |
+
score_note = "β Both were incorrect."
|
47 |
+
else:
|
48 |
+
score_note = "β οΈ Correct answer not provided or invalid (must be 'Solution 1' or 'Solution 2')."
|
49 |
+
|
50 |
+
return f"π§ Evo: {evo}", f"π€ GPT-3.5: {gpt}", verdict, score_note
|
51 |
|
52 |
examples = [
|
53 |
+
["Start a fire", "Use a match", "Pour water", "Solution 1"],
|
54 |
+
["Warm up food", "Use microwave", "Put it in fridge", "Solution 1"],
|
55 |
+
["Charge a phone", "Plug it in", "Put it on grass", "Solution 1"],
|
56 |
+
["Stop a car", "Press the brake", "Press the horn", "Solution 1"]
|
57 |
]
|
58 |
|
59 |
with gr.Blocks(title="βοΈ Evo vs GPT-3.5 β Real-Time Commonsense Showdown") as demo:
|
|
|
71 |
with gr.Row():
|
72 |
sol1 = gr.Textbox(label="Solution 1")
|
73 |
sol2 = gr.Textbox(label="Solution 2")
|
74 |
+
correct = gr.Textbox(label="Correct Answer (Solution 1 or Solution 2)")
|
75 |
|
76 |
evo_output = gr.Textbox(label="EvoTransformer Response")
|
77 |
gpt_output = gr.Textbox(label="GPT-3.5 Response")
|
78 |
verdict_output = gr.Textbox(label="Verdict")
|
79 |
+
score_output = gr.Textbox(label="Correctness Evaluation")
|
80 |
|
81 |
submit = gr.Button("Submit")
|
82 |
+
submit.click(fn=compare, inputs=[goal, sol1, sol2, correct], outputs=[evo_output, gpt_output, verdict_output, score_output])
|
83 |
|
84 |
+
gr.Markdown("### π Examples:")
|
85 |
+
gr.Examples(examples=examples, inputs=[goal, sol1, sol2, correct], outputs=[evo_output, gpt_output, verdict_output, score_output], fn=compare, cache_examples=False)
|
86 |
|
87 |
gr.Markdown("""
|
88 |
> 🧪 *Note: EvoTransformer is a scratch-built model trained on 1K PIQA examples. It may occasionally misinterpret context or idioms. That’s part of its evolution.*
|