Commit 35d31e1
Parent(s): b72b033
Refine evaluation prompt and extraction for DeepSeek instruct model
tinyllama_inference.py CHANGED (+16 -9)
@@ -15,13 +15,21 @@ def load_model():
     return tokenizer, model
 
 def evaluate_code(question, code):
+    # Refined prompt with explicit rating criteria.
     prompt = f"""You are an expert code evaluator.
 Evaluate the following solution for the given problem.
+Rate the solution as follows:
+- 5 stars: Perfect solution; it is correct, efficient, and follows best practices.
+- 4 stars: Correct solution with minor issues or improvements possible.
+- 3 stars: Partially correct solution with noticeable issues.
+- 2 stars: Incorrect solution with some correct elements.
+- 1 star: Mostly incorrect solution.
+- 0 stars: Completely incorrect solution.
 Respond with exactly one JSON object (with no extra text) that has exactly two keys:
-"stars": an integer between 0 and 5
-"feedback": a concise string message.
+"stars": an integer between 0 and 5,
+"feedback": a concise string message explaining your rating.
 The JSON must start with '{{' and end with '}}'.
-Do not output
+Do not output any additional text.
 Question: "{question}"
 Solution: "{code}"
 Your response:"""
@@ -30,21 +38,20 @@ Your response:"""
     inputs = tokenizer(prompt, return_tensors="pt")
     outputs = model.generate(
         **inputs,
-        max_new_tokens=
-        temperature=0.
+        max_new_tokens=120,  # Increase token allowance for a complete evaluation
+        temperature=0.1,  # A low temperature for more deterministic output
         pad_token_id=tokenizer.eos_token_id,
-        do_sample=True  # Enable sampling to
+        do_sample=True  # Enable sampling to allow some creativity
     )
     response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    print("Raw model response:", response_text)  # Debug output
+    print("Raw model response:", response_text)  # Debug: inspect raw output
 
-    #
+    # Extract all JSON objects using non-greedy regex and select the one with expected keys
     matches = re.findall(r'\{.*?\}', response_text)
     result = None
    for m in matches:
        try:
            temp = json.loads(m)
-            # Check that the parsed JSON contains both expected keys
             if isinstance(temp, dict) and "stars" in temp and "feedback" in temp:
                 result = temp
                 break
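For context, the extraction step this commit tightens can be exercised on its own. The sketch below mirrors the regex-plus-json.loads loop from the diff; the helper name extract_evaluation, the explicit JSONDecodeError handling, and the fallback dictionary are illustrative assumptions added here and are not part of the committed file.

# Standalone sketch of the JSON-extraction step from this commit (assumptions noted above).
import json
import re

def extract_evaluation(response_text):
    # Same non-greedy pattern as the diff: it captures flat, single-line JSON
    # objects and will not match nested braces or objects split across lines.
    matches = re.findall(r'\{.*?\}', response_text)
    for m in matches:
        try:
            candidate = json.loads(m)
        except json.JSONDecodeError:
            continue  # skip brace-delimited fragments that are not valid JSON
        # Keep only objects with the two keys the refined prompt asks for.
        if isinstance(candidate, dict) and "stars" in candidate and "feedback" in candidate:
            return candidate
    # Assumed fallback when no usable object is found in the model output.
    return {"stars": 0, "feedback": "Could not parse a valid evaluation from the model output."}

if __name__ == "__main__":
    raw = 'Your response: {"stars": 4, "feedback": "Correct, but naming could be clearer."} extra text'
    print(extract_evaluation(raw))

The key check matters because, for a causal LM, model.generate returns the prompt tokens followed by the new tokens, so the decoded text still contains the prompt itself, including the literal braces produced by the '{{' and '}}' instruction; the first brace-delimited match is therefore not necessarily the evaluation object.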