# AI code evaluator: scores a submitted solution with a local LLM and emits a JSON verdict.
import json
import re

from transformers import AutoTokenizer, AutoModelForCausalLM

# Module-level cache so the (expensive) model/tokenizer load happens at most once.
tokenizer, model = None, None


def load_model():
    """Load and cache the DeepSeek instruct model used for code evaluation.

    Returns:
        tuple: ``(tokenizer, model)`` — the cached HuggingFace tokenizer and
        causal-LM model. Subsequent calls reuse the module-level globals
        instead of re-downloading/re-loading.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        # Use the DeepSeek instruct model for code evaluation.
        model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model
def extract_json(response_text):
    """Extract the last valid evaluation JSON object from raw model output.

    Scans *response_text* for JSON objects using ``json.JSONDecoder.raw_decode``
    (which, unlike the previous non-greedy ``\\{.*?\\}`` regex, correctly handles
    brace characters inside string values and nested objects) and returns the
    last object that has both a "stars" and a "feedback" key.

    Args:
        response_text: Raw decoded text produced by the model.

    Returns:
        dict | None: The extracted ``{"stars": ..., "feedback": ...}`` object,
        or ``None`` when no valid candidate is found.
    """
    decoder = json.JSONDecoder()
    last_valid = None
    idx = response_text.find('{')
    while idx != -1:
        try:
            obj, _end = decoder.raw_decode(response_text, idx)
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict) and "stars" in obj and "feedback" in obj:
            # Keep scanning: the original behavior preferred the LAST valid match.
            last_valid = obj
        idx = response_text.find('{', idx + 1)
    return last_valid
def evaluate_code(question, code):
    """Score *code* as an answer to *question* using the cached local LLM.

    Args:
        question: The problem statement shown to the model.
        code: The candidate solution to be rated.

    Returns:
        dict: ``{"stars": int, "feedback": str}``. Falls back to a 0-star
        dict when no valid JSON can be recovered from the model output.
    """
    # Revised prompt that explicitly states the expected arithmetic operation
    # for square, so the small model does not accept additive "squares".
    prompt = f"""You are an expert code evaluator.
Evaluate the following solution for the given problem.
The problem asks for a function that returns the square of a number.
A correct solution must multiply the number by itself (using x*x or x**2).
If the solution uses any other operation (such as addition), it is completely incorrect.
Rate the solution as follows:
- 5 stars: Perfect solution; correct, efficient, and follows best practices.
- 4 stars: Correct solution with minor issues.
- 3 stars: Partially correct solution with noticeable issues.
- 2 stars: Incorrect solution with some correct elements.
- 1 star: Mostly incorrect solution.
- 0 stars: Completely incorrect solution.
Respond with exactly one JSON object (with no extra text) that has exactly two keys:
"stars": an integer between 0 and 5,
"feedback": a concise string message explaining your rating.
The JSON must start with '{{' and end with '}}'.
Do not output any additional text.
Question: "{question}"
Solution: "{code}"
Your response:"""
    tokenizer, model = load_model()
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=120,
        temperature=0.2,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    # Decode ONLY the newly generated tokens. The prompt itself contains
    # literal '{' and '}' characters, so decoding the full sequence risked
    # the JSON extractor latching onto prompt text instead of the answer.
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print("Raw model response:", response_text)  # Debug output
    result = extract_json(response_text)
    if result is None:
        result = {"stars": 0, "feedback": "Evaluation failed. Unable to extract valid JSON from AI response."}
    return result
# For direct command-line testing.
if __name__ == "__main__":
    import sys

    # Expect exactly two positional arguments: the question and the code.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print(json.dumps({"error": "Please provide a question and code as arguments"}))
        sys.exit(1)
    question, code = cli_args[0], cli_args[1]
    print(json.dumps(evaluate_code(question, code)))