import json
import re

from transformers import AutoTokenizer, AutoModelForCausalLM

# Global variables for caching the model and tokenizer.
tokenizer, model = None, None


def load_model():
    global tokenizer, model
    if tokenizer is None or model is None:
        # Use the DeepSeek instruct model for code evaluation.
        model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model
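
# Note: from_pretrained downloads the weights on first use and caches them
# locally, so only the first call to load_model() is slow. On a machine with a
# CUDA GPU you could (assumption, not part of the original script) move the
# model to the GPU for faster generation, e.g.:
#   import torch
#   if torch.cuda.is_available():
#       model = model.to("cuda")
# (the tokenized inputs would then also need .to("cuda") before generate).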

def extract_json(response_text):
    # First, try to extract all JSON blocks using regex with DOTALL.
    # The non-greedy pattern assumes a flat object with no nested braces,
    # which matches the two-key format the prompt requests.
    matches = re.findall(r'\{.*?\}', response_text, re.DOTALL)
    for m in matches:
        json_text = m.strip()
        try:
            temp = json.loads(json_text)
            if isinstance(temp, dict) and "stars" in temp and "feedback" in temp:
                return temp
        except Exception:
            continue
    # Fallback: split the text on "Your response:" and parse what follows.
    parts = response_text.split("Your response:")
    if len(parts) > 1:
        possible = parts[-1].strip()
        # Try to parse this part directly as JSON.
        try:
            temp = json.loads(possible)
            if isinstance(temp, dict) and "stars" in temp and "feedback" in temp:
                return temp
        except Exception:
            # If that fails, try the regex on this part.
            matches = re.findall(r'\{.*?\}', possible, re.DOTALL)
            for m in matches:
                json_text = m.strip()
                try:
                    temp = json.loads(json_text)
                    if isinstance(temp, dict) and "stars" in temp and "feedback" in temp:
                        return temp
                except Exception:
                    continue
    # If all methods fail, return a fallback result.
    return {"stars": 0, "feedback": "Evaluation failed. Unable to extract valid JSON from AI response."}

def evaluate_code(question, code):
    prompt = f"""You are an expert code evaluator.
Evaluate the following solution for the given problem.
Rate the solution as follows:
- 5 stars: Perfect solution; it is correct, efficient, and follows best practices.
- 4 stars: Correct solution with minor issues or room for improvement.
- 3 stars: Partially correct solution with noticeable issues.
- 2 stars: Incorrect solution with some correct elements.
- 1 star: Mostly incorrect solution.
- 0 stars: Completely incorrect solution.
Respond with exactly one JSON object (with no extra text) that has exactly two keys:
"stars": an integer between 0 and 5,
"feedback": a concise string message explaining your rating.
The JSON must start with '{{' and end with '}}'.
Do not output any additional text.
Question: "{question}"
Solution: "{code}"
Your response:"""
    tokenizer, model = load_model()
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=120,  # Allow enough tokens for a complete JSON response
        temperature=0.2,     # Low temperature keeps the rating nearly deterministic
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )
    # Note: the decoded text includes the echoed prompt; extract_json's
    # "Your response:" fallback accounts for this.
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Raw model response:", response_text)  # Debug output
    result = extract_json(response_text)
    return result
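
# Illustrative call (results vary between runs because do_sample=True):
#   evaluate_code("Write a function that reverses a string.",
#                 "def rev(s): return s[::-1]")
#   -> e.g. {"stars": 5, "feedback": "Correct and idiomatic."}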

# For direct command-line testing.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print(json.dumps({"error": "Please provide a question and code as arguments"}))
        sys.exit(1)
    question = sys.argv[1]
    code = sys.argv[2]
    result = evaluate_code(question, code)
    print(json.dumps(result))
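
# Example invocation (hypothetical file name and arguments):
#   python evaluator.py "Reverse a string" "def rev(s): return s[::-1]"
# prints a JSON object such as {"stars": ..., "feedback": "..."}.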