# bug-explainer-ml/analyzer.py
# --- Previous chat-model implementation (kept commented out for reference): ---
# import json
# import logging
# import time
# import torch
# # Configure logging
# logger = logging.getLogger(__name__)
# def analyze_code(language, code, tokenizer, model):
# """
# Analyze code and return bug analysis with improved logging and error handling
# """
# start_time = time.time()
# logger.info(f"πŸ” Starting analysis for {language} code ({len(code)} characters)")
# try:
# # Prepare messages
# messages = [
# {
# "role": "system",
# "content": (
# "You are a helpful and expert-level AI code reviewer and bug fixer. "
# "Your task is to analyze the given buggy code in the specified programming language, "
# "identify bugs (logical, syntax, runtime, etc.), and fix them. "
# "Return a JSON object with the following keys:\n\n"
# "1. 'bug_analysis': a list of objects, each containing:\n"
# " - 'line_number': the line number (approximate if needed)\n"
# " - 'error_message': a short name of the bug\n"
# " - 'explanation': short explanation of the problem\n"
# " - 'fix_suggestion': how to fix it\n"
# "2. 'corrected_code': the entire corrected code block.\n\n"
# "Respond with ONLY the raw JSON object, no extra commentary or markdown."
# )
# },
# {
# "role": "user",
# "content": f"πŸ’» Language: {language}\n🐞 Buggy Code:\n```{language.lower()}\n{code.strip()}\n```"
# }
# ]
# logger.info("πŸ”§ Applying chat template...")
# inputs = tokenizer.apply_chat_template(
# messages,
# add_generation_prompt=True,
# return_tensors="pt"
# ).to(model.device)
# attention_mask = (inputs != tokenizer.pad_token_id).long()
# logger.info(f"πŸ“ Input length: {inputs.shape[1]} tokens")
# logger.info("πŸš€ Starting model generation...")
# generation_start = time.time()
# # Generate with more conservative settings
# with torch.no_grad(): # Ensure no gradients are computed
# outputs = model.generate(
# inputs,
# attention_mask=attention_mask,
# max_new_tokens=512, # Reduced from 1024 for faster inference
# do_sample=False,
# temperature=0.1,  # note: temperature is ignored when do_sample=False (greedy decoding)
# pad_token_id=tokenizer.eos_token_id,
# eos_token_id=tokenizer.eos_token_id,
# use_cache=True, # Enable KV cache for efficiency
# )
# generation_time = time.time() - generation_start
# logger.info(f"⚑ Generation completed in {generation_time:.2f} seconds")
# logger.info("πŸ“ Decoding response...")
# response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
# logger.info(f"πŸ“„ Response length: {len(response)} characters")
# logger.info(f"πŸ” First 100 chars: {response[:100]}...")
# # Try parsing response to JSON
# logger.info("πŸ” Attempting to parse JSON...")
# try:
# # Clean up response - remove any markdown formatting
# cleaned_response = response.strip()
# if cleaned_response.startswith('```json'):
# cleaned_response = cleaned_response[7:]
# if cleaned_response.startswith('```'):
# cleaned_response = cleaned_response[3:]
# if cleaned_response.endswith('```'):
# cleaned_response = cleaned_response[:-3]
# cleaned_response = cleaned_response.strip()
# json_output = json.loads(cleaned_response)
# total_time = time.time() - start_time
# logger.info(f"βœ… Analysis completed successfully in {total_time:.2f} seconds")
# # Validate the JSON structure
# if not isinstance(json_output, dict):
# raise ValueError("Response is not a dictionary")
# if 'bug_analysis' not in json_output:
# logger.warning("⚠️ Missing 'bug_analysis' key, adding empty list")
# json_output['bug_analysis'] = []
# if 'corrected_code' not in json_output:
# logger.warning("⚠️ Missing 'corrected_code' key, adding original code")
# json_output['corrected_code'] = code
# return json_output
# except json.JSONDecodeError as e:
# logger.error(f"❌ JSON decode error: {e}")
# logger.error(f"πŸ“„ Raw response: {repr(response)}")
# # Return a fallback structure with the raw response
# fallback_response = {
# "bug_analysis": [{
# "line_number": 1,
# "error_message": "Analysis parsing failed",
# "explanation": "The AI model returned a response that couldn't be parsed as JSON",
# "fix_suggestion": "Please try again or check the code format"
# }],
# "corrected_code": code,
# "raw_output": response,
# "parsing_error": str(e)
# }
# return fallback_response
# except Exception as e:
# total_time = time.time() - start_time
# logger.error(f"❌ Analysis failed after {total_time:.2f} seconds: {str(e)}")
# logger.error(f"πŸ’₯ Exception type: {type(e).__name__}")
# # Return error response
# return {
# "bug_analysis": [{
# "line_number": 1,
# "error_message": "Analysis failed",
# "explanation": f"An error occurred during analysis: {str(e)}",
# "fix_suggestion": "Please try again or contact support"
# }],
# "corrected_code": code,
# "error": str(e),
# "error_type": type(e).__name__
# }
import json
import logging
import time

import torch

# Configure logger (guard against attaching duplicate handlers when the module is re-imported)
logger = logging.getLogger("CodeAnalyzer")
logger.setLevel(logging.INFO)
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
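
# The active analyze_code below returns an empty "bug_analysis" list. The helper here
# is a hedged sketch (not part of the original implementation) of one way to populate
# it: diff the original code against the model's corrected code with the standard-library
# difflib and report each changed line. The name build_bug_analysis and its output
# fields are assumptions chosen to match the schema used elsewhere in this file.
import difflib

def build_bug_analysis(original_code, corrected_code):
    """Sketch: derive per-line bug entries by diffing original vs. corrected code."""
    entries = []
    original_lines = original_code.strip().splitlines()
    corrected_lines = corrected_code.strip().splitlines()
    matcher = difflib.SequenceMatcher(None, original_lines, corrected_lines)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            continue
        # Report every original line the model replaced or deleted
        for offset, line in enumerate(original_lines[i1:i2]):
            entries.append({
                "line_number": i1 + offset + 1,  # 1-based position in the original code
                "error_message": f"Line changed by model ({tag})",
                "explanation": f"Original line: {line.strip()!r}",
                "fix_suggestion": "See corrected_code for the model's replacement"
            })
    return entries
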
def analyze_code(tokenizer, model, language, code):
    """
    Analyze and fix buggy code using a CodeT5+ model with the 'fix:' prompt prefix.
    Works across multiple programming languages.
    """
    start_time = time.time()

    # CodeT5+ code-repair checkpoints expect a task prefix rather than a chat template
    prompt = f"fix: {code.strip()}"
    logger.info(f"🔍 Starting analysis for language: {language}")
    logger.info(f"🧾 Prompt: {prompt[:80]}...")

    try:
        # Tokenize the prompt; inputs longer than 512 tokens are truncated
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=1024)

        # Decode the generated sequence, dropping special tokens
        response = tokenizer.decode(output[0], skip_special_tokens=True).strip()
        elapsed = round(time.time() - start_time, 2)
        logger.info(f"✅ Inference completed in {elapsed}s")

        return {
            "bug_analysis": [],  # Optional: the build_bug_analysis sketch above shows one heuristic
            "corrected_code": response
        }
    except Exception as e:
        logger.error(f"❌ Error during analysis: {e}")
        return {
            "bug_analysis": [{
                "line_number": 0,
                "error_message": "Inference failed",
                "explanation": str(e),
                "fix_suggestion": "Try again with a shorter code snippet, or retry later"
            }],
            "corrected_code": code
        }
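
# Minimal usage sketch (not part of the original module). The checkpoint name is a
# placeholder assumption; substitute the fine-tuned CodeT5+ model this project actually uses.
if __name__ == "__main__":
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = "Salesforce/codet5p-220m"  # placeholder checkpoint
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    buggy_code = "def add(a, b):\n    return a - b  # bug: subtracts instead of adding"

    result = analyze_code(tokenizer, model, "Python", buggy_code)
    # Optionally populate the heuristic analysis sketched in build_bug_analysis
    result["bug_analysis"] = build_bug_analysis(buggy_code, result["corrected_code"])
    print(json.dumps(result, indent=2))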