# AI-Powered Code Review Assistant
# Author: Spencer Purdy
# Description: An intelligent code review tool that analyzes Python and JavaScript code
# for best practices, security vulnerabilities, and performance improvements using
# CodeT5 and advanced pattern analysis.

# Import required libraries
import subprocess
import sys
import re
import ast
import json
from typing import List, Dict, Tuple


# Install required packages if not already installed
def install_packages():
    """Install the third-party packages the application needs via pip.

    Raises:
        subprocess.CalledProcessError: if any ``pip install`` invocation fails.
    """
    packages = ['gradio', 'transformers', 'torch', 'sentencepiece']
    for package in packages:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '-q'])


# Try importing, install if needed
try:
    import gradio as gr
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
    import torch
except ImportError:
    print("Installing required packages...")
    install_packages()
    import gradio as gr
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
    import torch

# Initialize the CodeT5 model for AI-powered code analysis
print("Loading CodeT5 model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base').to(device)
model.eval()
print(f"Model loaded successfully on {device}!")


class CodeAnalyzer:
    """
    Main class for analyzing code quality, security, and performance.
    Uses pattern matching and AI to identify issues in Python and JavaScript code.
    """

    # Security issue names that are reported with 'high' severity.
    _HIGH_SEVERITY = frozenset({'sql_injection', 'command_injection', 'eval_usage'})

    # Exceptions ast.parse() can raise for user-supplied text that is not valid Python.
    _PARSE_ERRORS = (SyntaxError, ValueError, RecursionError)

    def __init__(self):
        """Initialize analyzer with predefined patterns for security, performance, and best practices"""
        # Security vulnerability patterns for each language
        self.security_patterns = {
            'python': {
                # %[sd] (not %[s|d]): a '|' inside a character class is a literal pipe.
                'sql_injection': r'(execute|executemany)\s*\(\s*["\'].*%[sd].*["\'].*%',
                'command_injection': r'(os\.system|subprocess\.call|subprocess\.run)\s*\([^)]*\+[^)]*\)',
                'eval_usage': r'\beval\s*\(',
                'pickle_usage': r'pickle\.(load|loads)\s*\(',
                'hardcoded_secrets': r'(password|api_key|secret|token)\s*=\s*["\'][^"\']+["\']',
                'weak_random': r'random\.(random|randint|choice)\s*\(',
            },
            'javascript': {
                'eval_usage': r'\beval\s*\(',
                'innerHTML_xss': r'\.innerHTML\s*=',
                'sql_injection': r'query\s*\(\s*["\'].*\+.*["\']',
                'hardcoded_secrets': r'(password|apiKey|secret|token)\s*=\s*["\'][^"\']+["\']',
                # The lookarounds keep '===' and '!==' (already strict) from being flagged.
                'weak_comparison': r'(?<![=!])==(?!=)\s*(null|undefined)',
                'unsafe_regex': r'new\s+RegExp\s*\([^)]*\+[^)]*\)',
            }
        }

        # Performance issue patterns
        self.performance_patterns = {
            'python': {
                'nested_loops': r'for\s+.*:\s*\n\s*for\s+.*:',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'list_comprehension_opportunity': r'for\s+.*:\s*\n\s*.*\.append\(',
                'inefficient_contains': r'if\s+.*\s+in\s+.*list\(',
            },
            'javascript': {
                'nested_loops': r'for\s*\([^)]*\)\s*{\s*for\s*\(',
                'dom_in_loop': r'for\s*\([^)]*\)\s*{[^}]*document\.(getElementById|querySelector)',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'inefficient_array_method': r'\.(forEach|map|filter)\s*\([^)]*\)\s*\.(forEach|map|filter)',
            }
        }

        # Best practice checkers (mix of regex patterns and callable functions)
        self.best_practices = {
            'python': {
                'missing_docstring': self._check_missing_docstring,
                'long_functions': self._check_long_functions,
                'naming_convention': self._check_python_naming,
                'unused_variables': self._check_unused_variables,
            },
            'javascript': {
                'var_usage': r'\bvar\s+',
                'missing_semicolon': r'[^;]\s*\n\s*(let|const|return|if|for|while)',
                'console_log': r'console\.(log|error|warn)\(',
                'naming_convention': self._check_js_naming,
            }
        }

    @staticmethod
    def _line_at(code: str, offset: int) -> int:
        """Return the 1-based line number of character position ``offset`` in ``code``."""
        return code.count('\n', 0, offset) + 1

    @classmethod
    def _parse_python(cls, code: str):
        """Parse ``code`` as Python; return the AST, or None if it cannot be parsed."""
        try:
            return ast.parse(code)
        except cls._PARSE_ERRORS:
            # AST-based checks are best-effort: unparsable input just skips them.
            return None

    def analyze_code(self, code: str, language: str) -> Dict[str, List[Dict]]:
        """
        Main analysis function that runs all checks on the provided code

        Args:
            code: Source code string to analyze
            language: Programming language ('python' or 'javascript')

        Returns:
            Dictionary with categorized issues: security, performance,
            best_practices, ai_suggestions. Each issue is a dict with the keys
            'type', 'severity', 'message' and 'line'.
        """
        results = {
            'security': [],
            'performance': [],
            'best_practices': [],
            'ai_suggestions': []
        }

        # Run security analysis using regex patterns.
        # The reported line is taken from the very match that triggered the
        # issue, so detection and location can never disagree.
        for issue_name, pattern in self.security_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            match = re.search(pattern, code, re.IGNORECASE)
            if match:
                results['security'].append({
                    'type': issue_name.replace('_', ' ').title(),
                    'severity': 'high' if issue_name in self._HIGH_SEVERITY else 'medium',
                    'message': self._get_security_message(issue_name),
                    'line': self._line_at(code, match.start())
                })

        # Run performance analysis
        for issue_name, pattern in self.performance_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            match = re.search(pattern, code, re.MULTILINE)
            if match:
                results['performance'].append({
                    'type': issue_name.replace('_', ' ').title(),
                    'severity': 'medium',
                    'message': self._get_performance_message(issue_name),
                    'line': self._line_at(code, match.start())
                })

        # Run best practices analysis (can be regex or function-based)
        for issue_name, checker in self.best_practices.get(language, {}).items():
            if callable(checker):
                results['best_practices'].extend(checker(code))
            elif isinstance(checker, str):
                match = re.search(checker, code)
                if match:
                    results['best_practices'].append({
                        'type': issue_name.replace('_', ' ').title(),
                        'severity': 'low',
                        'message': self._get_best_practice_message(issue_name),
                        'line': self._line_at(code, match.start())
                    })

        # Get AI-powered suggestions using CodeT5
        ai_suggestions = self._get_ai_suggestions(code, language)
        if ai_suggestions:
            results['ai_suggestions'] = ai_suggestions

        return results

    def _check_missing_docstring(self, code: str) -> List[Dict]:
        """
        Check Python code for functions and classes missing docstrings
        Uses AST parsing to analyze code structure; returns an empty list when
        the code cannot be parsed.
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        documentable = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
        return [
            {
                'type': 'Missing Docstring',
                'severity': 'low',
                'message': f'Function/Class "{node.name}" should have a docstring explaining its purpose',
                'line': node.lineno
            }
            for node in ast.walk(tree)
            if isinstance(node, documentable) and not ast.get_docstring(node)
        ]

    def _check_long_functions(self, code: str) -> List[Dict]:
        """
        Identify functions that exceed recommended length (20 lines)
        Long functions are harder to understand and maintain
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        issues = []
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            # end_lineno is inclusive, hence the +1; fall back to the start
            # line if the running interpreter does not provide it.
            last_line = getattr(node, 'end_lineno', None) or node.lineno
            func_lines = last_line - node.lineno + 1
            if func_lines > 20:
                issues.append({
                    'type': 'Long Function',
                    'severity': 'medium',
                    'message': f'Function "{node.name}" is {func_lines} lines long. Consider breaking it into smaller functions.',
                    'line': node.lineno
                })
        return issues

    def _check_python_naming(self, code: str) -> List[Dict]:
        """Check for Python naming convention violations (should use snake_case)"""
        # (?!=) stops '==' comparisons from being mistaken for assignments.
        camel_case_pattern = r'\b[a-z]+[A-Z]\w*\s*=(?!=)'
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use snake_case for variable names in Python (e.g., my_variable instead of myVariable)',
                'line': self._line_at(code, match.start())
            }
            for match in re.finditer(camel_case_pattern, code)
        ]

    def _check_js_naming(self, code: str) -> List[Dict]:
        """Check for JavaScript naming convention violations (should use camelCase)"""
        # One or more '_part' groups, so my_var and my_long_var are both caught.
        snake_case_pattern = r'\b(let|const|var)\s+[a-z]+(?:_[a-z0-9]+)+\s*='
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use camelCase for variable names in JavaScript (e.g., myVariable instead of my_variable)',
                'line': self._line_at(code, match.start())
            }
            for match in re.finditer(snake_case_pattern, code)
        ]

    def _check_unused_variables(self, code: str) -> List[Dict]:
        """
        Detect variables that are assigned but never used in Python code
        Unused variables can indicate dead code or incomplete refactoring

        Only plain ``name = value`` assignments are tracked. Each issue carries
        the line of the variable's first assignment, and issues are returned in
        source order so the report is deterministic.
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        first_assignment = {}  # variable name -> line of its earliest assignment
        used_vars = set()

        # Walk AST to find assignments and variable usage
        for node in ast.walk(tree):
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name):
                        known = first_assignment.get(target.id)
                        if known is None or node.lineno < known:
                            first_assignment[target.id] = node.lineno
            elif isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
                used_vars.add(node.id)

        # '_' is the conventional throwaway name and is never reported.
        unused = [
            (line, name)
            for name, line in first_assignment.items()
            if name not in used_vars and name != '_'
        ]
        return [
            {
                'type': 'Unused Variable',
                'severity': 'low',
                'message': f'Variable "{name}" is defined but never used',
                'line': line
            }
            for line, name in sorted(unused)
        ]

    def _find_line_number(self, code: str, pattern: str) -> int:
        """Find the 1-based line where a regex pattern first matches (case-insensitive); 0 if no match"""
        match = re.search(pattern, code, re.MULTILINE | re.IGNORECASE)
        if match:
            return self._line_at(code, match.start())
        return 0

    def _get_security_message(self, issue_type: str) -> str:
        """Return detailed explanation for security issues"""
        messages = {
            'sql_injection': 'Potential SQL injection vulnerability. Use parameterized queries or prepared statements instead of string concatenation.',
            'command_injection': 'Potential command injection. Never use user input directly in system commands. Sanitize and validate all inputs.',
            'eval_usage': 'Using eval() is dangerous and can lead to code injection. Consider using ast.literal_eval() or alternative approaches.',
            'pickle_usage': 'Pickle can execute arbitrary code during deserialization. Only unpickle data from trusted sources.',
            'hardcoded_secrets': 'Hardcoded credentials detected. Use environment variables or secure configuration management.',
            'weak_random': 'Using non-cryptographic randomness for security. Use secrets module for tokens, passwords, or security-sensitive operations.',
            'innerHTML_xss': 'Setting innerHTML with user data can lead to XSS attacks. Use textContent or properly sanitize input.',
            'weak_comparison': 'Use === instead of == to avoid JavaScript type coercion issues.',
            'unsafe_regex': 'Dynamic regex creation can lead to ReDoS attacks. Validate and escape user input carefully.',
        }
        return messages.get(issue_type, 'Security issue detected.')

    def _get_performance_message(self, issue_type: str) -> str:
        """Return detailed explanation for performance issues"""
        messages = {
            'nested_loops': 'Nested loops can have O(n²) complexity. Consider using more efficient algorithms or data structures like sets or dictionaries.',
            'string_concatenation': 'String concatenation in loops is inefficient. Use join() in Python or template literals in JavaScript.',
            'list_comprehension_opportunity': 'This loop pattern could be replaced with a more efficient and readable list comprehension.',
            'inefficient_contains': 'Checking membership in a list is O(n). Consider using a set for O(1) lookups if checking multiple times.',
            'dom_in_loop': 'DOM manipulation inside loops causes reflows. Batch operations or use DocumentFragment for better performance.',
            'inefficient_array_method': 'Chaining array methods creates intermediate arrays. Consider combining operations or using a single reduce().',
        }
        return messages.get(issue_type, 'Performance issue detected.')

    def _get_best_practice_message(self, issue_type: str) -> str:
        """Return detailed explanation for best practice violations"""
        messages = {
            'var_usage': 'Use let or const instead of var for block scoping and to prevent hoisting issues.',
            'missing_semicolon': 'Missing semicolon. While JavaScript has ASI, explicit semicolons prevent potential errors.',
            'console_log': 'Remove console statements before production deployment or use a proper logging library.',
        }
        return messages.get(issue_type, 'Best practice violation detected.')

    def _summarize_with_model(self, code: str, language: str) -> str:
        """
        Ask CodeT5 for a short summary of the first 200 characters of ``code``.

        Returns the decoded summary, or an empty string if the model call
        fails, so a model problem never blocks the rest of the analysis.
        """
        try:
            prompt = f"summarize {language}: {code[:200]}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
            with torch.no_grad():
                # Plain beam search. The former temperature=0.7 argument was
                # dropped: temperature only applies when do_sample=True.
                outputs = model.generate(
                    inputs.input_ids,
                    max_length=60,
                    num_beams=4,
                    early_stopping=True
                )
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            # Don't fail the entire analysis if AI suggestions fail
            print(f"AI suggestion generation note: {e}")
            return ''

    def _get_ai_suggestions(self, code: str, language: str) -> List[Dict]:
        """
        Generate high-level code improvement suggestions
        Provides insights beyond the regex/AST checks in analyze_code

        NOTE(review): the CodeT5 summary is still generated (as before) but is
        not surfaced in the returned suggestions; every suggestion below comes
        from a heuristic. Decide whether to show the summary or drop the call.
        """
        suggestions = []

        def suggest(kind: str, message: str) -> None:
            suggestions.append({'type': kind, 'severity': 'info', 'message': message, 'line': 0})

        lowered = code.lower()

        if language == 'python':
            # Structure-based suggestions need a parsable module with functions.
            tree = self._parse_python(code)
            functions = [] if tree is None else [
                node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)
            ]
            if functions:
                self._summarize_with_model(code, 'python')

                if 'database' in lowered or 'query' in lowered:
                    suggest(
                        'Architecture Suggestion',
                        'Consider implementing a data access layer or using an ORM like SQLAlchemy to abstract database operations and prevent SQL injection.'
                    )
                if len(functions) > 3:
                    suggest(
                        'Code Organization',
                        f'This code contains {len(functions)} functions. Consider organizing related functions into classes for better code organization.'
                    )

            # Language-specific suggestions
            if 'for i in range(len(' in code:
                suggest(
                    'Pythonic Code',
                    'Use enumerate() for index-value iteration: for i, item in enumerate(list) instead of range(len())'
                )
            if re.search(r'except\s*:', code):
                suggest(
                    'Error Handling',
                    'Avoid bare except clauses. Specify exception types for better error handling.'
                )

        elif language == 'javascript':
            # JavaScript-specific suggestions
            if 'callback' in lowered and 'function' in code:
                suggest(
                    'Modern JavaScript',
                    'Consider using Promises or async/await instead of callbacks for better readability and error handling.'
                )
            if code.count('getElementById') > 2:
                suggest(
                    'Performance Tip',
                    'Cache DOM references when accessing the same element multiple times to improve performance.'
                )

            # Use CodeT5 for JavaScript analysis
            self._summarize_with_model(code, 'javascript')

            if 'array' in lowered or 'foreach' in lowered:
                suggest(
                    'Functional Programming',
                    'Consider using functional array methods (map, filter, reduce) for cleaner and more expressive code.'
                )

        # General suggestions for both languages
        # default=0 keeps max() from raising when every line is blank.
        longest = max((len(line) for line in code.split('\n') if line.strip()), default=0)
        if longest > 100:
            suggest(
                'Code Readability',
                'Some lines exceed 100 characters. Consider breaking long lines for better readability.'
            )

        # Check for code complexity. Whole-word matching so that 'elif' is
        # counted once and identifiers such as 'notify' are not counted at all.
        if len(re.findall(r'\b(?:if|elif|else)\b', code)) > 5:
            suggest(
                'Complexity Warning',
                'High conditional complexity detected. Consider refactoring using early returns or extracting complex logic into separate functions.'
            )

        return suggestions


def format_results(results: Dict[str, List[Dict]]) -> str:
    """
    Format analysis results into a readable markdown report
    Uses emojis and formatting for clear visual hierarchy

    Args:
        results: Mapping of category name ('security', 'performance',
            'best_practices', 'ai_suggestions') to a list of issue dicts.
            Missing categories are treated as empty.

    Returns:
        Markdown text, or a short success message when no category has issues.
    """
    if not any(results.values()):
        return "✅ **Excellent!** Your code looks great - no significant issues found."

    security = results.get('security', [])
    performance = results.get('performance', [])
    best_practices = results.get('best_practices', [])
    ai_suggestions = results.get('ai_suggestions', [])

    output = []

    # Security section (highest priority)
    if security:
        output.append("## 🔒 Security Issues")
        output.append("*These require immediate attention:*\n")
        for issue in security:
            severity_emoji = "🔴" if issue['severity'] == 'high' else "🟡"
            output.append(f"{severity_emoji} **{issue['type']}** (Line {issue['line']})")
            output.append(f" → {issue['message']}\n")

    # Performance section
    if performance:
        output.append("## ⚡ Performance Issues")
        output.append("*Optimize these for better efficiency:*\n")
        for issue in performance:
            output.append(f"🟡 **{issue['type']}** (Line {issue['line']})")
            output.append(f" → {issue['message']}\n")

    # Best practices section
    if best_practices:
        output.append("## 📝 Best Practices")
        output.append("*Follow these for cleaner, more maintainable code:*\n")
        for issue in best_practices:
            output.append(f"🔵 **{issue['type']}** (Line {issue['line']})")
            output.append(f" → {issue['message']}\n")

    # AI suggestions section
    if ai_suggestions:
        output.append("## 🤖 AI-Powered Insights")
        output.append("*Advanced suggestions from CodeT5 analysis:*\n")
        for suggestion in ai_suggestions:
            output.append(f"💡 **{suggestion['type']}**: {suggestion['message']}\n")

    # Summary statistics
    total_issues = sum(len(v) for v in results.values())
    high_severity = sum(1 for issue in security if issue['severity'] == 'high')

    output.append("---")
    output.append(f"**📊 Summary**: {total_issues} total suggestions found")
    if high_severity > 0:
        output.append(f"**⚠️ Critical**: {high_severity} high-severity security issues need immediate attention!")

    return "\n".join(output)


def analyze_code_handler(code: str, language: str) -> str:
    """
    Main handler function called by Gradio interface
    Coordinates the analysis and formats results for display

    Args:
        code: Source text from the editor (None or blank yields a prompt to enter code)
        language: Language label from the UI, matched case-insensitively

    Returns:
        Markdown report, or a user-facing message for invalid input or errors.
    """
    if not code or not code.strip():
        return "❗ Please enter some code to analyze."

    # Create analyzer instance
    analyzer = CodeAnalyzer()
    language_key = (language or '').lower()

    # Validate language selection
    if language_key not in ['python', 'javascript']:
        return "⚠️ Currently supporting Python and JavaScript. More languages coming soon!"

    try:
        # Run analysis
        results = analyzer.analyze_code(code, language_key)
        # Format and return results
        return format_results(results)
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the app.
        return f"❌ An error occurred during analysis: {str(e)}\n\nPlease check your code syntax and try again."


def create_interface():
    """
    Create and configure the Gradio web interface
    Provides an intuitive UI for code analysis with examples

    Returns:
        The configured ``gr.Blocks`` application (not yet launched).
    """
    # Example code snippets demonstrating various issues
    python_example = '''def process_user_data(user_id):
    # Fetch user data from database
    query = "SELECT * FROM users WHERE id = " + user_id
    result = db.execute(query)

    password = "admin123"
    data = []

    for row in result:
        data.append(row)

    # Process each item
    for i in range(len(data)):
        if data[i]['status'] == True:
            print(data[i])

    return data'''

    javascript_example = '''function fetchUserData(userId) {
    var apiKey = "sk-1234567890abcdef";

    // Get user element
    for (var i = 0; i < users.length; i++) {
        document.getElementById('user-' + i).innerHTML = users[i].name;
    }

    // Check user status
    if (userStatus == null) {
        console.log("User not found");
    }

    var query = "SELECT * FROM users WHERE id = " + userId;
    return db.query(query)
}'''

    # Build Gradio interface with custom theme
    with gr.Blocks(title="AI Code Review Assistant", theme=gr.themes.Soft()) as interface:
        # Header section
        gr.Markdown("""
        # 🤖 AI-Powered Code Review Assistant

        **Instantly analyze your code for security vulnerabilities, performance issues, and best practices!**

        This tool uses advanced pattern matching and the CodeT5 AI model to provide comprehensive code analysis for Python and JavaScript.

        ### ✨ Features
        - 🔒 **Security Analysis**: Detect SQL injection, XSS, hardcoded secrets, and more
        - ⚡ **Performance Optimization**: Identify inefficient patterns and algorithms
        - 📝 **Best Practices**: Ensure clean, maintainable code following language conventions
        - 🤖 **AI Insights**: Get intelligent suggestions powered by CodeT5 transformer model
        """)

        # Main content area with two columns
        with gr.Row():
            # Left column - Input
            with gr.Column():
                code_input = gr.Code(
                    label="📝 Enter your code here",
                    language="python",
                    lines=15,
                    value=python_example
                )
                language_select = gr.Radio(
                    choices=["Python", "JavaScript"],
                    value="Python",
                    label="🔤 Select Language"
                )
                analyze_btn = gr.Button("🔍 Analyze Code", variant="primary", size="lg")

                # Example section
                gr.Examples(
                    examples=[
                        [python_example, "Python"],
                        [javascript_example, "JavaScript"]
                    ],
                    inputs=[code_input, language_select],
                    label="📚 Try These Examples"
                )

            # Right column - Output
            with gr.Column():
                output = gr.Markdown(
                    label="📊 Analysis Results",
                    value="*Your analysis results will appear here...*"
                )

        # Footer with instructions and attribution
        gr.Markdown("""
        ---
        ### 🎯 How to Use
        1. **Paste** your Python or JavaScript code in the editor
        2. **Select** the appropriate programming language
        3. **Click** "Analyze Code" to run the analysis
        4. **Review** the categorized feedback and improve your code!

        ### 💡 Tips
        - The tool works best with complete functions or code blocks
        - Line numbers help you quickly locate issues in your code
        - Security issues (🔴) should be fixed immediately
        - Use the AI insights for high-level code improvements

        ---
        👨‍💻 **Created by Spencer Purdy** | Computer Science @ Auburn University

        [GitHub](https://github.com/spencercpurdy) | [LinkedIn](https://linkedin.com/in/spencerpurdy) | [Hugging Face](https://huggingface.co/spencercpurdy)
        """)

        # Connect the analyze button to the handler function
        analyze_btn.click(
            fn=analyze_code_handler,
            inputs=[code_input, language_select],
            outputs=output
        )

    return interface


# Main execution block
if __name__ == "__main__":
    # Create and launch the Gradio interface
    interface = create_interface()
    # Launch with sharing enabled for easy access
    # NOTE(review): share=True requests a public link and debug=True blocks
    # with verbose errors; confirm both are intended outside local demos.
    interface.launch(debug=True, share=True)