Kareem94 committed on
Commit 0d168b4 · verified · 1 Parent(s): 6d0481d

Update main.py

Files changed (1): main.py +200 -71
main.py CHANGED
@@ -2,6 +2,7 @@ from flask import Flask, request, jsonify
 import torch
 from transformers import RobertaTokenizer, RobertaForSequenceClassification
 import os
+import gc
 from functools import lru_cache
 
 app = Flask(__name__)
@@ -25,7 +26,11 @@ def load_tokenizer():
         return tokenizer
     except Exception as e:
         print(f"Error loading tokenizer: {e}")
-        return RobertaTokenizer.from_pretrained('microsoft/codebert-base')
+        try:
+            return RobertaTokenizer.from_pretrained('microsoft/codebert-base')
+        except Exception as e2:
+            print(f"Fallback tokenizer failed: {e2}")
+            return None
 
 def load_model():
     global device
@@ -60,27 +65,27 @@ def load_model():
 
     except Exception as e:
         print(f"Error loading model: {e}")
-        raise e
+        return None
 
-@lru_cache(maxsize=1000)
-def cached_tokenize(code_hash, max_length):
-    code = code_hash
-    return tokenizer(
-        code,
-        truncation=True,
-        padding='max_length',
-        max_length=max_length,
-        return_tensors='pt'
-    )
+def cleanup_gpu_memory():
+    if device and device.type == 'cuda':
+        torch.cuda.empty_cache()
+    gc.collect()
 
 try:
     print("Loading tokenizer...")
     tokenizer = load_tokenizer()
-    print("Tokenizer loaded successfully!")
+    if tokenizer:
+        print("Tokenizer loaded successfully!")
+    else:
+        print("Failed to load tokenizer!")
 
     print("Loading model...")
     model = load_model()
-    print("Model loaded successfully!")
+    if model:
+        print("Model loaded successfully!")
+    else:
+        print("Failed to load model!")
 
 except Exception as e:
     print(f"Error during initialization: {str(e)}")
@@ -111,75 +116,199 @@ def predict_batch():
         codes = data['codes']
         if not isinstance(codes, list) or len(codes) == 0:
             return jsonify({"error": "'codes' must be a non-empty array"}), 400
-
-        batch_size = min(len(codes), 16)
+
+        if len(codes) > 100:
+            return jsonify({"error": "Too many codes. Maximum 100 allowed."}), 400
+
+        validated_codes = []
+        for i, code in enumerate(codes):
+            if not isinstance(code, str):
+                return jsonify({"error": f"Code at index {i} must be a string"}), 400
+            if len(code.strip()) == 0:
+                validated_codes.append("# empty code")
+            elif len(code) > 50000:
+                return jsonify({"error": f"Code at index {i} too long. Maximum 50000 characters."}), 400
+            else:
+                validated_codes.append(code.strip())
+
+        if len(validated_codes) == 1:
+            score = predict_vulnerability_with_chunking(validated_codes[0])
+            cleanup_gpu_memory()
+            return jsonify({"results": [{"score": score}]})
+
+        batch_size = min(len(validated_codes), 16)
         results = []
 
-        for i in range(0, len(codes), batch_size):
-            batch = codes[i:i+batch_size]
-            scores = predict_vulnerability_batch(batch)
-
-            for j, score in enumerate(scores):
-                results.append({
-                    "score": score
-                })
+        try:
+            for i in range(0, len(validated_codes), batch_size):
+                batch = validated_codes[i:i+batch_size]
+
+                long_codes = []
+                short_codes = []
+                long_indices = []
+                short_indices = []
+
+                for idx, code in enumerate(batch):
+                    try:
+                        tokens = tokenizer.encode(code, add_special_tokens=False, max_length=1000, truncation=True)
+                        if len(tokens) > 450:
+                            long_codes.append(code)
+                            long_indices.append(i + idx)
+                        else:
+                            short_codes.append(code)
+                            short_indices.append(i + idx)
+                    except Exception as e:
+                        print(f"Tokenization error for code {i + idx}: {e}")
+                        short_codes.append(code)
+                        short_indices.append(i + idx)
+
+                batch_scores = [0.0] * len(batch)
+
+                if short_codes:
+                    try:
+                        short_scores = predict_vulnerability_batch(short_codes)
+                        for j, score in enumerate(short_scores):
+                            local_idx = short_indices[j] - i
+                            batch_scores[local_idx] = score
+                    except Exception as e:
+                        print(f"Batch prediction error: {e}")
+                        for j in range(len(short_codes)):
+                            local_idx = short_indices[j] - i
+                            batch_scores[local_idx] = 0.0
+
+                for j, code in enumerate(long_codes):
+                    try:
+                        score = predict_vulnerability_with_chunking(code)
+                        local_idx = long_indices[j] - i
+                        batch_scores[local_idx] = score
+                    except Exception as e:
+                        print(f"Chunking prediction error: {e}")
+                        local_idx = long_indices[j] - i
+                        batch_scores[local_idx] = 0.0
+
+                for score in batch_scores:
+                    results.append({"score": score})
+
+            cleanup_gpu_memory()
+
+        except Exception as e:
+            cleanup_gpu_memory()
+            raise e
 
         return jsonify({"results": results})
 
     except Exception as e:
+        cleanup_gpu_memory()
        return jsonify({"error": f"Batch prediction error: {str(e)}"}), 500
 
-
+def predict_vulnerability_with_chunking(code):
+    try:
+        if not code or len(code.strip()) == 0:
+            return 0.0
+
+        tokens = tokenizer.encode(code, add_special_tokens=False, max_length=2000, truncation=True)
+
+        if len(tokens) <= 450:
+            return predict_vulnerability(code)
+
+        chunk_size = 400
+        overlap = 50
+        max_score = 0.0
+
+        for start in range(0, len(tokens), chunk_size - overlap):
+            end = min(start + chunk_size, len(tokens))
+            chunk_tokens = tokens[start:end]
+
+            try:
+                chunk_code = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+                if chunk_code.strip():
+                    score = predict_vulnerability(chunk_code)
+                    max_score = max(max_score, score)
+            except Exception as e:
+                print(f"Chunk processing error: {e}")
+                continue
+
+            if end >= len(tokens):
+                break
+
+        return max_score
+
+    except Exception as e:
+        print(f"Chunking error: {e}")
+        return 0.0
 
 def predict_vulnerability(code):
-    dynamic_length = min(max(len(code.split()) * 2, 128), 512)
-
-    inputs = tokenizer(
-        code,
-        truncation=True,
-        padding='max_length',
-        max_length=dynamic_length,
-        return_tensors='pt'
-    )
-
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        with torch.cuda.amp.autocast() if device.type == 'cuda' else torch.no_grad():
-            outputs = model(**inputs)
-
-    if hasattr(outputs, 'logits'):
-        score = torch.sigmoid(outputs.logits).cpu().item()
-    else:
-        score = torch.sigmoid(outputs[0]).cpu().item()
-
-    return round(score, 4)
+    try:
+        if not code or len(code.strip()) == 0:
+            return 0.0
+
+        dynamic_length = min(max(len(code.split()) * 2, 128), 512)
+
+        inputs = tokenizer(
+            code,
+            truncation=True,
+            padding='max_length',
+            max_length=dynamic_length,
+            return_tensors='pt'
+        )
+
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            if device.type == 'cuda':
+                with torch.cuda.amp.autocast():
+                    outputs = model(**inputs)
+            else:
+                outputs = model(**inputs)
+
+        if hasattr(outputs, 'logits'):
+            score = torch.sigmoid(outputs.logits).cpu().item()
+        else:
+            score = torch.sigmoid(outputs[0]).cpu().item()
+
+        return round(max(0.0, min(1.0, score)), 4)
+
+    except Exception as e:
+        print(f"Single prediction error: {e}")
+        return 0.0
 
 def predict_vulnerability_batch(codes):
-    max_len = max([len(code.split()) * 2 for code in codes])
-    dynamic_length = min(max(max_len, 128), 512)
-
-    inputs = tokenizer(
-        codes,
-        truncation=True,
-        padding='max_length',
-        max_length=dynamic_length,
-        return_tensors='pt'
-    )
-
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        with torch.cuda.amp.autocast() if device.type == 'cuda' else torch.no_grad():
-            outputs = model(**inputs)
-
-    if hasattr(outputs, 'logits'):
-        scores = torch.sigmoid(outputs.logits).cpu().numpy()
-    else:
-        scores = torch.sigmoid(outputs[0]).cpu().numpy()
-
-    return [round(float(score), 4) for score in scores.flatten()]
-
+    try:
+        if not codes or len(codes) == 0:
+            return []
+
+        filtered_codes = [code if code and code.strip() else "# empty" for code in codes]
+
+        max_len = max([len(code.split()) * 2 for code in filtered_codes if code])
+        dynamic_length = min(max(max_len, 128), 512)
+
+        inputs = tokenizer(
+            filtered_codes,
+            truncation=True,
+            padding='max_length',
+            max_length=dynamic_length,
+            return_tensors='pt'
+        )
+
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            if device.type == 'cuda':
+                with torch.cuda.amp.autocast():
+                    outputs = model(**inputs)
+            else:
+                outputs = model(**inputs)
+
+        if hasattr(outputs, 'logits'):
+            scores = torch.sigmoid(outputs.logits).cpu().numpy()
+        else:
+            scores = torch.sigmoid(outputs[0]).cpu().numpy()
+
+        return [round(max(0.0, min(1.0, float(score))), 4) for score in scores.flatten()]
+
+    except Exception as e:
+        print(f"Batch prediction error: {e}")
+        return [0.0] * len(codes)
 
 @app.route("/health", methods=['GET'])
 def health_check():
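
Reviewer note: to exercise the new validation and chunking paths end to end, a minimal client sketch follows. The route path /predict_batch, host, and port are assumptions; the diff only shows the handler body (def predict_batch) and the /health route.

# Hypothetical smoke test for the batch endpoint touched by this commit.
# URL is an assumption; adjust to wherever the app is actually served.
import requests

payload = {"codes": [
    "int main() { char buf[8]; gets(buf); return 0; }",  # classic overflow
    "def add(a, b):\n    return a + b",                   # benign snippet
]}

resp = requests.post("http://localhost:5000/predict_batch", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())  # expected shape: {"results": [{"score": 0.xxxx}, ...]}

Per the new validation block, more than 100 items or any item over 50000 characters should come back as a 400 rather than reaching the model.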
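The long-input path slides a 400-token window with a 50-token overlap (an effective stride of 350) and keeps the maximum chunk score. A standalone sketch of just the window arithmetic, useful for sanity-checking boundary behaviour, with the token count made up for illustration:

# Illustrative only: reproduces the window schedule from
# predict_vulnerability_with_chunking() without tokenizer or model.
chunk_size, overlap = 400, 50
n_tokens = 1000  # pretend the input encodes to 1000 tokens

windows = []
for start in range(0, n_tokens, chunk_size - overlap):
    end = min(start + chunk_size, n_tokens)
    windows.append((start, end))
    if end >= n_tokens:  # same early exit as the real loop
        break

print(windows)  # [(0, 400), (350, 750), (700, 1000)]

Because the handler takes the max over chunk scores, a vulnerable pattern that straddles a window boundary is still seen whole as long as it fits inside the 50-token overlap.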
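One forward-looking nit: torch.cuda.amp.autocast() is deprecated in recent PyTorch releases in favour of the device-agnostic torch.amp.autocast. A sketch of an equivalent call, assuming the same device, model, and inputs names as in the diff:

# Equivalent to the cuda-only branch in the diff, using the
# device-agnostic API; enabled=False makes it a no-op on CPU.
import torch

with torch.no_grad():
    with torch.amp.autocast(device_type='cuda', enabled=(device.type == 'cuda')):
        outputs = model(**inputs)

This collapses the if/else duplication around the forward pass without changing behaviour on either device.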