bearking58 committed
Commit b4f3263 · 1 Parent(s): ae51d62

feat: finalize pipeline

device_manager.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+
+
+ class DeviceManager:
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(DeviceManager, cls).__new__(cls)
+             cls._instance.device = torch.device(
+                 "cuda" if torch.cuda.is_available() else "cpu")
+         return cls._instance.device
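DeviceManager centralizes device selection for every model in the pipeline. Note the unusual design: `__new__` returns the cached `torch.device` itself rather than the manager instance (so `__init__` never runs on the returned object). A minimal sketch of how the other files in this commit consume it:

```python
import torch
from device_manager import DeviceManager

device = DeviceManager()          # evaluates to torch.device("cuda") or ("cpu")
assert DeviceManager() == device  # later calls reuse the cached device

x = torch.zeros(2, 2).to(device)  # usable anywhere a torch.device is expected
```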
gemma2b.py → gemma2b_dependencies.py RENAMED
@@ -3,19 +3,25 @@ import torch
  from torch.nn.functional import cosine_similarity
  from collections import Counter
  import numpy as np
+ from device_manager import DeviceManager


  class Gemma2BDependencies:
-     def __init__(self, question: str, answer: str):
-         self.question = question
-         self.answer = answer
-         self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
-         self.model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
-         self.device = torch.device("cuda")
-         self.model.to(self.device)
-
-     def calculate_perplexity(self):
-         inputs = self.tokenizer(self.answer, return_tensors="pt",
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(Gemma2BDependencies, cls).__new__(cls)
+             cls._instance.tokenizer = AutoTokenizer.from_pretrained(
+                 "google/gemma-2b")
+             cls._instance.model = AutoModelForCausalLM.from_pretrained(
+                 "google/gemma-2b")
+             cls._instance.device = DeviceManager()
+             cls._instance.model.to(cls._instance.device)
+         return cls._instance
+
+     def calculate_perplexity(self, text: str):
+         inputs = self.tokenizer(text, return_tensors="pt",
                                  truncation=True, max_length=1024)
          inputs = {k: v.to(self.device) for k, v in inputs.items()}

@@ -27,9 +33,9 @@ class Gemma2BDependencies:

          return perplexity.item()

-     def calculate_burstiness(self):
+     def calculate_burstiness(self, text: str):
          # Tokenize the text using GPT-2 tokenizer
-         tokens = self.tokenizer.tokenize(self.answer)
+         tokens = self.tokenizer.tokenize(text)

          # Count token frequencies
          frequency_counts = list(Counter(tokens).values())
@@ -42,8 +48,8 @@ class Gemma2BDependencies:
          vmr = variance / mean if mean > 0 else 0
          return vmr

-     def get_embedding(self):
-         inputs = self.tokenizer(self.text, return_tensors="pt",
+     def get_embedding(self, text: str):
+         inputs = self.tokenizer(text, return_tensors="pt",
                                  truncation=True, max_length=1024)
          inputs = {k: v.to(self.device) for k, v in inputs.items()}

@@ -55,8 +61,8 @@ class Gemma2BDependencies:
          embedding = torch.mean(last_hidden_states, dim=1)
          return embedding

-     def calculate_cosine_similarity(self):
-         embedding1 = self.get_embedding(self.question)
-         embedding2 = self.get_embedding(self.answer)
+     def calculate_cosine_similarity(self, question: str, answer: str):
+         embedding1 = self.get_embedding(question)
+         embedding2 = self.get_embedding(answer)
          # Ensure the embeddings are in the correct shape for cosine_similarity
          return cosine_similarity(embedding1, embedding2).item()
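With the rename, Gemma2BDependencies becomes a process-wide singleton: the Gemma-2B weights load once on first construction, and every metric now takes its text as an argument instead of reading constructor state. A hypothetical usage sketch (the question/answer strings are illustrative only):

```python
from gemma2b_dependencies import Gemma2BDependencies

deps = Gemma2BDependencies()  # first call loads google/gemma-2b once

question = "What is overfitting?"
answer = "Overfitting happens when a model memorizes its training data."

ppl = deps.calculate_perplexity(answer)   # float; lower = more predictable text
vmr = deps.calculate_burstiness(answer)   # variance-to-mean ratio of token counts
sim = deps.calculate_cosine_similarity(question, answer)  # scalar in [-1, 1]
```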
hypothesis.py CHANGED
@@ -5,23 +5,19 @@ import pandas as pd
  import numpy as np
  from collections import defaultdict, Counter
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
- from gemma2b import Gemma2BDependencies
+ from gemma2b_dependencies import Gemma2BDependencies


  class BaseModelHypothesis:
-     def __init__(self, question: str, answer: str):
+     def __init__(self):
          nltk.download('punkt')
          nltk.download('averaged_perceptron_tagger')

-         self.question = question
-         self.answer = answer
-
          self.analyzer = SentimentIntensityAnalyzer()
          self.lexicon_df = pd.read_csv(
              "https://storage.googleapis.com/ta-ai-detector/datasets/NRC-Emotion-Lexicon.csv")
          self.emotion_lexicon = self.process_emotion_lexicon()
-         self.gemma2bdependencies = Gemma2BDependencies(
-             self.question, self.answer)
+         self.gemma2bdependencies = Gemma2BDependencies()

          self.features_normalized_text_length = []
          self.features_not_normalized = []
@@ -39,30 +35,30 @@ class BaseModelHypothesis:
              emotion_lexicon[row["word"]].append(row["emotion"])
          return emotion_lexicon

-     def calculate_normalized_text_length_features(self):
+     def calculate_normalized_text_length_features(self, text: str) -> np.ndarray:
          self.features_normalized_text_length = self.extract_pos_features(
-             self.answer)
+             text)
          self.features_normalized_text_length = self.features_normalized_text_length + \
-             self.calculate_emotion_proportions(self.answer)
+             self.calculate_emotion_proportions(text)
          self.features_normalized_text_length.append(
-             self.measure_unique_word_ratio(self.answer))
+             self.measure_unique_word_ratio(text))

          return self.scaler_normalized_text_length.transform(np.array(self.features_normalized_text_length).astype(np.float32).reshape(1, -1))

-     def calculate_not_normalized_features(self):
+     def calculate_not_normalized_features(self, text: str) -> np.ndarray:
          self.features_not_normalized.append(
-             self.measure_sentiment_intensity(self.answer))
+             self.measure_sentiment_intensity(text))
          self.features_not_normalized = self.features_not_normalized + \
-             self.measure_readability(self.answer)
+             self.measure_readability(text)
          self.features_not_normalized.append(
-             self.gemma2bdependencies.calculate_perplexity(self.answer))
+             self.gemma2bdependencies.calculate_perplexity(text))
          self.features_not_normalized.append(
-             self.gemma2bdependencies.calculate_burstiness(self.answer))
+             self.gemma2bdependencies.calculate_burstiness(text))

          return self.scaler_not_normalized.transform(np.array(self.features_not_normalized).astype(np.float32).reshape(1, -1))

-     def extract_pos_features(self):
-         words = nltk.word_tokenize(self.answer)
+     def extract_pos_features(self, text: str):
+         words = nltk.word_tokenize(text)
          pos_tags = nltk.pos_tag(words)
          desired_tags = ["JJ", "VB", "RB", "PRP", "DT", "IN", "NN", "NNS"]
          pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})
@@ -76,19 +72,19 @@ class BaseModelHypothesis:

          return pos_ratios

-     def measure_sentiment_intensity(self):
-         sentiment = self.analyzer.polarity_scores(self.answer)
+     def measure_sentiment_intensity(self, text: str):
+         sentiment = self.analyzer.polarity_scores(text)
          return sentiment["compound"]

-     def measure_readability(self):
-         gunning_fog = textstat.gunning_fog(self.answer)
-         smog_index = textstat.smog_index(self.answer)
-         dale_chall_score = textstat.dale_chall_readability_score(self.answer)
+     def measure_readability(self, text: str):
+         gunning_fog = textstat.gunning_fog(text)
+         smog_index = textstat.smog_index(text)
+         dale_chall_score = textstat.dale_chall_readability_score(text)

          return [gunning_fog, smog_index, dale_chall_score]

-     def calculate_emotion_proportions(self):
-         tokens = nltk.word_tokenize(self.answer)
+     def calculate_emotion_proportions(self, text: str):
+         tokens = nltk.word_tokenize(text)

          total_tokens = len(tokens)

@@ -108,8 +104,8 @@ class BaseModelHypothesis:
              proportions["sadness"], proportions["disgust"], proportions["anticipation"], proportions["joy"], proportions["surprise"]
          ]

-     def measure_unique_word_ratio(self):
-         tokens = nltk.word_tokenize(self.answer)
+     def measure_unique_word_ratio(self, text: str):
+         tokens = nltk.word_tokenize(text)
          total_words = len(tokens)

          unique_words = len(Counter(tokens).keys())
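BaseModelHypothesis now takes the text to score per call, and each public method returns a scaled `(1, n_features)` array. One caveat: the feature lists are instance attributes that are appended to on each call, so the safe pattern (assumed below) is a fresh instance, or one call of each method, per text. A sketch, assuming the scalers are set up in the part of `__init__` outside these hunks:

```python
import numpy as np
from hypothesis import BaseModelHypothesis

hypothesis = BaseModelHypothesis()
answer = "Sample answer text to score."

normalized = hypothesis.calculate_normalized_text_length_features(answer)
not_normalized = hypothesis.calculate_not_normalized_features(answer)

# Both are (1, n) float32 arrays, ready to concatenate for the main model.
features = np.concatenate((normalized, not_normalized), axis=1)
```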
main_model.py ADDED
@@ -0,0 +1,81 @@
+ from device_manager import DeviceManager
+ from transformers import AlbertModel, AlbertTokenizerFast
+ import torch.nn as nn
+ import torch
+ import numpy as np
+
+
+ class AlbertCustomClassificationHead(nn.Module):
+     def __init__(self, albert_model, num_additional_features=25, dropout_rate=0.1):
+         super(AlbertCustomClassificationHead, self).__init__()
+         self.albert_model = albert_model
+         self.dropout = nn.Dropout(dropout_rate)
+         self.classifier = nn.Linear(1024 + num_additional_features, 1)
+
+     def forward(self, input_ids, attention_mask, additional_features, labels=None):
+         albert_output = self.albert_model(
+             input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+         combined_features = torch.cat(
+             [albert_output, additional_features], dim=1)
+
+         dropout_output = self.dropout(combined_features)
+
+         logits = self.classifier(dropout_output)
+
+         if labels is not None:
+             loss_fn = nn.BCEWithLogitsLoss()
+             labels = labels.unsqueeze(1)
+             loss = loss_fn(logits, labels.float())
+             return logits, loss
+         else:
+             return logits
+
+
+ class PredictMainModel:
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(PredictMainModel, cls).__new__(cls)
+             cls._instance.initialize()
+         return cls._instance
+
+     def initialize(self):
+         self.model_name = "albert-large-v2"
+         self.tokenizer = AlbertTokenizerFast.from_pretrained(self.model_name)
+         self.albert_model = AlbertModel.from_pretrained(self.model_name)
+         self.device = DeviceManager()
+
+         self.model = AlbertCustomClassificationHead(
+             self.albert_model).to(self.device)
+         # TODO : CHANGE MODEL STATE DICT PATH
+         self.model.load_state_dict(torch.load("best_model_fold_4.pth", map_location=self.device))
+
+     def preprocess_input(self, text: str, additional_features: np.ndarray):
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=512,
+             return_token_type_ids=False,
+             padding="max_length",
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors="pt"
+         )
+
+         additional_features_tensor = torch.tensor(
+             additional_features, dtype=torch.float)
+
+         return {
+             "input_ids": encoding["input_ids"].to(self.device),
+             "attention_mask": encoding["attention_mask"].to(self.device),
+             "additional_features": additional_features_tensor.to(self.device)
+         }
+
+     def predict(self, text: str, additional_features: np.ndarray) -> float:
+         self.model.eval()
+         with torch.no_grad():
+             data = self.preprocess_input(text, additional_features)
+             logits = self.model(**data)
+             return torch.sigmoid(logits).cpu().numpy()[0][0]
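PredictMainModel wraps ALBERT-large plus the custom head behind the same singleton pattern as the other components. A hypothetical end-to-end call; the `(1, 25)` feature shape matches `num_additional_features=25`, and reading the output as P(AI-written) assumes label 1 means "AI", as in prediction.py's response mapping:

```python
import numpy as np
from main_model import PredictMainModel

model = PredictMainModel()  # singleton: tokenizer, ALBERT, and head load once

text = "Answer text to classify."
additional_features = np.zeros((1, 25), dtype=np.float32)  # placeholder values

probability = model.predict(text, additional_features)
print(f"P(AI-written) = {probability:.4f}")
```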
prediction.py CHANGED
@@ -1,37 +1,12 @@
  from fastapi import FastAPI
  from pydantic import BaseModel
  from hypothesis import BaseModelHypothesis
- from randomforest import RandomForestDependencies
+ from random_forest_dependencies import RandomForestDependencies
+ from random_forest_model import RandomForestModel
+ from main_model import PredictMainModel
  import torch.nn as nn
  import torch
-
-
- class AlbertCustomClassificationHead(nn.Module):
-     def __init__(self, albert_model, dropout_rate=0.1):
-         super(AlbertCustomClassificationHead, self).__init__()
-         self.albert_model = albert_model
-         self.dropout = nn.Dropout(dropout_rate)
-         self.classifier = nn.Linear(1024 + 25, 1)
-
-     def forward(self, input_ids, attention_mask, additional_features, labels=None):
-         albert_output = self.albert_model(
-             input_ids=input_ids, attention_mask=attention_mask).pooler_output
-
-         combined_features = torch.cat(
-             [albert_output, additional_features], dim=1)
-
-         dropout_output = self.dropout(combined_features)
-
-         logits = self.classifier(dropout_output)
-
-         if labels is not None:
-             loss_fn = nn.BCEWithLogitsLoss()
-             labels = labels.unsqueeze(1)
-             loss = loss_fn(logits, labels.float())
-             return logits, loss
-         else:
-             return logits
-
+ import numpy as np

  app = FastAPI()

@@ -60,4 +35,23 @@ async def predict(request: PredictRequest):
      features_not_normalized = hypothesis.calculate_not_normalized_features(
          answer)

-     return request_dict.get("backspace_count")
+     combined_additional_features = np.concatenate(
+         (features_normalized_text_length, features_not_normalized), axis=1)
+
+     main_model = PredictMainModel()
+     main_model_probability = main_model.predict(
+         answer, combined_additional_features)
+
+     random_forest_features = RandomForestDependencies()
+     secondary_model_features = random_forest_features.calculate_features(
+         question, answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
+
+     secondary_model = RandomForestModel()
+     secondary_model_prediction = secondary_model.predict(
+         secondary_model_features)
+
+     return {
+         "main_model_probability": main_model_probability,
+         "final_prediction": secondary_model_prediction,
+         "prediction_class": "AI" if secondary_model_prediction == 1 else "HUMAN"
+     }
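The endpoint now runs the full two-stage pipeline: ALBERT probability first, then the random-forest verdict over behavioral features. A hypothetical client call; the route path and field names are inferred from the handler (the `PredictRequest` model and route decorator sit outside this hunk):

```python
import requests

payload = {
    "question": "Explain polymorphism.",
    "answer": "Polymorphism lets one interface work with many types.",
    "backspace_count": 12,
    "typing_duration": 95,
    "letter_click_counts": {"a": 14, "e": 20, "p": 6},
}

response = requests.post("http://localhost:8000/predict", json=payload)
print(response.json())
# e.g. {"main_model_probability": 0.83, "final_prediction": 1, "prediction_class": "AI"}
```

One serialization caveat: `main_model_probability` is a numpy float32 and `secondary_model_prediction` a numpy integer, so the handler may need explicit `float()`/`int()` casts before returning JSON.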
randomforest.py → random_forest_dependencies.py RENAMED
@@ -1,32 +1,28 @@
- from gemma2b import Gemma2BDependencies
+ from gemma2b_dependencies import Gemma2BDependencies
  from collections import Counter


  class RandomForestDependencies:
-     def __init__(self, question: str, answer: str):
-         self.question = question
-         self.answer = answer
-
-         self.gemma2bdependencies = Gemma2BDependencies(
-             self.question, self.answer)
+     def __init__(self):
+         self.gemma2bdependencies = Gemma2BDependencies()
          self.random_forest_features = []

-     def calculate_features(self, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
+     def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
          cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity(
-             self.question, self.answer)
-         backspace_count_normalized = backspace_count / len(self.answer)
-         typing_duration_normalized = typing_duration / len(self.answer)
+             question, answer)
+         backspace_count_normalized = backspace_count / len(answer)
+         typing_duration_normalized = typing_duration / len(answer)
          letter_discrepancy = self.calculate_letter_discrepancy(
-             self.answer, letter_click_counts)
+             answer, letter_click_counts)

          self.random_forest_features = [
              cosine_similarity, probability, backspace_count_normalized,
              typing_duration_normalized, letter_discrepancy
          ]

-     def calculate_letter_discrepancy(self, letter_click_counts: dict[str, int]):
+     def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
          # Calculate letter frequencies in the text
-         text_letter_counts = Counter(self.answer.lower())
+         text_letter_counts = Counter(text.lower())

          # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
          ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
@@ -35,6 +31,6 @@ class RandomForestDependencies:

          # Average the ratios and normalize by the length of the text
          average_ratio = sum(ratios) / len(ratios)
          discrepancy_ratio_normalized = average_ratio / \
-             (len(self.answer) if len(self.answer) > 0 else 1)
+             (len(text) if len(text) > 0 else 1)

          return discrepancy_ratio_normalized
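RandomForestDependencies now receives the question/answer pair per call rather than at construction. Note that `calculate_features` stores its result on `self.random_forest_features` and returns `None`, while prediction.py assigns its return value; adding `return self.random_forest_features` would reconcile the two. A hypothetical usage sketch with illustrative inputs:

```python
from random_forest_dependencies import RandomForestDependencies

deps = RandomForestDependencies()
deps.calculate_features(
    question="Explain polymorphism.",
    answer="Polymorphism lets one interface work with many types.",
    probability=0.83,              # main-model output
    backspace_count=12,
    typing_duration=95,
    letter_click_counts={"a": 14, "e": 20, "p": 6},
)

# As written, results must be read off the instance:
features = deps.random_forest_features
# [cosine_similarity, probability, backspace_norm, typing_norm, letter_discrepancy]
```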
random_forest_model.py ADDED
@@ -0,0 +1,15 @@
+ import joblib
+ import numpy as np
+ from typing import List
+
+
+ class RandomForestModel:
+     def __init__(self):
+         self.scaler = joblib.load("rf_scaler.joblib")
+         self.model = joblib.load("random_forest.joblib")
+
+     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
+         return self.scaler.transform(np.array(secondary_model_features).astype(np.float32).reshape(1, -1))
+
+     def predict(self, secondary_model_features: List[float]):
+         return self.model.predict(self.preprocess_input(secondary_model_features))[0]
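RandomForestModel is the secondary classifier: it re-scales the five behavioral features and emits the final 0/1 verdict. A minimal sketch, assuming `rf_scaler.joblib` and `random_forest.joblib` are present in the working directory and were fitted on features in the order produced by RandomForestDependencies:

```python
from random_forest_model import RandomForestModel

model = RandomForestModel()
features = [0.74, 0.83, 0.05, 0.40, 0.002]  # illustrative values only
label = model.predict(features)             # 0 or 1; 1 maps to "AI" in prediction.py
print(label)
```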