Commit 1c1651d
Parent(s): aef68de

refactor: change to a higher accuracy model
Files changed:
- core-model-prediction/gemma2b_dependencies.py +1 -20
- core-model-prediction/hypothesis.py +86 -32
- core-model-prediction/main_model.py +42 -24
- core-model-prediction/models/{albert_model.pth → albert_weights.pth} +2 -2
- core-model-prediction/models/random_forest.joblib +0 -0
- core-model-prediction/prediction.py +8 -10
- core-model-prediction/random_forest_dependencies.py +2 -7
- core-model-prediction/random_forest_model.py +8 -1
- core-model-prediction/requirements.txt +2 -2
- core-model-prediction/scalers/rf_scaler.joblib +0 -0
- core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib} +0 -0
- core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib} +0 -0
core-model-prediction/gemma2b_dependencies.py
CHANGED
@@ -43,7 +43,7 @@ class Gemma2BDependencies:
 
     def calculate_burstiness(self, text: str):
         # Tokenize the text using GPT-2 tokenizer
-        tokens = self.tokenizer.
+        tokens = self.tokenizer.encode(text, add_special_tokens=False)
 
         # Count token frequencies
         frequency_counts = list(Counter(tokens).values())
@@ -55,22 +55,3 @@ class Gemma2BDependencies:
         # Compute Variance-to-Mean Ratio (VMR) for burstiness
         vmr = variance / mean if mean > 0 else 0
         return vmr
-
-    def get_embedding(self, text: str):
-        inputs = self.tokenizer(text, return_tensors="pt",
-                                truncation=True, max_length=1024)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.model(**inputs, output_hidden_states=True)
-
-        last_hidden_states = outputs.hidden_states[-1]
-        # Average the token embeddings to get a sentence-level embedding
-        embedding = torch.mean(last_hidden_states, dim=1)
-        return embedding
-
-    def calculate_cosine_similarity(self, question: str, answer: str):
-        embedding1 = self.get_embedding(question)
-        embedding2 = self.get_embedding(answer)
-        # Ensure the embeddings are in the correct shape for cosine_similarity
-        return cosine_similarity(embedding1, embedding2).item()
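The only functional change above is the tokenization call; the removed get_embedding / calculate_cosine_similarity helpers are no longer used downstream. For reference, a minimal, dependency-free sketch of the variance-to-mean-ratio (VMR) burstiness that calculate_burstiness computes, assuming token ids have already been produced by tokenizer.encode(text, add_special_tokens=False):

from collections import Counter

def burstiness_vmr(token_ids):
    # Count how often each token id occurs in the answer
    frequency_counts = list(Counter(token_ids).values())
    if not frequency_counts:
        return 0.0
    mean = sum(frequency_counts) / len(frequency_counts)
    variance = sum((c - mean) ** 2 for c in frequency_counts) / len(frequency_counts)
    # Higher VMR means a few tokens repeat much more often than the rest
    return variance / mean if mean > 0 else 0.0

print(burstiness_vmr([5, 5, 5, 9, 12, 12, 40]))  # ≈ 0.39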
core-model-prediction/hypothesis.py
CHANGED
@@ -3,24 +3,47 @@ import joblib
 import textstat
 import pandas as pd
 import numpy as np
-from 
+from typing import List
+from collections import defaultdict
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from gemma2b_dependencies import Gemma2BDependencies
+from string import punctuation
 
 
 class BaseModelHypothesis:
     def __init__(self):
         nltk.download('punkt')
+        nltk.download('wordnet')
         nltk.download('averaged_perceptron_tagger')
 
         self.analyzer = SentimentIntensityAnalyzer()
         self.lexicon_df = pd.read_csv(
-            "https://storage.googleapis.com/
+            "https://storage.googleapis.com/interview-ai-detector/higher-accuracy-final-model/NRC-Emotion-Lexicon.csv")
         self.emotion_lexicon = self.process_emotion_lexicon()
+        self.lemmatizer = nltk.stem.WordNetLemmatizer()
         self.gemma2bdependencies = Gemma2BDependencies()
 
-        self.
+        self.additional_feature_columns = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
+        ]
+
+        self.features_normalized_text_length = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio"
+        ]
+
+        self.features_not_normalized = [
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "perplexity", "burstiness"
+        ]
 
         self.scaler_normalized_text_length = joblib.load(
             "scalers/scaler-normalized-text-length.joblib")
@@ -35,32 +58,43 @@ class BaseModelHypothesis:
             emotion_lexicon[row["word"]].append(row["emotion"])
         return emotion_lexicon
 
-    def 
-        self.
-        self.features_not_normalized.
-        self.
+    def calculate_features_dataframe(self, text: str) -> np.ndarray:
+        normalized_text_length_features = self.calculate_normalized_text_length_features(
+            text)
+        not_normalized_features = self.calculate_not_normalized_features(text)
+        all_features = normalized_text_length_features + not_normalized_features
+        features_df = pd.DataFrame(
+            [all_features], columns=self.additional_feature_columns)
+
+        # Scaling features
+        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
+            features_df[self.features_normalized_text_length])
+        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
+            features_df[self.features_not_normalized])
+
+        ordered_df = features_df[self.additional_feature_columns]
+
+        return ordered_df.values.astype(np.float32).reshape(1, -1)
+
+    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
+        pos_features = self.extract_pos_features(text)
+        emotion_features = self.calculate_emotion_proportions(text)
+        unique_word_ratio = [self.measure_unique_word_ratio(text)]
+        features = pos_features + emotion_features + unique_word_ratio
+        return features
+
+    def calculate_not_normalized_features(self, text: str) -> List[float]:
+        sentiment_intensity = self.measure_sentiment_intensity(text)
+        readability_scores = self.measure_readability(text)
+        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
+        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
+        features = sentiment_intensity + readability_scores + perplexity + burstiness
+        return features
 
     def extract_pos_features(self, text: str):
         words = nltk.word_tokenize(text)
         pos_tags = nltk.pos_tag(words)
-        desired_tags = ["
+        desired_tags = ["NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB"]
        pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})
 
         for _, pos in pos_tags:
@@ -83,20 +117,37 @@ class BaseModelHypothesis:
 
         return [gunning_fog, smog_index, dale_chall_score]
 
+    def __penn2morphy(self, penntag):
+        morphy_tag = {
+            'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',  # Nouns
+            'JJ': 'a', 'JJR': 'a', 'JJS': 'a',  # Adjectives
+            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',  # Verbs
+            'RB': 'r', 'RBR': 'r', 'RBS': 'r',  # Adverbs
+            # Pronouns, determiners, prepositions, modal verbs
+            'PRP': 'n', 'PRP$': 'n', 'DT': 'n', 'IN': 'n', 'MD': 'v',
+            # Others, treated as nouns unless a better fit is found
+            'CC': 'n', 'CD': 'n', 'EX': 'n', 'FW': 'n', 'POS': 'n', 'TO': 'n', 'WDT': 'n', 'WP': 'n', 'WP$': 'n', 'WRB': 'n', 'PDT': 'n'
+        }
+        return morphy_tag.get(penntag[:2], 'n')
+
     def calculate_emotion_proportions(self, text: str):
         tokens = nltk.word_tokenize(text)
+        tagged_tokens = nltk.pos_tag(tokens)
+
+        lemmas = [self.lemmatizer.lemmatize(
+            token.lower(), pos=self.__penn2morphy(tag)) for token, tag in tagged_tokens]
 
+        total_lemmas = len(lemmas)
 
         emotion_counts = {emotion: 0 for emotion in [
             "negative", "positive", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise"]}
 
-        for 
-            if 
-                for emotion in self.emotion_lexicon[
+        for lemma in lemmas:
+            if lemma in self.emotion_lexicon:
+                for emotion in self.emotion_lexicon[lemma]:
                     emotion_counts[emotion] += 1
 
-        proportions = {emotion: count /
+        proportions = {emotion: count / total_lemmas for emotion,
                        count in emotion_counts.items()}
 
         return [
@@ -105,9 +156,12 @@ class BaseModelHypothesis:
         ]
 
     def measure_unique_word_ratio(self, text: str):
-        tokens = nltk.word_tokenize(text)
+        tokens = nltk.word_tokenize(text.lower())
+
+        tokens = [token for token in tokens if token not in punctuation]
+
         total_words = len(tokens)
 
-        unique_words = len(
+        unique_words = len(set(tokens))
 
         return (unique_words / total_words)
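The core of the new feature pipeline is calculate_features_dataframe: build one row with all 25 columns, scale the length-normalized and non-normalized column groups with their own pre-fitted scalers, then emit a (1, 25) float32 array in the fixed additional_feature_columns order. A reduced sketch of that column-group scaling pattern, with three made-up columns and freshly fitted StandardScalers standing in for the joblib-loaded ones:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

all_columns = ["nn_ratio", "unique_words_ratio", "perplexity"]   # the real commit uses 25 columns
length_normalized = ["nn_ratio", "unique_words_ratio"]           # scaled by one scaler
not_normalized = ["perplexity"]                                  # scaled by the other

# Stand-ins for the persisted scalers/*.joblib objects
scaler_a = StandardScaler().fit(pd.DataFrame([[0.1, 0.3], [0.2, 0.5], [0.3, 0.7]], columns=length_normalized))
scaler_b = StandardScaler().fit(pd.DataFrame([[12.0], [30.0], [55.0]], columns=not_normalized))

features_df = pd.DataFrame([[0.25, 0.6, 42.0]], columns=all_columns)  # one answer's raw features

# Scale each column group in place, then emit the ordered (1, n) float32 array
features_df[length_normalized] = scaler_a.transform(features_df[length_normalized])
features_df[not_normalized] = scaler_b.transform(features_df[not_normalized])
x = features_df[all_columns].values.astype(np.float32).reshape(1, -1)
print(x.shape)  # (1, 3)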
core-model-prediction/main_model.py
CHANGED
@@ -5,31 +5,50 @@ import torch
 import numpy as np
 
 
-class 
-    def __init__(self, albert_model, num_additional_features=25,
-        self.
-        self.
-            labels = labels.unsqueeze(1)
-            loss = loss_fn(logits, labels.float())
-            return logits, loss
-        else:
-            return logits
+class AlbertSeparateTransformation(nn.Module):
+    def __init__(self, albert_model, num_additional_features=25,
+                 hidden_size_albert=512, hidden_size_additional=128, classifier_hidden_size=256,
+                 dropout_rate_albert=0.3, dropout_rate_additional=0.1, dropout_rate_classifier=0.1):
+        super(AlbertSeparateTransformation, self).__init__()
+        self.albert = albert_model
+
+        # Transform ALBERT's features to an intermediate space
+        self.albert_feature_transform = nn.Sequential(
+            nn.Linear(1024, hidden_size_albert),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_albert),
+        )
+
+        # Transform additional features to an intermediate space
+        self.additional_feature_transform = nn.Sequential(
+            nn.Linear(num_additional_features, hidden_size_additional),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_additional),
+        )
+
+        # Combine both transformed features and process for final prediction
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size_albert + hidden_size_additional,
+                      classifier_hidden_size),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_classifier),
+            nn.Linear(classifier_hidden_size, 1)
+        )
+
+    def forward(self, input_ids, attention_mask, additional_features):
+        albert_output = self.albert(
+            input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+        transformed_albert_features = self.albert_feature_transform(
+            albert_output)
+        transformed_additional_features = self.additional_feature_transform(
+            additional_features)
+
+        combined_features = torch.cat(
+            (transformed_albert_features, transformed_additional_features), dim=1)
+
+        logits = self.classifier(combined_features)
+        return logits
 
 
 class PredictMainModel:
@@ -47,10 +66,9 @@ class PredictMainModel:
         self.albert_model = AlbertModel.from_pretrained(self.model_name)
         self.device = DeviceManager()
 
-        self.model = 
+        self.model = AlbertSeparateTransformation(
             self.albert_model).to(self.device)
-
-        self.model.load_state_dict(torch.load("models/albert_model.pth"))
+        self.model.load_state_dict(torch.load("models/albert_weights.pth"))
 
     def preprocess_input(self, text: str, additional_features: np.ndarray):
         encoding = self.tokenizer.encode_plus(
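To see why the new head is wired this way: the pooled ALBERT output (1024-dimensional, per the nn.Linear(1024, …) above) and the 25 hand-crafted features are projected into separate intermediate spaces, concatenated, and classified. A quick shape check of an equivalent fusion head with dummy tensors in place of the encoder (a sketch, not the repo's code):

import torch
import torch.nn as nn

albert_branch = nn.Sequential(nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
extra_branch = nn.Sequential(nn.Linear(25, 128), nn.ReLU(), nn.Dropout(0.1))
classifier = nn.Sequential(
    nn.Linear(512 + 128, 256), nn.ReLU(), nn.Dropout(0.1), nn.Linear(256, 1))

pooled = torch.randn(4, 1024)      # stand-in for albert(...).pooler_output, batch of 4
additional = torch.randn(4, 25)    # the 25 additional features per answer

combined = torch.cat((albert_branch(pooled), extra_branch(additional)), dim=1)
logits = classifier(combined)
print(logits.shape)  # torch.Size([4, 1]) -> one logit per answer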
core-model-prediction/models/{albert_model.pth → albert_weights.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:59184c88c7921ac5f115aa0b10b3224536b5f7d7ebb6cf07fd45eecccfcff3ae
+size 73519347
core-model-prediction/models/random_forest.joblib
CHANGED
Binary files a/core-model-prediction/models/random_forest.joblib and b/core-model-prediction/models/random_forest.joblib differ
core-model-prediction/prediction.py
CHANGED
@@ -40,29 +40,27 @@ def process_instance(data: PredictRequest):
     typing_duration = data.typing_duration
     letter_click_counts = data.letter_click_counts
 
+    # Data preparation for 1st model
     hypothesis = BaseModelHypothesis()
-        answer)
-    features_not_normalized = hypothesis.calculate_not_normalized_features(
-        answer)
-
-    combined_additional_features = np.concatenate(
-        (features_normalized_text_length, features_not_normalized), axis=1)
+    additional_features = hypothesis.calculate_features_dataframe(answer)
 
+    # 1st model prediction
     main_model = PredictMainModel()
     main_model_probability = main_model.predict(
-        answer, 
+        answer, additional_features)
 
+    # Data preparation for 2nd model
     random_forest_features = RandomForestDependencies()
     secondary_model_features = random_forest_features.calculate_features(
-        
+        answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
 
+    # 2nd model prediction
     secondary_model = RandomForestModel()
     secondary_model_prediction = secondary_model.predict(
         secondary_model_features)
 
     return {
-        "
+        "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
         "details": {
             "main_model_probability": str(main_model_probability),
             "final_prediction": secondary_model_prediction
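For reference, a hypothetical response body in the new shape (all values made up): the top-level predicted_class is derived from the secondary model's 0/1 output, while the intermediate pieces stay under details:

response = {
    "predicted_class": "AI",                 # "AI" if the random forest predicts 1, else "HUMAN"
    "details": {
        "main_model_probability": "0.87",    # stringified probability from the ALBERT-based model
        "final_prediction": 1,               # raw label from the secondary (random forest) model
    },
}
print(response["predicted_class"] == "AI")   # True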
core-model-prediction/random_forest_dependencies.py
CHANGED
@@ -3,19 +3,14 @@ from collections import Counter
 
 
 class RandomForestDependencies:
-    def 
-        self.gemma2bdependencies = Gemma2BDependencies()
-
-    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
-        cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity(
-            question, answer)
+    def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
         backspace_count_normalized = backspace_count / len(answer)
         typing_duration_normalized = typing_duration / len(answer)
         letter_discrepancy = self.calculate_letter_discrepancy(
             answer, letter_click_counts)
 
         return [
+            probability, backspace_count_normalized,
             typing_duration_normalized, letter_discrepancy
         ]
 
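With the question/cosine-similarity feature dropped, the secondary feature vector is just the main-model probability plus three keystroke statistics normalized by answer length. A small sketch with made-up telemetry; the letter-discrepancy formula here is illustrative only, since calculate_letter_discrepancy is not shown in this diff:

answer = "I would profile the query and add an index on the join column."
probability = 0.87          # main (ALBERT-based) model output
backspace_count = 14
typing_duration = 95        # seconds
letter_click_counts = {"i": 5, "e": 8, "a": 6}   # hypothetical keystroke telemetry

backspace_count_normalized = backspace_count / len(answer)
typing_duration_normalized = typing_duration / len(answer)

# Illustrative stand-in: clicks recorded vs. letters actually present, per character
clicked = sum(letter_click_counts.values())
present = sum(answer.lower().count(letter) for letter in letter_click_counts)
letter_discrepancy = abs(clicked - present) / len(answer)

features = [probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy]
print(features)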
core-model-prediction/random_forest_model.py
CHANGED
@@ -1,5 +1,6 @@
 import joblib
 import numpy as np
+import pandas as pd
 from typing import List
 
 
@@ -7,9 +8,15 @@ class RandomForestModel:
     def __init__(self):
         self.scaler = joblib.load("scalers/rf_scaler.joblib")
         self.model = joblib.load("models/random_forest.joblib")
+        self.secondary_model_features = [
+            "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
+        ]
 
     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
-        
+        features_df = pd.DataFrame([secondary_model_features], columns=[
+            self.secondary_model_features])
+        features_df = self.scaler.transform(features_df)
+        return features_df.values.astype(np.float32).reshape(1, -1)
 
     def predict(self, secondary_model_features: List[float]):
         return int(self.model.predict(self.preprocess_input(secondary_model_features))[0])
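One thing worth flagging in the committed preprocess_input: columns=[self.secondary_model_features] wraps the column list in an extra pair of brackets, and with scikit-learn's default output settings scaler.transform() already returns a plain ndarray, so the later .values access would fail. A sketch of an equivalent version without those issues, assuming the scaler is a fitted scikit-learn transformer:

import numpy as np
import pandas as pd
from typing import List

def preprocess_input(scaler, feature_names: List[str],
                     secondary_model_features: List[float]) -> np.ndarray:
    # One row, with the flat column names the scaler was fitted on
    features_df = pd.DataFrame([secondary_model_features], columns=feature_names)
    # transform() already yields an ndarray; no .values needed afterwards
    scaled = scaler.transform(features_df)
    return scaled.astype(np.float32).reshape(1, -1)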
core-model-prediction/requirements.txt
CHANGED
@@ -2,8 +2,8 @@ nltk
 vaderSentiment
 pandas
 textstat
-scikit-learn==1.
-transformers
+scikit-learn==1.2.2
+transformers==4.38.2
 fastapi
 uvicorn
 google-cloud-secret-manager
core-model-prediction/scalers/rf_scaler.joblib
CHANGED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/rf_scaler.joblib differ
core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-normalized-text-length.joblib and b/core-model-prediction/scalers/torch-scaler-normalized-text-length.joblib differ
core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-not-normalized.joblib and b/core-model-prediction/scalers/torch-scaler-not-normalized.joblib differ