Commit 1c1651d
Parent(s): aef68de

refactor: change to a higher accuracy model
Files changed:
- core-model-prediction/gemma2b_dependencies.py +1 -20
- core-model-prediction/hypothesis.py +86 -32
- core-model-prediction/main_model.py +42 -24
- core-model-prediction/models/{albert_model.pth → albert_weights.pth} +2 -2
- core-model-prediction/models/random_forest.joblib +0 -0
- core-model-prediction/prediction.py +8 -10
- core-model-prediction/random_forest_dependencies.py +2 -7
- core-model-prediction/random_forest_model.py +8 -1
- core-model-prediction/requirements.txt +2 -2
- core-model-prediction/scalers/rf_scaler.joblib +0 -0
- core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib} +0 -0
- core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib} +0 -0
core-model-prediction/gemma2b_dependencies.py
CHANGED
@@ -43,7 +43,7 @@ class Gemma2BDependencies:
 
     def calculate_burstiness(self, text: str):
         # Tokenize the text using GPT-2 tokenizer
-        tokens = self.tokenizer.
+        tokens = self.tokenizer.encode(text, add_special_tokens=False)
 
         # Count token frequencies
         frequency_counts = list(Counter(tokens).values())
@@ -55,22 +55,3 @@ class Gemma2BDependencies:
         # Compute Variance-to-Mean Ratio (VMR) for burstiness
         vmr = variance / mean if mean > 0 else 0
         return vmr
-
-    def get_embedding(self, text: str):
-        inputs = self.tokenizer(text, return_tensors="pt",
-                                truncation=True, max_length=1024)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.model(**inputs, output_hidden_states=True)
-
-        last_hidden_states = outputs.hidden_states[-1]
-        # Average the token embeddings to get a sentence-level embedding
-        embedding = torch.mean(last_hidden_states, dim=1)
-        return embedding
-
-    def calculate_cosine_similarity(self, question: str, answer: str):
-        embedding1 = self.get_embedding(question)
-        embedding2 = self.get_embedding(answer)
-        # Ensure the embeddings are in the correct shape for cosine_similarity
-        return cosine_similarity(embedding1, embedding2).item()
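The only functional change above is the tokenization call; the removed get_embedding / calculate_cosine_similarity helpers are no longer used downstream. For reference, a minimal, dependency-free sketch of the variance-to-mean-ratio (VMR) burstiness that calculate_burstiness computes, assuming token ids have already been produced by tokenizer.encode(text, add_special_tokens=False):

from collections import Counter

def burstiness_vmr(token_ids):
    # Count how often each token id occurs in the answer
    frequency_counts = list(Counter(token_ids).values())
    if not frequency_counts:
        return 0.0
    mean = sum(frequency_counts) / len(frequency_counts)
    variance = sum((c - mean) ** 2 for c in frequency_counts) / len(frequency_counts)
    # Higher VMR means a few tokens repeat much more often than the rest
    return variance / mean if mean > 0 else 0.0

print(burstiness_vmr([5, 5, 5, 9, 12, 12, 40]))  # ≈ 0.39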
core-model-prediction/hypothesis.py
CHANGED
@@ -3,24 +3,47 @@ import joblib
 import textstat
 import pandas as pd
 import numpy as np
-from 
+from typing import List
+from collections import defaultdict
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from gemma2b_dependencies import Gemma2BDependencies
+from string import punctuation
 
 
 class BaseModelHypothesis:
     def __init__(self):
         nltk.download('punkt')
+        nltk.download('wordnet')
         nltk.download('averaged_perceptron_tagger')
 
         self.analyzer = SentimentIntensityAnalyzer()
         self.lexicon_df = pd.read_csv(
-            "https://storage.googleapis.com/
+            "https://storage.googleapis.com/interview-ai-detector/higher-accuracy-final-model/NRC-Emotion-Lexicon.csv")
         self.emotion_lexicon = self.process_emotion_lexicon()
+        self.lemmatizer = nltk.stem.WordNetLemmatizer()
         self.gemma2bdependencies = Gemma2BDependencies()
 
-        self.
+        self.additional_feature_columns = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
+        ]
+
+        self.features_normalized_text_length = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio"
+        ]
+
+        self.features_not_normalized = [
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "perplexity", "burstiness"
+        ]
 
         self.scaler_normalized_text_length = joblib.load(
             "scalers/scaler-normalized-text-length.joblib")
@@ -35,32 +58,43 @@ class BaseModelHypothesis:
             emotion_lexicon[row["word"]].append(row["emotion"])
         return emotion_lexicon
 
-    def 
-        self.
-        self.features_not_normalized.
-        self.
+    def calculate_features_dataframe(self, text: str) -> np.ndarray:
+        normalized_text_length_features = self.calculate_normalized_text_length_features(
+            text)
+        not_normalized_features = self.calculate_not_normalized_features(text)
+        all_features = normalized_text_length_features + not_normalized_features
+        features_df = pd.DataFrame(
+            [all_features], columns=self.additional_feature_columns)
+
+        # Scaling features
+        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
+            features_df[self.features_normalized_text_length])
+        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
+            features_df[self.features_not_normalized])
+
+        ordered_df = features_df[self.additional_feature_columns]
+
+        return ordered_df.values.astype(np.float32).reshape(1, -1)
+
+    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
+        pos_features = self.extract_pos_features(text)
+        emotion_features = self.calculate_emotion_proportions(text)
+        unique_word_ratio = [self.measure_unique_word_ratio(text)]
+        features = pos_features + emotion_features + unique_word_ratio
+        return features
+
+    def calculate_not_normalized_features(self, text: str) -> List[float]:
+        sentiment_intensity = self.measure_sentiment_intensity(text)
+        readability_scores = self.measure_readability(text)
+        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
+        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
+        features = sentiment_intensity + readability_scores + perplexity + burstiness
+        return features
 
     def extract_pos_features(self, text: str):
         words = nltk.word_tokenize(text)
         pos_tags = nltk.pos_tag(words)
-        desired_tags = ["
+        desired_tags = ["NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB"]
        pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})
 
         for _, pos in pos_tags:
@@ -83,20 +117,37 @@ class BaseModelHypothesis:
 
         return [gunning_fog, smog_index, dale_chall_score]
 
+    def __penn2morphy(self, penntag):
+        morphy_tag = {
+            'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',  # Nouns
+            'JJ': 'a', 'JJR': 'a', 'JJS': 'a',  # Adjectives
+            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',  # Verbs
+            'RB': 'r', 'RBR': 'r', 'RBS': 'r',  # Adverbs
+            # Pronouns, determiners, prepositions, modal verbs
+            'PRP': 'n', 'PRP$': 'n', 'DT': 'n', 'IN': 'n', 'MD': 'v',
+            # Others, treated as nouns unless a better fit is found
+            'CC': 'n', 'CD': 'n', 'EX': 'n', 'FW': 'n', 'POS': 'n', 'TO': 'n', 'WDT': 'n', 'WP': 'n', 'WP$': 'n', 'WRB': 'n', 'PDT': 'n'
+        }
+        return morphy_tag.get(penntag[:2], 'n')
+
     def calculate_emotion_proportions(self, text: str):
         tokens = nltk.word_tokenize(text)
+        tagged_tokens = nltk.pos_tag(tokens)
+
+        lemmas = [self.lemmatizer.lemmatize(
+            token.lower(), pos=self.__penn2morphy(tag)) for token, tag in tagged_tokens]
 
+        total_lemmas = len(lemmas)
 
         emotion_counts = {emotion: 0 for emotion in [
             "negative", "positive", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise"]}
 
-        for 
-            if 
-                for emotion in self.emotion_lexicon[
+        for lemma in lemmas:
+            if lemma in self.emotion_lexicon:
+                for emotion in self.emotion_lexicon[lemma]:
                     emotion_counts[emotion] += 1
 
-        proportions = {emotion: count /
+        proportions = {emotion: count / total_lemmas for emotion,
                        count in emotion_counts.items()}
 
         return [
@@ -105,9 +156,12 @@ class BaseModelHypothesis:
         ]
 
     def measure_unique_word_ratio(self, text: str):
-        tokens = nltk.word_tokenize(text)
+        tokens = nltk.word_tokenize(text.lower())
+
+        tokens = [token for token in tokens if token not in punctuation]
+
         total_words = len(tokens)
 
-        unique_words = len(
+        unique_words = len(set(tokens))
 
         return (unique_words / total_words)
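The core of the new feature pipeline is calculate_features_dataframe: build one row with all 25 columns, scale the length-normalized and non-normalized column groups with their own pre-fitted scalers, then emit a (1, 25) float32 array in the fixed additional_feature_columns order. A reduced sketch of that column-group scaling pattern, with three made-up columns and freshly fitted StandardScalers standing in for the joblib-loaded ones:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

all_columns = ["nn_ratio", "unique_words_ratio", "perplexity"]   # the real commit uses 25 columns
length_normalized = ["nn_ratio", "unique_words_ratio"]           # scaled by one scaler
not_normalized = ["perplexity"]                                  # scaled by the other

# Stand-ins for the persisted scalers/*.joblib objects
scaler_a = StandardScaler().fit(pd.DataFrame([[0.1, 0.3], [0.2, 0.5], [0.3, 0.7]], columns=length_normalized))
scaler_b = StandardScaler().fit(pd.DataFrame([[12.0], [30.0], [55.0]], columns=not_normalized))

features_df = pd.DataFrame([[0.25, 0.6, 42.0]], columns=all_columns)  # one answer's raw features

# Scale each column group in place, then emit the ordered (1, n) float32 array
features_df[length_normalized] = scaler_a.transform(features_df[length_normalized])
features_df[not_normalized] = scaler_b.transform(features_df[not_normalized])
x = features_df[all_columns].values.astype(np.float32).reshape(1, -1)
print(x.shape)  # (1, 3)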
core-model-prediction/main_model.py
CHANGED
@@ -5,31 +5,50 @@ import torch
 import numpy as np
 
 
-class 
-    def __init__(self, albert_model, num_additional_features=25,
-        self.
-        self.
-            labels = labels.unsqueeze(1)
-            loss = loss_fn(logits, labels.float())
-            return logits, loss
-        else:
-            return logits
+class AlbertSeparateTransformation(nn.Module):
+    def __init__(self, albert_model, num_additional_features=25,
+                 hidden_size_albert=512, hidden_size_additional=128, classifier_hidden_size=256,
+                 dropout_rate_albert=0.3, dropout_rate_additional=0.1, dropout_rate_classifier=0.1):
+        super(AlbertSeparateTransformation, self).__init__()
+        self.albert = albert_model
+
+        # Transform ALBERT's features to an intermediate space
+        self.albert_feature_transform = nn.Sequential(
+            nn.Linear(1024, hidden_size_albert),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_albert),
+        )
+
+        # Transform additional features to an intermediate space
+        self.additional_feature_transform = nn.Sequential(
+            nn.Linear(num_additional_features, hidden_size_additional),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_additional),
+        )
+
+        # Combine both transformed features and process for final prediction
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size_albert + hidden_size_additional,
+                      classifier_hidden_size),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_classifier),
+            nn.Linear(classifier_hidden_size, 1)
+        )
+
+    def forward(self, input_ids, attention_mask, additional_features):
+        albert_output = self.albert(
+            input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+        transformed_albert_features = self.albert_feature_transform(
+            albert_output)
+        transformed_additional_features = self.additional_feature_transform(
+            additional_features)
+
+        combined_features = torch.cat(
+            (transformed_albert_features, transformed_additional_features), dim=1)
+
+        logits = self.classifier(combined_features)
+        return logits
 
 
 class PredictMainModel:
@@ -47,10 +66,9 @@ class PredictMainModel:
         self.albert_model = AlbertModel.from_pretrained(self.model_name)
         self.device = DeviceManager()
 
-        self.model = 
+        self.model = AlbertSeparateTransformation(
             self.albert_model).to(self.device)
-
-        self.model.load_state_dict(torch.load("models/albert_model.pth"))
+        self.model.load_state_dict(torch.load("models/albert_weights.pth"))
 
     def preprocess_input(self, text: str, additional_features: np.ndarray):
         encoding = self.tokenizer.encode_plus(
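To see why the new head is wired this way: the pooled ALBERT output (1024-dimensional, per the nn.Linear(1024, …) above) and the 25 hand-crafted features are projected into separate intermediate spaces, concatenated, and classified. A quick shape check of an equivalent fusion head with dummy tensors in place of the encoder (a sketch, not the repo's code):

import torch
import torch.nn as nn

albert_branch = nn.Sequential(nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
extra_branch = nn.Sequential(nn.Linear(25, 128), nn.ReLU(), nn.Dropout(0.1))
classifier = nn.Sequential(
    nn.Linear(512 + 128, 256), nn.ReLU(), nn.Dropout(0.1), nn.Linear(256, 1))

pooled = torch.randn(4, 1024)      # stand-in for albert(...).pooler_output, batch of 4
additional = torch.randn(4, 25)    # the 25 additional features per answer

combined = torch.cat((albert_branch(pooled), extra_branch(additional)), dim=1)
logits = classifier(combined)
print(logits.shape)  # torch.Size([4, 1]) -> one logit per answer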
core-model-prediction/models/{albert_model.pth → albert_weights.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:59184c88c7921ac5f115aa0b10b3224536b5f7d7ebb6cf07fd45eecccfcff3ae
+size 73519347
core-model-prediction/models/random_forest.joblib
CHANGED
Binary files a/core-model-prediction/models/random_forest.joblib and b/core-model-prediction/models/random_forest.joblib differ
core-model-prediction/prediction.py
CHANGED
@@ -40,29 +40,27 @@ def process_instance(data: PredictRequest):
     typing_duration = data.typing_duration
     letter_click_counts = data.letter_click_counts
 
+    # Data preparation for 1st model
     hypothesis = BaseModelHypothesis()
-        answer)
-    features_not_normalized = hypothesis.calculate_not_normalized_features(
-        answer)
-
-    combined_additional_features = np.concatenate(
-        (features_normalized_text_length, features_not_normalized), axis=1)
+    additional_features = hypothesis.calculate_features_dataframe(answer)
 
+    # 1st model prediction
     main_model = PredictMainModel()
     main_model_probability = main_model.predict(
-        answer, 
+        answer, additional_features)
 
+    # Data preparation for 2nd model
     random_forest_features = RandomForestDependencies()
     secondary_model_features = random_forest_features.calculate_features(
-        
+        answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
 
+    # 2nd model prediction
     secondary_model = RandomForestModel()
     secondary_model_prediction = secondary_model.predict(
         secondary_model_features)
 
     return {
-        "
+        "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
         "details": {
             "main_model_probability": str(main_model_probability),
             "final_prediction": secondary_model_prediction
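For reference, a hypothetical response body in the new shape (all values made up): the top-level predicted_class is derived from the secondary model's 0/1 output, while the intermediate pieces stay under details:

response = {
    "predicted_class": "AI",                 # "AI" if the random forest predicts 1, else "HUMAN"
    "details": {
        "main_model_probability": "0.87",    # stringified probability from the ALBERT-based model
        "final_prediction": 1,               # raw label from the secondary (random forest) model
    },
}
print(response["predicted_class"] == "AI")   # True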
core-model-prediction/random_forest_dependencies.py
CHANGED
@@ -3,19 +3,14 @@ from collections import Counter
 
 
 class RandomForestDependencies:
-    def 
-        self.gemma2bdependencies = Gemma2BDependencies()
-
-    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
-        cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity(
-            question, answer)
+    def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
         backspace_count_normalized = backspace_count / len(answer)
         typing_duration_normalized = typing_duration / len(answer)
         letter_discrepancy = self.calculate_letter_discrepancy(
             answer, letter_click_counts)
 
         return [
+            probability, backspace_count_normalized,
             typing_duration_normalized, letter_discrepancy
         ]
 
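With the question/cosine-similarity feature dropped, the secondary feature vector is just the main-model probability plus three keystroke statistics normalized by answer length. A small sketch with made-up telemetry; the letter-discrepancy formula here is illustrative only, since calculate_letter_discrepancy is not shown in this diff:

answer = "I would profile the query and add an index on the join column."
probability = 0.87          # main (ALBERT-based) model output
backspace_count = 14
typing_duration = 95        # seconds
letter_click_counts = {"i": 5, "e": 8, "a": 6}   # hypothetical keystroke telemetry

backspace_count_normalized = backspace_count / len(answer)
typing_duration_normalized = typing_duration / len(answer)

# Illustrative stand-in: clicks recorded vs. letters actually present, per character
clicked = sum(letter_click_counts.values())
present = sum(answer.lower().count(letter) for letter in letter_click_counts)
letter_discrepancy = abs(clicked - present) / len(answer)

features = [probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy]
print(features)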
core-model-prediction/random_forest_model.py
CHANGED
@@ -1,5 +1,6 @@
 import joblib
 import numpy as np
+import pandas as pd
 from typing import List
 
 
@@ -7,9 +8,15 @@ class RandomForestModel:
     def __init__(self):
         self.scaler = joblib.load("scalers/rf_scaler.joblib")
         self.model = joblib.load("models/random_forest.joblib")
+        self.secondary_model_features = [
+            "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
+        ]
 
     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
-        
+        features_df = pd.DataFrame([secondary_model_features], columns=[
+            self.secondary_model_features])
+        features_df = self.scaler.transform(features_df)
+        return features_df.values.astype(np.float32).reshape(1, -1)
 
     def predict(self, secondary_model_features: List[float]):
         return int(self.model.predict(self.preprocess_input(secondary_model_features))[0])
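One thing worth flagging in the committed preprocess_input: columns=[self.secondary_model_features] wraps the column list in an extra pair of brackets, and with scikit-learn's default output settings scaler.transform() already returns a plain ndarray, so the later .values access would fail. A sketch of an equivalent version without those issues, assuming the scaler is a fitted scikit-learn transformer:

import numpy as np
import pandas as pd
from typing import List

def preprocess_input(scaler, feature_names: List[str],
                     secondary_model_features: List[float]) -> np.ndarray:
    # One row, with the flat column names the scaler was fitted on
    features_df = pd.DataFrame([secondary_model_features], columns=feature_names)
    # transform() already yields an ndarray; no .values needed afterwards
    scaled = scaler.transform(features_df)
    return scaled.astype(np.float32).reshape(1, -1)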
core-model-prediction/requirements.txt
CHANGED
@@ -2,8 +2,8 @@ nltk
 vaderSentiment
 pandas
 textstat
-scikit-learn==1.
-transformers
+scikit-learn==1.2.2
+transformers==4.38.2
 fastapi
 uvicorn
 google-cloud-secret-manager
core-model-prediction/scalers/rf_scaler.joblib
CHANGED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/rf_scaler.joblib differ
core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-normalized-text-length.joblib and b/core-model-prediction/scalers/torch-scaler-normalized-text-length.joblib differ
core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-not-normalized.joblib and b/core-model-prediction/scalers/torch-scaler-not-normalized.joblib differ