import os
import sys

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Make the project's src/ directory importable so the local data helpers resolve.
src_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "src"))
sys.path.append(src_directory)
from data import sample_data

# Sentence-embedding model used to turn SMS messages into dense vectors.
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Module-level state: the embedding model plus a lazily trained classifier.
encoding_model = model
logreg_model = None
X_train_embeddings = None

# Path is relative to the project root; load the labelled SMS dataset.
file_path = r"src/data/sms_process_data_main.xlsx"
df = sample_data.get_data_frame(file_path)

def train_model():
    """Train the logistic-regression classifier on embedded SMS messages (idempotent)."""
    global logreg_model, X_train_embeddings

    if logreg_model is None:
        # Hold out 20% of the data; only the training split is embedded and fitted here.
        X_train, X_test, y_train, y_test = train_test_split(
            df['MessageText'], df['label'], test_size=0.2, random_state=42
        )
        X_train_embeddings = encoding_model.encode(X_train.tolist())

        # A higher max_iter avoids convergence warnings on the high-dimensional embeddings.
        logreg_model = LogisticRegression(max_iter=1000)
        logreg_model.fit(X_train_embeddings, y_train)

def get_label(message):
    """Return the predicted label for a single message along with its embedding."""
    if logreg_model is None:
        raise ValueError("Model has not been trained yet. Please call train_model first.")

    # Embed the incoming message and classify it.
    new_embeddings = encoding_model.encode([message])
    prediction = logreg_model.predict(new_embeddings)

    # Expose the raw embedding as a single-column DataFrame, keyed by its dimensionality.
    no_of_dimensions = len(new_embeddings[0])
    dimension_df = pd.DataFrame(new_embeddings[0], columns=["Dimension"])
    return {"Prediction": prediction[0], "Prediction_Dimension": {no_of_dimensions: dimension_df}}