try:
    import re

    import torch
    import pandas as pd
    import streamlit as st
    from transformers import BertTokenizer, BertModel

    from model import IndoBERTBiLSTM, IndoBERTModel
except Exception as e:
    print(e)

STYLE = """
"""

# Config
MAX_SEQ_LEN = 128
# bert_path = 'indolem/indobert-base-uncased'
bert_path = 'indobenchmark/indobert-base-p2'
MODELS_PATH = ["kadabengaran/IndoBERT-Useful-App-Review",
               "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"]
MODELS_NAME = ["IndoBERT-BiLSTM", "IndoBERT"]
LABELS = {'Not Useful': 0, 'Useful': 1}

HIDDEN_DIM = 768
OUTPUT_DIM = 2  # 2 for binary classification
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2


# Reverse-lookup: return the key in my_dict whose value equals val
def get_key(val, my_dict):
    for key, value in my_dict.items():
        if val == value:
            return key


def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


def load_tokenizer(model_path):
    return BertTokenizer.from_pretrained(model_path)


def remove_special_characters(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # drop punctuation and symbols
    text = re.sub(r'[0-9]', ' ', text)          # drop digits
    text = re.sub(r'\s+', ' ', text)            # collapse repeated whitespace into a single space
    return text.lower()


def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
    return tokenizer.encode_plus(text,
                                 add_special_tokens=True,
                                 max_length=max_seq,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')


def load_model():
    bert = BertModel.from_pretrained(bert_path)
    # Load the two fine-tuned classifiers on top of the shared IndoBERT encoder
    model_combined = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[0], bert, HIDDEN_DIM,
                                                    OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
    model_base = IndoBERTModel.from_pretrained(MODELS_PATH[1], bert, OUTPUT_DIM)
    return model_combined, model_base


def predict_single(text, model, tokenizer, device):
    if device.type == 'cuda':
        model.cuda()

    # We need token IDs and an attention mask for inference on the new sentence
    test_ids = []
    test_attention_mask = []

    # Apply preprocessing to the new sentence
    new_sentence = remove_special_characters(text)
    encoding = preprocess(new_sentence, tokenizer)

    # Extract IDs and attention mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim=0)
    test_attention_mask = torch.cat(test_attention_mask, dim=0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
        outputs = model(test_ids.to(device), test_attention_mask.to(device))
    print("logits:", outputs)
    predictions = torch.argmax(outputs, dim=-1)
    print("prediction:", predictions)
    return predictions.item()


def predict_multiple(data, model, tokenizer, device):
    if device.type == 'cuda':
        model.cuda()  # move the model to the GPU, as in predict_single

    input_ids = []
    attention_masks = []
    for row in data.tolist():
        # Clean and tokenize each review text
        text = remove_special_characters(row)
        encoding = preprocess(text, tokenizer)
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    predictions = []
    with torch.no_grad():
        for i in range(len(input_ids)):
            test_ids = input_ids[i]
            test_attention_mask = attention_masks[i]
            outputs = model(test_ids.to(device), test_attention_mask.to(device))
            prediction = torch.argmax(outputs, dim=-1)
            prediction_label = get_key(prediction.item(), LABELS)
            predictions.append(prediction_label)
    return predictions


tab_labels = ["Single Input", "Multiple Input"]
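
# Example (sketch, not part of the app): how the helpers above compose for one
# prediction outside Streamlit. It assumes the checkpoints in MODELS_PATH are
# downloadable and that model.py defines the imported classes; the review text
# is an arbitrary Indonesian placeholder ("this app is very helpful").
#
#   tokenizer = load_tokenizer(bert_path)
#   device = get_device()
#   model_combined, model_base = load_model()
#   label_id = predict_single("aplikasi ini sangat membantu", model_combined, tokenizer, device)
#   print(get_key(label_id, LABELS))  # -> 'Useful' or 'Not Useful'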

class App:
    print("Loading All")

    def __init__(self):
        self.fileTypes = ["csv"]
        self.default_tab_selected = tab_labels[0]
        self.input_text = None
        self.csv_input = None
        self.csv_process = None

    def run(self):
        self.init_session_state()  # Initialize session state
        tokenizer = load_tokenizer(bert_path)
        device = get_device()
        model_combined, model_base = load_model()

        """App Review Classifier"""
        html_temp = """