try: import torch import pandas as pd import streamlit as st import re import streamlit as st from transformers import BertTokenizer, AutoConfig from model import IndoBERTBiLSTM, IndoBERTModel except Exception as e: print(e) STYLE = """ """ footer="""
""" # Config MAX_SEQ_LEN = 128 MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review" LABELS = {'Not Useful': 0, 'Useful': 1} def get_device(): if torch.cuda.is_available(): return torch.device('cuda') else: return torch.device('cpu') USE_CUDA = True device = get_device() if device.type == 'cuda': USE_CUDA = True # Get the Keys def get_key(val, my_dict): for key, value in my_dict.items(): if val == value: return key def load_tokenizer(model_path): tokenizer = BertTokenizer.from_pretrained(model_path) return tokenizer def remove_special_characters(text): text = re.sub(r'[^a-zA-Z0-9\s]', '', text) text = re.sub(r"\s+", " ", text) # replace multiple whitespace characters with a single space text = re.sub(r'[0-9]', ' ', text) #remove number text = text.lower() return text def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN): return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt' ) def load_model(): model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH) return model def predict_single(text, model, tokenizer, device): if device.type == 'cuda': model.cuda() # We need Token IDs and Attention Mask for inference on the new sentence test_ids = [] test_attention_mask = [] # Apply preprocessing to the new sentence new_sentence = remove_special_characters(text) encoding = preprocess(new_sentence, tokenizer) # Extract IDs and Attention Mask test_ids.append(encoding['input_ids']) test_attention_mask.append(encoding['attention_mask']) test_ids = torch.cat(test_ids, dim=0) test_attention_mask = torch.cat(test_attention_mask, dim=0) # Forward pass, calculate logit predictions with torch.no_grad(): outputs = model(test_ids.to(device), test_attention_mask.to(device)) print("output ", outputs) predictions = torch.argmax(outputs, dim=-1) print("output ", predictions) return predictions.item() def predict_multiple(data, model, tokenizer, device): if device.type == 'cuda': model.cuda() input_ids = [] attention_masks = [] for row in data.tolist(): # Apply remove_special_characters function to title column text = remove_special_characters(row) text = preprocess(text, tokenizer) input_ids.append(text['input_ids']) attention_masks.append(text['attention_mask']) predictions = [] with torch.no_grad(): for i in range(len(input_ids)): test_ids = input_ids[i] test_attention_mask = attention_masks[i] outputs = model(test_ids.to(device), test_attention_mask.to(device)) prediction = torch.argmax(outputs, dim= -1) prediction_label = get_key(prediction.item(), LABELS) predictions.append(prediction_label) return predictions tab_labels = ["Single Input", "Multiple Input"] class App: print("Loading All") def __init__(self): self.fileTypes = ["csv"] self.default_tab_selected = tab_labels[0] self.input_text = None self.csv_input = None self.csv_process = None def run(self): self.init_session_state() # Initialize session state tokenizer = load_tokenizer(MODELS_PATH) model = load_model() """App Review Classifier""" html_temp = """