Commit e90dd4b
Parent: afe677f
Author: araberta

Files changed:
- inference.py (+89 -32)
- model.py → modeling_bilstm_crf.py (+21 -10)

inference.py
CHANGED
@@ -1,35 +1,96 @@
 import torch
-
-from
+import json
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, AutoConfig
+from peft import LoraConfig, get_peft_model
+from modeling_bilstm_crf import BERT_BiLSTM_CRF
 from seq2seq_inference import infer_t5_prompt, infer_mBart_prompt
-from
-
-# Define supported models and their adapter IDs
-MODEL_OPTIONS = {
-    "Araberta": {
-        "base": "asmashayea/absa-araberta",
-        "adapter": "asmashayea/absa-araberta"
-    },
-    "mT5": {
-        "base": "google/mt5-base",
-        "adapter": "asmashayea/mt4-absa"
-    },
-    "mBART": {
-        "base": "facebook/mbart-large-50-many-to-many-mmt",
-        "adapter": "asmashayea/mbart-absa"
-    },
-    "GPT3.5": {
-        "base": "bigscience/bloom-560m",  # example, not ideal for ABSA
-        "adapter": "asmashayea/gpt-absa"
-    },
-    "GPT4o": {
-        "base": "bigscience/bloom-560m",  # example, not ideal for ABSA
-        "adapter": "asmashayea/gpt-absa"
-    }
-}
+from peft import LoraConfig, get_peft_model, PeftModel
+from modeling_bilstm_crf import BERT_BiLSTM_CRF
 
 cached_models = {}
 
+def load_araberta():
+    path = "asmashayea/absa-arabert"
+
+    tokenizer = AutoTokenizer.from_pretrained(path)
+    base_model = AutoModel.from_pretrained(path)
+    lora_config = LoraConfig.from_pretrained(path)
+    lora_model = get_peft_model(base_model, lora_config)
+
+    config = AutoConfig.from_pretrained(path)
+    model = BERT_BiLSTM_CRF(lora_model, config)
+    model.load_state_dict(torch.load("bilstm_crf_head.pt"))
+    model.eval()
+
+    cached_models["Araberta"] = (tokenizer, model)
+    return tokenizer, model
+
+
+def infer_araberta(text):
+    if "Araberta" not in cached_models:
+        tokenizer, model = load_araberta()
+    else:
+        tokenizer, model = cached_models["Araberta"]
+
+    device = next(model.parameters()).device
+
+    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        predicted_ids = outputs['logits'][0].cpu().tolist()
+
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
+    predicted_labels = [model.id2label.get(p, 'O') for p in predicted_ids]  # id2label is set in BERT_BiLSTM_CRF.__init__
+
+    clean_tokens = [t for t in tokens if t not in tokenizer.all_special_tokens]
+    clean_labels = [l for t, l in zip(tokens, predicted_labels) if t not in tokenizer.all_special_tokens]
+
+    # Horizontal output
+    pairs = [f"{token}: {label}" for token, label in zip(clean_tokens, clean_labels)]
+    horizontal_output = " | ".join(pairs)
+
+    # Group by aspect span
+    aspects = []
+    current_tokens = []
+    current_sentiment = None
+
+    for token, label in zip(clean_tokens, clean_labels):
+        if label.startswith("B-"):
+            if current_tokens:
+                aspects.append({
+                    "aspect": " ".join(current_tokens).replace("##", ""),
+                    "sentiment": current_sentiment
+                })
+            current_tokens = [token]
+            current_sentiment = label.split("-")[1]
+        elif label.startswith("I-") and current_sentiment == label.split("-")[1]:
+            current_tokens.append(token)
+        else:
+            if current_tokens:
+                aspects.append({
+                    "aspect": " ".join(current_tokens).replace("##", ""),
+                    "sentiment": current_sentiment
+                })
+            current_tokens = []
+            current_sentiment = None
+
+    if current_tokens:
+        aspects.append({
+            "aspect": " ".join(current_tokens).replace("##", ""),
+            "sentiment": current_sentiment
+        })
+
+    return {
+        "token_predictions": horizontal_output,
+        "aspects": aspects
+    }
+
+
 def load_model(model_key):
     if model_key in cached_models:
         return cached_models[model_key]
@@ -47,8 +108,6 @@ def load_model(model_key):
 
 def predict_absa(text, model_choice):
 
-
-
 
     if model_choice == 'mT5':
        tokenizer, model = load_model(model_choice)
@@ -60,9 +119,7 @@ def predict_absa(text, model_choice):
 
    elif model_choice == 'Araberta':
 
-
-        tokenizer = AutoTokenizer.from_pretrained("asmashayea/absa-araberta")
-        decoded = infer_mBart_prompt(text, tokenizer, model)
+        decoded = infer_araberta(text)
 
 
    # prompt = f"استخرج الجوانب والآراء والمشاعر من النص التالي:\n{text}"
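For quick sanity-checking, a minimal usage sketch of the new infer_araberta() entry point follows. The sample sentence and the surrounding harness are illustrative assumptions; only the return structure ("token_predictions" plus a list of {"aspect", "sentiment"} dicts) comes from the code above.

# Hypothetical smoke test for the new Araberta path (assumes the Space's
# dependencies and the local bilstm_crf_head.pt checkpoint are available).
from inference import infer_araberta

result = infer_araberta("الخدمة ممتازة لكن الأسعار مرتفعة")  # "the service is excellent but the prices are high"

# Flat per-token view: "token: label | token: label | ..."
print(result["token_predictions"])

# Grouped B-/I- spans with their sentiment polarity
for item in result["aspects"]:
    print(item["aspect"], "->", item["sentiment"])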
model.py → modeling_bilstm_crf.py
RENAMED
@@ -3,32 +3,43 @@ import torch.nn as nn
 from torchcrf import CRF
 
 class BERT_BiLSTM_CRF(nn.Module):
-    def __init__(self, base_model,
+    def __init__(self, base_model, config, dropout_rate=0.2, rnn_dim=256):
         super().__init__()
         self.bert = base_model
+        self.label2id = config.label2id  # <-- pulled from config
+        self.id2label = config.id2label
+        self.num_labels = config.num_labels
+
         self.bilstm = nn.LSTM(
-
-
+            self.bert.config.hidden_size,
+            rnn_dim,
             num_layers=2,
             batch_first=True,
             bidirectional=True,
-            dropout=
+            dropout=0.2
         )
         self.dropout = nn.Dropout(dropout_rate)
-        self.classifier = nn.Linear(rnn_dim * 2, num_labels)
-        self.crf = CRF(num_labels, batch_first=True)
+        self.classifier = nn.Linear(rnn_dim * 2, self.num_labels)
+        self.crf = CRF(self.num_labels, batch_first=True)
 
     def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
-
-
+        outputs = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids
+        )
+        lstm_out, _ = self.bilstm(self.dropout(outputs.last_hidden_state))
         emissions = self.classifier(lstm_out)
         mask = attention_mask.bool()
 
         if labels is not None:
             safe_labels = labels.clone()
-            safe_labels[labels == -100] =
+            safe_labels[labels == -100] = self.label2id['O']
             loss = -self.crf(emissions, safe_labels, mask=mask, reduction='mean')
             return {'loss': loss, 'logits': emissions}
         else:
             decoded = self.crf.decode(emissions, mask=mask)
-
+            max_len = input_ids.shape[1]
+            padded_decoded = [seq + [0] * (max_len - len(seq)) for seq in decoded]
+            logits = torch.tensor(padded_decoded, device=input_ids.device)
+            return {'logits': logits}
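To show how the renamed module is now driven entirely by a checkpoint config, here is a small self-contained sketch. The encoder id (aubmindlab/bert-base-arabertv02) and the five-tag BIO label set are placeholder assumptions; the Space itself loads asmashayea/absa-arabert with LoRA weights, as in load_araberta() above.

import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
from modeling_bilstm_crf import BERT_BiLSTM_CRF

labels = ["O", "B-POS", "I-POS", "B-NEG", "I-NEG"]  # placeholder tag set
model_id = "aubmindlab/bert-base-arabertv02"        # placeholder encoder

# label2id / id2label / num_labels ride along on the config object,
# which is exactly what BERT_BiLSTM_CRF.__init__ now reads.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={l: i for i, l in enumerate(labels)},
)
encoder = AutoModel.from_pretrained(model_id)
model = BERT_BiLSTM_CRF(encoder, config).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)
batch = tokenizer("مثال قصير", return_tensors="pt")  # "a short example"
with torch.no_grad():
    out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

# In eval mode forward() returns CRF-decoded label ids, padded to the
# full sequence length, under the 'logits' key.
print(out["logits"].shape)  # (1, seq_len)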