File size: 5,084 Bytes
a4b33d8
b3b327d
138ec98
 
b3b327d
 
a4b33d8
c2ae4ec
 
 
 
e91e5d5
a4b33d8
c2ae4ec
fbdaedd
 
 
c2ae4ec
fbdaedd
 
 
 
 
 
 
c2ae4ec
 
fbdaedd
c2ae4ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbdaedd
 
 
 
 
 
c2ae4ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbdaedd
c2ae4ec
 
 
fbdaedd
c2ae4ec
 
 
fbdaedd
 
c2ae4ec
 
 
e91e5d5
c2ae4ec
fbdaedd
 
c2ae4ec
 
 
 
 
 
 
 
 
 
 
fbdaedd
c2ae4ec
 
 
 
 
 
e91e5d5
c2ae4ec
 
 
 
 
 
e91e5d5
c2ae4ec
 
e91e5d5
c2ae4ec
fbdaedd
c2ae4ec
fbdaedd
c2ae4ec
fbdaedd
 
 
c2ae4ec
fbdaedd
e91e5d5
c2ae4ec
 
fbdaedd
c2ae4ec
 
fbdaedd
 
e91e5d5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import gradio as gr
import torch
from transformers import DebertaV2Model, DebertaV2Config, AutoTokenizer, PreTrainedModel
from transformers.models.deberta.modeling_deberta import ContextPooler
from transformers import pipeline
import torch.nn as nn

# Model cards and thresholds
# Hub identifier used by load_models() to fetch the tokenizer shared by both models.
BASE_MODEL = "microsoft/mdeberta-v3-base"
# Checkpoint that load_models() pairs with sentiment_dim=3 (text + three sentiment scores).
SENT_SUBJ_MODEL = "MatteoFasulo/mdeberta-v3-base-subjectivity-sentiment-multilingual-no-arabic"
# Checkpoint that load_models() pairs with sentiment_dim=0 (text only).
SUBJ_ONLY_MODEL = "MatteoFasulo/mdeberta-v3-base-subjectivity-multilingual-no-arabic"
# NOTE(review): THRESHOLD is not referenced anywhere in the visible part of this file;
# presumably a decision cut-off for the SUBJ probability - confirm or remove.
THRESHOLD = 0.65

# Custom model for subjectivity (+ optional sentiment features)
class CustomModel(PreTrainedModel):
    """DeBERTa-v2 encoder with a pooled linear classification head.

    The head's input width is the pooler output width plus ``sentiment_dim``,
    so extra sentiment scores can be concatenated to the pooled text
    representation before classification.
    """

    config_class = DebertaV2Config

    def __init__(self, config, sentiment_dim=0, num_labels=2, *args, **kwargs):
        """Create the encoder, pooler, dropout and classifier layers.

        Args:
            config: Configuration handed to the encoder and the pooler.
            sentiment_dim: Number of extra feature columns appended to the
                pooled output before the classifier.
            num_labels: Number of output logits.
        """
        super().__init__(config, *args, **kwargs)
        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        self.dropout = nn.Dropout(0.1)
        # The classifier consumes pooled text features plus any sentiment columns.
        classifier_inputs = self.pooler.output_dim + sentiment_dim
        self.classifier = nn.Linear(classifier_inputs, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                positive=None, neutral=None, negative=None):
        """Return classification logits (no softmax is applied here).

        ``token_type_ids`` is accepted but not forwarded to the encoder.
        The three sentiment tensors are used only when all of them are given;
        they are stacked along dim 1 and concatenated to the pooled output
        (presumably each has shape ``(batch,)`` - confirm against callers).
        """
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        features = self.pooler(encoder_out[0])

        sentiment = (positive, neutral, negative)
        if all(score is not None for score in sentiment):
            features = torch.cat((features, torch.stack(sentiment, dim=1)), dim=1)

        return self.classifier(self.dropout(features))

# Load tokenizer and model helper
def _load_subjectivity_model(checkpoint, sentiment_dim):
    """Load one fine-tuned ``CustomModel`` checkpoint with the OBJ/SUBJ label map.

    Args:
        checkpoint: Hub identifier (or path) of the fine-tuned weights.
        sentiment_dim: Number of sentiment feature columns the classifier head
            of this checkpoint expects in addition to the pooled text features.

    Returns:
        The loaded model, switched to evaluation mode.
    """
    config = DebertaV2Config.from_pretrained(
        checkpoint,
        num_labels=2,
        id2label={0: 'OBJ', 1: 'SUBJ'},
        label2id={'OBJ': 0, 'SUBJ': 1},
        output_attentions=False,
        output_hidden_states=False,
    )
    # from_pretrained is a classmethod: it always constructs a fresh model, so
    # the config and the extra __init__ argument must be passed through it.
    # Building an instance first and calling from_pretrained on it would drop
    # both and fall back to sentiment_dim=0.
    model = CustomModel.from_pretrained(
        checkpoint,
        config=config,
        sentiment_dim=sentiment_dim,
    )
    model.eval()
    return model


def load_models():
    """Load the shared tokenizer and both subjectivity models.

    Returns:
        A ``(tokenizer, sentiment_subjectivity_model, subjectivity_only_model)``
        tuple. The first model is built with three sentiment feature columns,
        the second with none.
    """
    # Tokenizer shared
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    # Sentiment+Subjectivity model
    model1 = _load_subjectivity_model(SENT_SUBJ_MODEL, sentiment_dim=3)
    # Subjectivity-only model
    model2 = _load_subjectivity_model(SUBJ_ONLY_MODEL, sentiment_dim=0)
    return tokenizer, model1, model2

# Sentiment pipeline
# Created once at module import; the same checkpoint name is used for both the
# model and its tokenizer. get_sentiment_scores() is the only caller in this file.
# NOTE(review): top_k=None is expected (per the transformers pipeline docs) to
# return a score for every label rather than only the best one - confirm
# against the installed transformers version.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    top_k=None
)

def get_sentiment_scores(text: str) -> dict:
    """Return the sentiment scores for ``text`` keyed by lower-cased label.

    Args:
        text: The sentence to score with the module-level sentiment pipeline.

    Returns:
        A dict such as ``{'positive': 0.1, 'neutral': 0.8, 'negative': 0.1}``.
    """
    results = sentiment_pipe(text)
    # For a single string the pipeline may wrap the per-label list in an outer
    # list; unwrap it so both shapes are handled.
    if results and isinstance(results[0], list):
        results = results[0]
    # Each entry is {'label': <name>, 'score': <float>}; index by those keys
    # rather than by dict position. Labels are lower-cased so lookups do not
    # depend on the capitalisation stored in the checkpoint config.
    return {entry['label'].lower(): entry['score'] for entry in results}

# Prediction function
# Caches models on first call
# Module-level slots for the tokenizer and the two models. They start as None;
# predict_subjectivity() fills all three from load_models() when it finds
# tokenizer is still None, so later calls reuse the loaded objects.
tokenizer, model_sent_subj, model_subj_only = None, None, None

def predict_subjectivity(text):
    """Score ``text`` with both subjectivity models and format a text report.

    The tokenizer and models are loaded on the first non-empty call and kept
    in module-level globals for later calls.

    Args:
        text: The sentence to analyse.

    Returns:
        A multi-line string with the three sentiment scores followed by the
        OBJ/SUBJ probabilities of the sentiment-aware and the text-only model.
        For ``None``, empty or whitespace-only input a short prompt is
        returned instead and no model is loaded or run.
    """
    global tokenizer, model_sent_subj, model_subj_only

    # Nothing to classify: answer immediately instead of running three models
    # on an empty string.
    if text is None or not text.strip():
        return "Please enter a sentence to analyse."

    if tokenizer is None:
        tokenizer, model_sent_subj, model_subj_only = load_models()

    # Tokenize input
    inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs.get('attention_mask')

    sent_scores = get_sentiment_scores(text)
    pos, neu, neg = sent_scores['positive'], sent_scores['neutral'], sent_scores['negative']

    # Pure inference: skip autograd bookkeeping for both forward passes.
    with torch.no_grad():
        # Sentiment + subjectivity model inference
        logits1 = model_sent_subj(
            input_ids=input_ids,
            attention_mask=attention_mask,
            positive=torch.tensor([pos]),
            neutral=torch.tensor([neu]),
            negative=torch.tensor([neg]),
        )
        # Subjectivity-only model inference
        logits2 = model_subj_only(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

    # Row 0 is the single input sentence; convert to plain floats for formatting.
    obj1, subj1 = torch.softmax(logits1, dim=1)[0].tolist()
    obj2, subj2 = torch.softmax(logits2, dim=1)[0].tolist()

    # Formatting
    output = [
        "Sentiment Scores (sent-subj model):",
        f"- Positive: {pos:.2%}",
        f"- Neutral: {neu:.2%}",
        f"- Negative: {neg:.2%}\n",
        f"Subjectivity (with sentiment) - OBJ: {obj1:.2%}, SUBJ: {subj1:.2%}",
        f"Subjectivity (text only)   - OBJ: {obj2:.2%}, SUBJ: {subj2:.2%}",
    ]
    return "\n".join(output)

# Build Gradio interface
# The two text boxes are created first (input, then output) and handed to the
# Interface, which wires them to predict_subjectivity.
_input_box = gr.Textbox(
    label='Input sentence',
    placeholder='Enter a sentence from a news article',
    info='Paste a sentence from a news article to determine subjectivity',
)
_results_box = gr.Textbox(
    label='Results',
    info='Sentiment & dual-model subjectivity probabilities',
)

demo = gr.Interface(
    fn=predict_subjectivity,
    inputs=_input_box,
    outputs=_results_box,
    title='Dual-Model Subjectivity Detection',
    description='Outputs sentiment scores and class probabilities from two subjectivity models.',
)

# Start the web app as soon as this module is executed.
demo.launch()