Spaces:

MatteoFasulo
/

SubjectivityDetection

Running

App Files Files Community

MatteoFasulo committed 24 days ago

Commit

1132a05

verified ·

1 Parent(s): 79352ba

Update with examples and cleaner logic

Browse files

Files changed (1) hide show

app.py +127 -68

app.py CHANGED Viewed

@@ -2,80 +2,132 @@ import gradio as gr
 import torch
 from transformers import DebertaV2Model, DebertaV2Config, AutoTokenizer, PreTrainedModel
 from transformers.models.deberta.modeling_deberta import ContextPooler
-from transformers import pipeline
 import torch.nn as nn
-# -- Model definitions
-BASE_MODEL = "microsoft/mdeberta-v3-base"
-SENT_SUBJ_MODEL = "MatteoFasulo/mdeberta-v3-base-subjectivity-sentiment-multilingual-no-arabic"
-SUBJ_ONLY_MODEL = "MatteoFasulo/mdeberta-v3-base-subjectivity-multilingual-no-arabic"
-# -- Custom model builder
-from functools import partial
-def build_custom_model(sentiment_dim=0):
-    class CustomModel(PreTrainedModel):
-        config_class = DebertaV2Config
-        def __init__(self, config, *args, **kwargs):
-            super().__init__(config, *args, **kwargs)
-            self.deberta = DebertaV2Model(config)
-            self.pooler = ContextPooler(config)
-            self.dropout = nn.Dropout(0.1)
-            hidden_dim = self.pooler.output_dim + sentiment_dim
-            self.classifier = nn.Linear(hidden_dim, config.num_labels)
-        def forward(self, input_ids, attention_mask=None, **sent_kwargs):
-            x = self.deberta(input_ids=input_ids, attention_mask=attention_mask)[0]
-            pooled = self.pooler(x)
-            if sentiment_dim:
-                sent_feats = torch.stack((sent_kwargs['positive'], sent_kwargs['neutral'], sent_kwargs['negative']), dim=1)
-                pooled = torch.cat((pooled, sent_feats), dim=1)
-            return self.classifier(self.dropout(pooled))
-    return CustomModel
-# -- Load models and tokenizer
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-# sentiment+subjectivity
-cfg1 = DebertaV2Config.from_pretrained(SENT_SUBJ_MODEL, num_labels=2, id2label={0:'OBJ',1:'SUBJ'}, label2id={'OBJ':0,'SUBJ':1})
-Model1Cls = build_custom_model(sentiment_dim=3)
-model1 = Model1Cls.from_pretrained(SENT_SUBJ_MODEL, config=cfg1, ignore_mismatched_sizes=True)
-# subjectivity-only
-cfg2 = DebertaV2Config.from_pretrained(SUBJ_ONLY_MODEL, num_labels=2, id2label={0:'OBJ',1:'SUBJ'}, label2id={'OBJ':0,'SUBJ':1})
-Model2Cls = build_custom_model(sentiment_dim=0)
-model2 = Model2Cls.from_pretrained(SUBJ_ONLY_MODEL, config=cfg2)
-# sentiment pipeline
-sentiment_pipe = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment", tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment", top_k=None)
-def get_sentiment_scores(text):
-    out = sentiment_pipe(text)[0]
-    return {list(d.keys())[0]: list(d.values())[0] for d in out}
-# -- Prediction logic
 def analyze(text):
     # Tokenize
-    inputs = tokenizer(text, truncation=True, padding=True, max_length=256, return_tensors='pt')
-    # Sentiment
-    scores = get_sentiment_scores(text)
-    pos, neu, neg = scores['positive'], scores['neutral'], scores['negative']
-    # Model1
-    logits1 = model1(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, positive=torch.tensor([pos]), neutral=torch.tensor([neu]), negative=torch.tensor([neg]))
-    p1 = torch.softmax(logits1, dim=1)[0]
-    # Model2
-    logits2 = model2(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
-    p2 = torch.softmax(logits2, dim=1)[0]
-    # Build results
     return {
-        'Positive': f"{pos:.2%}", 'Neutral': f"{neu:.2%}", 'Negative': f"{neg:.2%}",
         'Sent-Subj OBJ': f"{p1[0]:.2%}", 'Sent-Subj SUBJ': f"{p1[1]:.2%}",
         'TextOnly OBJ': f"{p2[0]:.2%}", 'TextOnly SUBJ': f"{p2[1]:.2%}"
     }
-# -- Build Gradio Dashboard with Blocks
-theme = gr.themes.Soft()
-with gr.Blocks(theme=theme, css="""
 #result_table td { padding: 8px; font-size: 1rem; }
 #header { text-align: center; font-size: 2rem; font-weight: bold; margin-bottom: 10px; }
 """) as demo:
@@ -90,9 +142,16 @@ with gr.Blocks(theme=theme, css="""
             table = gr.Dataframe(headers=["Metric", "Value"], datatype=["str","str"], interactive=False, elem_id="result_table")
         with gr.TabItem("About ℹ️"):
             gr.Markdown("This dashboard uses two DeBERTa-based models (with and without sentiment integration) to detect subjectivity, alongside sentiment scores from an XLM-RoBERTa model.")
-            gr.Markdown("**Threshold** for subjective classification is adjustable in code (default: 0.65). Feel free to fork and customize! 🚀")
     # Link inputs to outputs
     btn.click(fn=analyze, inputs=txt, outputs=[chart, table])
-# -- Launch
-demo.queue().launch(server_name="0.0.0.0", share=True)

+from functools import lru_cache
 import torch
 from transformers import DebertaV2Model, DebertaV2Config, AutoTokenizer, PreTrainedModel
 from transformers.models.deberta.modeling_deberta import ContextPooler
+from transformers import pipeline, AutoModelForSequenceClassification
 import torch.nn as nn
+# Hub identifiers: the base checkpoint (used to load the tokenizer) and the two subjectivity classifiers
+model_card = "microsoft/mdeberta-v3-base"
+subjectivity_only_model = "MatteoFasulo/mdeberta-v3-base-subjectivity-multilingual-no-arabic"
+sentiment_model = "MatteoFasulo/mdeberta-v3-base-subjectivity-sentiment-multilingual-no-arabic"
+# Placeholder sample inputs shown in the Gradio interface; each inner list is one single-column example row
+examples = [["Example1"], ["Example2"], ["Example3"]]
+# DeBERTa-v2 encoder whose pooled output is concatenated with three sentiment scores before classification
+class CustomModel(PreTrainedModel):
+    """Subjectivity classifier over pooled DeBERTa-v2 features plus sentiment scores.
+
+    The head is a single linear layer applied to the concatenation of the
+    context-pooled encoder output and the stacked ``positive`` / ``neutral`` /
+    ``negative`` inputs.
+    """
+    config_class = DebertaV2Config
+    def __init__(self, config, sentiment_dim=3, num_labels=2, *args, **kwargs):
+        """Build the encoder, pooler, dropout and linear head.
+
+        Args:
+            config: Configuration forwarded to ``PreTrainedModel``, ``DebertaV2Model`` and ``ContextPooler``.
+            sentiment_dim: Number of sentiment features appended to the pooled vector.
+            num_labels: Output size of the linear head.
+        """
+        super().__init__(config, *args, **kwargs)
+        self.deberta = DebertaV2Model(config)
+        self.pooler = ContextPooler(config)
+        self.dropout = nn.Dropout(0.1)
+        # Head input = pooled encoder features followed by the sentiment features
+        head_in_features = self.pooler.output_dim + sentiment_dim
+        self.classifier = nn.Linear(head_in_features, num_labels)
+    def forward(self, input_ids, positive, neutral, negative, token_type_ids=None, attention_mask=None, labels=None):
+        """Return ``{'logits': tensor}`` for the given batch.
+
+        ``token_type_ids`` and ``labels`` are accepted but not used by this method.
+        """
+        hidden_states = self.deberta(input_ids=input_ids, attention_mask=attention_mask)[0]
+        pooled = self.pooler(hidden_states)
+        # Stack the three scores along dim 1 (assumes each is a 1-D tensor of length batch_size -> (batch_size, 3))
+        sentiment_scores = torch.stack((positive, neutral, negative), dim=1)
+        # Append the sentiment scores to the pooled encoder features, then classify
+        features = torch.cat((pooled, sentiment_scores), dim=1)
+        return {'logits': self.classifier(self.dropout(features))}
+# Load the pre-trained tokenizer (memoized: repeated calls with the same name reuse one instance)
+@lru_cache(maxsize=None)
+def load_tokenizer(model_name: str):
+    """Return the tokenizer for ``model_name``, loading it only on the first call.
+
+    ``analyze`` calls this on every request; caching avoids re-instantiating
+    the tokenizer each time.
+    """
+    return AutoTokenizer.from_pretrained(model_name)
+# Load the pre-trained model (memoized: each model name is loaded once and then reused)
+@lru_cache(maxsize=None)
+def load_model(model_name: str):
+    """Return the subjectivity classifier stored under ``model_name``.
+
+    Names containing ``'sentiment'`` are loaded as ``CustomModel`` (text plus
+    sentiment features); any other name is loaded through
+    ``AutoModelForSequenceClassification``. Both are configured with two
+    labels, ``OBJ`` (0) and ``SUBJ`` (1).
+
+    ``analyze`` calls this on every request, so results are cached per name.
+    """
+    label_settings = dict(
+        num_labels=2,
+        id2label={0: 'OBJ', 1: 'SUBJ'},
+        label2id={'OBJ': 0, 'SUBJ': 1},
+        output_attentions=False,
+        output_hidden_states=False,
+    )
+    if 'sentiment' in model_name:
+        config = DebertaV2Config.from_pretrained(model_name, **label_settings)
+        # from_pretrained is a classmethod: call it on the class, not on a freshly
+        # built (randomly initialised) instance, and pass the config prepared above
+        # so the label settings are actually applied. CustomModel's own defaults
+        # (sentiment_dim=3, num_labels=2) are used when it is instantiated.
+        return CustomModel.from_pretrained(model_name, config=config)
+    return AutoModelForSequenceClassification.from_pretrained(model_name, **label_settings)
+# Build the sentiment pipeline once and reuse it across calls
+@lru_cache(maxsize=None)
+def _load_sentiment_pipeline():
+    """Return the shared ``cardiffnlp/twitter-xlm-roberta-base-sentiment`` pipeline.
+
+    ``top_k=None`` is set at construction so every label's score is returned.
+    """
+    checkpoint = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
+    return pipeline("sentiment-analysis", model=checkpoint, tokenizer=checkpoint, top_k=None)
+# Get sentiment values using a pre-trained sentiment analysis model
+def get_sentiment_values(text: str):
+    """Return a ``{label: score}`` dict with the sentiment scores for ``text``.
+
+    ``analyze`` reads the ``'positive'``, ``'neutral'`` and ``'negative'`` keys
+    of the result.
+    """
+    # As in the original call, the first element holds the per-label entries for the single input
+    predictions = _load_sentiment_pipeline()(text)[0]
+    # Read the fields by key instead of relying on the order of each entry's values
+    return {prediction['label']: prediction['score'] for prediction in predictions}
+# Score one text with both subjectivity models and report the sentiment scores alongside
 def analyze(text):
+    """Classify ``text`` with both models and return formatted percentages.
+
+    Returns:
+        A dict of strings formatted with ``.2%``: the three sentiment scores
+        (``Positive``, ``Neutral``, ``Negative``) and the OBJ/SUBJ probabilities
+        of the sentiment-aware model (``Sent-Subj ...``) and of the text-only
+        model (``TextOnly ...``).
+    """
+    # Extract sentiment values
+    sentiment_values = get_sentiment_values(text)
+    positive = sentiment_values['positive']
+    neutral = sentiment_values['neutral']
+    negative = sentiment_values['negative']
+    # Load the tokenizer and models. The local model names must differ from the
+    # module-level `sentiment_model` string: assigning to that name here would make
+    # it local to this function and raise UnboundLocalError when it is read.
+    tokenizer = load_tokenizer(model_card)
+    sentiment_subjectivity_model = load_model(sentiment_model)
+    subjectivity_model = load_model(subjectivity_only_model)
     # Tokenize
+    inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors='pt')
+    # Sentiment scores as one-element tensors, kept separate from the tokenizer
+    # output so they are passed only to the sentiment-aware model
+    sentiment_inputs = {
+        'positive': torch.tensor(positive).unsqueeze(0),
+        'neutral': torch.tensor(neutral).unsqueeze(0),
+        'negative': torch.tensor(negative).unsqueeze(0),
+    }
+    # Inference only, so skip gradient tracking
+    with torch.no_grad():
+        # Get the sentiment model outputs
+        outputs1 = sentiment_subjectivity_model(**inputs, **sentiment_inputs)
+        logits1 = outputs1.get('logits')
+        # Get the subjectivity model outputs: the text-only model receives just the
+        # tokenizer output, since a standard sequence-classification forward() does
+        # not accept positive/neutral/negative keyword arguments
+        outputs2 = subjectivity_model(**inputs)
+        logits2 = outputs2.get('logits')
+    # Calculate probabilities using softmax
+    p1 = torch.nn.functional.softmax(logits1, dim=1)[0]
+    p2 = torch.nn.functional.softmax(logits2, dim=1)[0]
+    # Format the output
     return {
+        'Positive': f"{positive:.2%}", 'Neutral': f"{neutral:.2%}", 'Negative': f"{negative:.2%}",
         'Sent-Subj OBJ': f"{p1[0]:.2%}", 'Sent-Subj SUBJ': f"{p1[1]:.2%}",
         'TextOnly OBJ': f"{p2[0]:.2%}", 'TextOnly SUBJ': f"{p2[1]:.2%}"
     }
+# Build the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(), css="""
 #result_table td { padding: 8px; font-size: 1rem; }
 #header { text-align: center; font-size: 2rem; font-weight: bold; margin-bottom: 10px; }
 """) as demo:
             table = gr.Dataframe(headers=["Metric", "Value"], datatype=["str","str"], interactive=False, elem_id="result_table")
         with gr.TabItem("About ℹ️"):
             gr.Markdown("This dashboard uses two DeBERTa-based models (with and without sentiment integration) to detect subjectivity, alongside sentiment scores from an XLM-RoBERTa model.")
+    with gr.Row():
+        gr.Markdown("### Examples:")
+        # Caching example outputs requires the function to run and the components
+        # that receive its result, so wire the same fn/outputs as the button below
+        gr.Examples(
+            examples=examples,
+            inputs=txt,
+            outputs=[chart, table],
+            fn=analyze,
+            label="Examples",
+            elem_id="example_list",
+            cache_examples=True,
+        )
     # Link inputs to outputs
     btn.click(fn=analyze, inputs=txt, outputs=[chart, table])
+demo.queue().launch(server_name="0.0.0.0", share=True)