File size: 4,516 Bytes

e26d346

"""

Sentiment Analysis Model Pipeline for Hugging Face Hub

"""

import pickle
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from typing import Dict, List, Union

class SentimentClassifier:
    """Sentiment classification model.

    Wraps a fitted estimator that exposes ``predict_proba`` over a list of
    strings. The estimator is not created here; callers assign it to
    ``model`` after construction.
    """

    def __init__(self):
        # Fitted estimator; stays None until a caller assigns one.
        self.model = None
        # Default label names, matched positionally against the columns of
        # predict_proba when the model publishes no string labels itself.
        self.classes = ['negative', 'neutral', 'positive']

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess text.

        Lower-cases *text*, removes every character that is neither an ASCII
        letter nor whitespace, then collapses whitespace runs to one space
        and strips both ends.
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _class_labels(self) -> list:
        """Return the label names that correspond to predict_proba columns."""
        model_classes = getattr(self.model, "classes_", None)
        if model_classes is not None:
            labels = list(model_classes)
            # Trust the model's own ordering only when it carries string
            # labels; numeric labels (e.g. 0/1/2) keep the positional mapping
            # onto self.classes so such models behave as they did before.
            if labels and all(isinstance(label, str) for label in labels):
                return [str(label) for label in labels]
        return self.classes

    def predict(self, text: str) -> Dict:
        """Predict sentiment of a single text.

        Returns a dict with the keys ``text`` (the original input),
        ``prediction`` (label name), ``confidence`` (probability of the
        predicted label) and ``probabilities`` (label name -> probability).
        When no model has been assigned, ``prediction`` is ``"error"`` and a
        fixed placeholder distribution is returned instead of raising.
        """
        if self.model is None:
            return {
                "text": text,
                "prediction": "error",
                "confidence": 0.0,
                "probabilities": {"positive": 0.33, "negative": 0.33, "neutral": 0.34}
            }

        processed_text = self.preprocess_text(text)
        probabilities = self.model.predict_proba([processed_text])[0]
        # Column order of predict_proba follows the model's classes_, which is
        # not guaranteed to equal self.classes for an externally loaded model.
        labels = self._class_labels()
        prediction_idx = int(np.argmax(probabilities))

        return {
            "text": text,
            "prediction": labels[prediction_idx],
            "confidence": float(probabilities[prediction_idx]),
            "probabilities": {
                class_name: float(prob)
                for class_name, prob in zip(labels, probabilities)
            }
        }

# Pipeline function for Hugging Face
def pipeline(task: str, model=None, **kwargs):
    """Build the pipeline object for the requested task.

    Only ``"text-classification"`` is handled; for it a
    ``SentimentAnalysisPipeline`` constructed with *model* is returned.
    Any other task name raises ``ValueError``. Extra keyword arguments are
    accepted but not used.
    """
    # Guard clause: reject everything except the single supported task.
    if task != "text-classification":
        raise ValueError(f"Task {task} not supported")
    return SentimentAnalysisPipeline(model)

class SentimentAnalysisPipeline:
    """Pipeline for sentiment analysis.

    Calling an instance with a string returns one prediction dict; calling
    it with an iterable of strings returns a list of prediction dicts, as
    produced by ``SentimentClassifier.predict``.
    """

    def __init__(self, model=None):
        """Create the pipeline and attach a model to its classifier.

        Args:
            model: Optional fitted estimator exposing ``predict_proba``. When
                given, it is used directly. Any other value (``None``, a
                model name string, ...) is ignored and the model is loaded
                from disk, falling back to a small built-in model.
        """
        self.classifier = SentimentClassifier()
        if hasattr(model, "predict_proba"):
            # An estimator-like object was supplied: honour it rather than
            # silently discarding the argument.
            self.classifier.model = model
        else:
            # Load the trained model
            self._load_model()

    def _load_model(self):
        """Load ``model.pkl`` via joblib, or build the fallback model.

        Loading is best-effort: any ordinary failure (joblib missing, file
        absent, unreadable pickle) is logged and the fallback model is used.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught.
        """
        import logging

        try:
            # Try to load from model files
            import joblib
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code. Only use a model.pkl from a trusted source.
            self.classifier.model = joblib.load("model.pkl")
        except Exception as exc:
            logging.getLogger(__name__).warning(
                "Could not load model.pkl (%s); using the fallback model.", exc
            )
            # Fallback: create a simple model
            self._create_fallback_model()

    def _create_fallback_model(self):
        """Fit a small TF-IDF + multinomial naive Bayes model on sample data.

        The result is assigned to ``self.classifier.model``. It is trained on
        the handful of sentences below only.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.pipeline import Pipeline

        # Sample training data, kept as (text, label) pairs so a sentence and
        # its label cannot drift out of alignment.
        samples = [
            ("I love this product!", "positive"),
            ("This is terrible.", "negative"),
            ("It's okay, nothing special.", "neutral"),
            ("Amazing quality!", "positive"),
            ("Worst experience ever.", "negative"),
            ("Pretty good overall.", "neutral"),
            ("Absolutely fantastic!", "positive"),
            ("Completely disappointed.", "negative"),
            ("Average product.", "neutral"),
            ("Excellent service!", "positive"),
            ("Terrible customer support.", "negative"),
            ("Decent enough.", "neutral"),
            ("Outstanding quality!", "positive"),
            ("Completely useless.", "negative"),
            ("It's fine, I guess.", "neutral"),
            ("Best purchase ever!", "positive"),
            ("Waste of money.", "negative"),
            ("Nothing special.", "neutral"),
            ("Highly recommended!", "positive"),
            ("Would not buy again.", "negative"),
            ("Average at best.", "neutral"),
        ]
        # Clean the samples with the same routine predict() applies, so that
        # training and inference see text in the same form.
        texts = [self.classifier.preprocess_text(text) for text, _ in samples]
        labels = [label for _, label in samples]

        fallback = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))),
            ('classifier', MultinomialNB())
        ])
        fallback.fit(texts, labels)
        self.classifier.model = fallback

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """Process inputs.

        A single string yields one prediction dict; any other iterable of
        strings yields a list with one prediction dict per element. Extra
        keyword arguments are accepted but not used.
        """
        if isinstance(inputs, str):
            return self.classifier.predict(inputs)
        return [self.classifier.predict(text) for text in inputs]

# For compatibility with transformers
def sentiment_analysis_pipeline(model=None, **kwargs):
    """Return a new ``SentimentAnalysisPipeline`` constructed with *model*.

    Additional keyword arguments are accepted but not used.
    """
    sentiment_pipeline = SentimentAnalysisPipeline(model)
    return sentiment_pipeline