import os

os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"
# NLTK_DATA must be set *before* importing nltk, or nltk.data.path won't include it.
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

import nltk

nltk.download("punkt", download_dir="/tmp/nltk_data")
# NLTK >= 3.8.2 looks up the "punkt_tab" resource instead of "punkt".
nltk.download("punkt_tab", download_dir="/tmp/nltk_data")

from typing import List, Optional

from pydantic import BaseModel
from transformers import pipeline

# ✅ Extra: Smart Summarization Imports
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
sentiment_analyzer = pipeline("sentiment-analysis")


# 🧠 Basic Summarization (Abstractive)
def summarize_review(text):
    return summarizer(
        text, max_length=60, min_length=10, do_sample=False, no_repeat_ngram_size=3
    )[0]["summary_text"]


# 🧠 Smart Summarization (Clustered Key Sentences)
def smart_summarize(text, n_clusters=1):
    """Extractive summarization: cluster TF-IDF sentence vectors with KMeans,
    then keep the most central sentence from each cluster."""
    # ✅ Use sent_tokenize, which loads the pre-trained Punkt model;
    # instantiating PunktSentenceTokenizer() directly yields an *untrained* tokenizer.
    sentences = sent_tokenize(text)
    if len(sentences) <= 1:
        return text
    if len(sentences) <= n_clusters:
        return " ".join(sentences)
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    # Explicit n_init keeps behavior stable across scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(tfidf_matrix)
    summary_sentences = []
    for i in range(n_clusters):
        idx = np.where(kmeans.labels_ == i)[0]
        if len(idx) == 0:
            continue
        # Cluster centroid; np.asarray converts the np.matrix returned by
        # sparse .mean() into a (1, n_features) ndarray.
        centroid = np.asarray(tfidf_matrix[idx].mean(axis=0))
        # The sentence closest to the centroid represents the cluster.
        sim = cosine_similarity(centroid, tfidf_matrix[idx])
        most_representative_idx = idx[np.argmax(sim)]
        summary_sentences.append(sentences[most_representative_idx])
    # Restore the sentences' original order in the text.
    return " ".join(sorted(summary_sentences, key=sentences.index))


# 📊 Sentiment Detection
def analyze_sentiment(text):
    result = sentiment_analyzer(text)[0]
    label = result["label"]
    score = result["score"]
    # Map star-rating labels (e.g. "3 stars" from multilingual star-rating
    # models) onto NEGATIVE/NEUTRAL/POSITIVE.
    if "star" in label.lower():
        stars = int(label[0])
        if stars <= 2:
            label = "NEGATIVE"
        elif stars == 3:
            label = "NEUTRAL"
        else:
            label = "POSITIVE"
    return {"label": label, "score": score}


# 🔥 Emotion Detection (heuristic-based)
def detect_emotion(text):
    text_lower = text.lower()
    if "angry" in text_lower or "hate" in text_lower:
        return "anger"
    elif "happy" in text_lower or "love" in text_lower:
        return "joy"
    elif "sad" in text_lower or "disappointed" in text_lower:
        return "sadness"
    elif "confused" in text_lower or "unclear" in text_lower:
        return "confusion"
    else:
        return "neutral"


# 🧩 Aspect-Based Sentiment (mock)
def extract_aspect_sentiment(text, aspects: List[str]):
    """Placeholder aspect-level sentiment: a keyword check, not a real model."""
    results = {}
    text_lower = text.lower()
    for asp in aspects:
        label = "positive" if asp in text_lower and "not" not in text_lower else "neutral"
        results[asp] = {"label": label, "confidence": 0.85}
    return results


# ✅ Pydantic Schemas for FastAPI
class ReviewInput(BaseModel):
    text: str
    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
    industry: str = "Generic"
    aspects: bool = False
    follow_up: Optional[str] = None
    product_category: Optional[str] = None
    device: Optional[str] = None


class BulkReviewInput(BaseModel):
    reviews: List[str]
    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
    industry: str = "Generic"
    aspects: bool = False
    product_category: Optional[str] = None
    device: Optional[str] = None


class TranslationInput(BaseModel):
    text: str
    target_lang: str = "fr"


class ChatInput(BaseModel):
    question: str
    context: str
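

# Minimal smoke-test sketch (an assumption, not part of the service: the sample
# review text and n_clusters=2 are illustrative only). The FastAPI routes that
# consume the schemas above are expected to live in a separate module.
if __name__ == "__main__":
    sample = (
        "I love the battery life on this phone. The camera is sharp in daylight. "
        "However, the screen scratches easily and customer support was unclear."
    )
    print("Summary:", summarize_review(sample))
    print("Smart summary:", smart_summarize(sample, n_clusters=2))
    print("Sentiment:", analyze_sentiment(sample))
    print("Emotion:", detect_emotion(sample))
    print("Aspects:", extract_aspect_sentiment(sample, ["battery", "camera", "screen"]))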