# NOTE: removed Hugging Face Spaces page chrome that was scraped along with
# this file ("Spaces: Running" banner, file-size line, commit-hash row, and
# the line-number gutter) — none of it is part of the Python module.
# Redirect Hugging Face caches into /tmp before transformers is imported,
# so model downloads succeed in read-only / container environments
# (e.g. Hugging Face Spaces).
import os
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"
import nltk
nltk.download("punkt", download_dir="/tmp/nltk_data")
# A custom download_dir is NOT on NLTK's default search path, so register it
# explicitly — otherwise sent_tokenize() below cannot find the punkt data.
if "/tmp/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/tmp/nltk_data")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import numpy as np
import logging
import re
# === Pipelines ===
# Model handles are created once at import time; the first run downloads
# weights into the /tmp caches configured above, so module import is slow.
summarizer = pipeline("summarization", model="google/pegasus-xsum")
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    tokenizer="distilbert-base-cased-distilled-squad"
)
# top_k=1 keeps only the single highest-scoring emotion label per input.
emotion_model = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=1)
# === Brief Summarization ===
def summarize_review(text, max_len=100, min_len=30):
    """Condense a review into a short abstractive summary.

    Falls back to returning the raw input text whenever the summarization
    pipeline raises or produces an unexpected payload.
    """
    try:
        output = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
        well_formed = bool(output) and isinstance(output, list) and "summary_text" in output[0]
        if not well_formed:
            logging.warning("Summarizer output malformed, falling back.")
            return text
        return output[0]["summary_text"]
    except Exception as exc:
        logging.warning(f"Fallback to raw text due to summarization error: {exc}")
        return text
# === Smart Summarization with Clustering ===
def smart_summarize(text, n_clusters=1):
    """Extractive summary: cluster sentences by TF-IDF with KMeans and keep
    the sentence closest to each cluster centroid, in original order.

    Returns the input unchanged when it has at most one sentence or when
    anything in the pipeline fails.
    """
    try:
        sents = sent_tokenize(text)
        if len(sents) <= 1:
            return text
        vectorizer = TfidfVectorizer(stop_words="english")
        matrix = vectorizer.fit_transform(sents)
        # Fewer sentences than clusters: nothing to select, keep them all.
        if len(sents) <= n_clusters:
            return " ".join(sents)
        model = KMeans(n_clusters=n_clusters, random_state=42).fit(matrix)
        picks = []
        for label in range(n_clusters):
            members = np.where(model.labels_ == label)[0]
            if len(members) == 0:
                continue
            # Cluster centroid in TF-IDF space; pick the member sentence
            # most similar to it as the representative.
            centroid = np.asarray(matrix[members].mean(axis=0))
            scores = cosine_similarity(centroid, matrix[members].toarray())
            picks.append(sents[members[np.argmax(scores)]])
        # Re-emit representatives in the order they appeared in the text.
        return " ".join(sorted(picks, key=sents.index))
    except Exception as exc:
        logging.error(f"Smart summarize error: {exc}")
        return text
# === Emotion Detection (Fixed) ===
def detect_emotion(text):
    """Classify the dominant emotion of *text* with the emotion pipeline.

    Returns the top label, or "neutral" for blank input, on pipeline
    failure, or when the output shape is unrecognized.

    NOTE(review): depending on transformers version and top_k handling,
    the text-classification pipeline may return either a flat list of
    dicts ([{...}]) or a nested per-input list ([[{...}]]); the original
    code silently returned "neutral" for the nested shape, so one level
    of batching is unwrapped here.
    """
    if not text.strip():
        return "neutral"
    try:
        result = emotion_model(text, top_k=1)
        # Unwrap one batching level if present: [[{...}]] -> [{...}]
        if isinstance(result, list) and result and isinstance(result[0], list):
            result = result[0]
        if isinstance(result, list) and result and isinstance(result[0], dict):
            return result[0].get("label", "neutral")
        if isinstance(result, dict) and "label" in result:
            return result["label"]
        return "neutral"
    except Exception as e:
        logging.warning(f"Emotion detection failed: {e}")
        return "neutral"
# === Follow-up Q&A ===
def answer_followup(text, question, verbosity="brief"):
    """Run extractive Q&A over *text*.

    *question* may be a single string or a list of strings; a list yields
    a list of answers (blank entries are skipped), a string yields one
    answer.  With verbosity == "detailed" each answer is prefixed with its
    question.  Returns an apology string on any pipeline error.
    """
    try:
        if not question:
            return "No question provided."
        if isinstance(question, list):
            collected = []
            for q in question:
                if not q.strip():
                    continue
                ans = qa_pipeline({"question": q, "context": text}).get("answer", "")
                collected.append(f"**{q}** → {ans}" if verbosity.lower() == "detailed" else ans)
            return collected
        ans = qa_pipeline({"question": question, "context": text}).get("answer", "")
        return f"**{question}** → {ans}" if verbosity.lower() == "detailed" else ans
    except Exception as e:
        logging.warning(f"Follow-up error: {e}")
        return "Sorry, I couldn't generate a follow-up answer."
# === Direct follow-up route handler ===
def answer_only(text, question):
    """Return the bare QA answer for *question* against *text*.

    Fixed fallback strings are returned when no question is supplied,
    when the pipeline yields no answer, or when it raises.
    """
    if not question:
        return "No question provided."
    try:
        response = qa_pipeline({"question": question, "context": text})
        return response.get("answer", "No answer found.")
    except Exception as e:
        logging.warning(f"Answer-only failed: {e}")
        return "Q&A failed."
# === Explanation Generator ===
def generate_explanation(text):
    """Wrap a short abstractive summary of *text* in an explanation
    sentence; return a fixed warning string on any failure."""
    try:
        pieces = summarizer(text, max_length=60, min_length=20, do_sample=False)
        summary = pieces[0]["summary_text"]
        return f"🧠 This review can be explained as: {summary}"
    except Exception as e:
        logging.warning(f"Explanation failed: {e}")
        return "⚠️ Explanation could not be generated."
# === Churn Risk Estimator ===
def assess_churn_risk(sentiment_label, emotion_label):
    """Flag churn risk from sentiment + emotion labels (case-insensitive).

    "High Risk" only when sentiment is negative AND the emotion is one of
    the distress labels; "Low Risk" otherwise.
    """
    distress = {"anger", "fear", "sadness", "frustrated"}
    if sentiment_label.lower() != "negative":
        return "Low Risk"
    return "High Risk" if emotion_label.lower() in distress else "Low Risk"
# === Pain Point Extractor ===
def extract_pain_points(text):
    """Scan *text* for known complaint keywords/phrases.

    Matching is case-insensitive and whole-word/whole-phrase (regex \\b
    boundaries, keywords escaped), so "slowly" does not hit "slow".
    Returns at most five hits in the fixed order of the keyword list.

    Fix: the previous implementation did list(set(matches))[:5], which made
    both ordering and (when >5 keywords matched) the surviving subset
    nondeterministic under string-hash randomization.  The keyword list has
    no duplicates and the comprehension already iterates it in order, so the
    set() round-trip is dropped.
    """
    common_issues = [
        "slow", "crash", "lag", "expensive", "confusing", "noisy", "poor", "rude",
        "unhelpful", "bug", "broken", "unresponsive", "not working", "error", "delay", "disconnect",
        "incomplete", "overpriced", "difficult", "conflict", "unclear", "inconsistent",
        "missing", "locked", "freeze", "freeze-up", "conflicting", "conflicting answers", "outdated"
    ]
    text_lower = text.lower()
    matches = [kw for kw in common_issues if re.search(rf"\b{re.escape(kw)}\b", text_lower)]
    return matches[:5]
# === Industry Detector ===
def detect_industry(text):
    """Guess the industry a review refers to via keyword lookup.

    Uses plain substring matching (so short keys like "med" also hit
    words such as "medium") and returns the label of the first rule that
    matches; "Generic" when none do.
    """
    lowered = text.lower()
    rules = [
        (("doctor", "hospital", "health", "pill", "med"), "Healthcare"),
        (("flight", "hotel", "trip", "booking"), "Travel"),
        (("bank", "loan", "credit", "payment"), "Banking"),
        (("gym", "trainer", "fitness", "workout"), "Fitness"),
        (("movie", "series", "stream", "video"), "Entertainment"),
        (("game", "gaming", "console"), "Gaming"),
        (("food", "delivery", "restaurant", "order"), "Food Delivery"),
        (("school", "university", "teacher", "course"), "Education"),
        (("insurance", "policy", "claim"), "Insurance"),
        (("property", "rent", "apartment", "house"), "Real Estate"),
        (("shop", "buy", "product", "phone", "amazon", "flipkart"), "E-commerce"),
    ]
    for keywords, label in rules:
        if any(k in lowered for k in keywords):
            return label
    return "Generic"
# === Product Category Detector ===
def detect_product_category(text):
    """Guess a coarse product category via substring keyword matching;
    the first matching rule wins, "General" otherwise."""
    lowered = text.lower()
    rules = [
        (("mobile", "smartphone", "iphone", "samsung", "phone"), "Mobile Devices"),
        (("laptop", "macbook", "notebook", "chromebook"), "Laptops"),
        (("tv", "refrigerator", "microwave", "washer"), "Home Appliances"),
        (("watch", "band", "fitbit", "wearable"), "Wearables"),
        (("app", "portal", "site", "website"), "Web App"),
    ]
    for keywords, label in rules:
        if any(k in lowered for k in keywords):
            return label
    return "General"