import os
# Point the Hugging Face caches at /tmp, which is writable on Spaces.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"
import nltk
# Fetch the sentence tokenizer; a custom download_dir is not on NLTK's
# default search path, so register it explicitly before sent_tokenize runs.
nltk.download("punkt", download_dir="/tmp/nltk_data")
nltk.data.path.append("/tmp/nltk_data")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import numpy as np
import logging
# === Pipelines ===
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
emotion_pipeline = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion", top_k=1)
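# Call shapes the helpers below rely on (per the transformers pipeline docs;
# exact fields can vary across versions):
#   summarizer(text)                              -> [{"summary_text": "..."}]
#   qa_pipeline({"question": q, "context": ctx})  -> {"answer": "...", "score": ..., "start": ..., "end": ...}
#   emotion_pipeline(text)                        -> [{"label": "...", "score": ...}]  (top_k=1)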
# === Brief Summarization ===
def summarize_review(text, max_len=80, min_len=20):
    try:
        # Very long inputs can exceed the model's ~1024-token limit and
        # raise, in which case the raw text is returned as a fallback.
        return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    except Exception as e:
        logging.warning(f"Summarization fallback used: {e}")
        return text
# === Smart Summarization with Clustering ===
def smart_summarize(text, n_clusters=1):
    try:
        sentences = sent_tokenize(text)
        if len(sentences) <= 1:
            return text
        tfidf = TfidfVectorizer(stop_words="english")
        tfidf_matrix = tfidf.fit_transform(sentences)
        if len(sentences) <= n_clusters:
            return " ".join(sentences)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix)
        summary_sentences = []
        for i in range(n_clusters):
            # Within each cluster, keep the sentence closest to the centroid.
            idx = np.where(kmeans.labels_ == i)[0]
            if not len(idx):
                continue
            avg_vector = np.asarray(tfidf_matrix[idx].mean(axis=0))
            sim = cosine_similarity(avg_vector, tfidf_matrix[idx].toarray())
            most_representative = sentences[idx[np.argmax(sim)]]
            summary_sentences.append(most_representative)
        # Re-join the picked sentences in their original order.
        return " ".join(sorted(summary_sentences, key=sentences.index))
    except Exception as e:
        logging.error(f"Smart summarize error: {e}")
        return text
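# Usage sketch (made-up review text, illustrative only):
#   smart_summarize(
#       "Battery life is excellent and lasts two days. "
#       "The screen is dim outdoors. Shipping was fast.",
#       n_clusters=2,
#   )
# picks one representative sentence per TF-IDF cluster and joins them
# in their original order.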
# === Emotion Detection ===
def detect_emotion(text):
    try:
        result = emotion_pipeline(text)[0]
        return result["label"]
    except Exception as e:
        logging.warning(f"Emotion detection failed: {e}")
        return "neutral"
# === Follow-up Q&A (Flexible for list or str) ===
def answer_followup(text, question, verbosity="brief"):
    try:
        # Accept either a single question string or a list of questions.
        if isinstance(question, list):
            answers = []
            for q in question:
                response = qa_pipeline({"question": q, "context": text})
                ans = response.get("answer", "")
                if verbosity.lower() == "detailed":
                    answers.append(f"**{q}** → {ans}")
                else:
                    answers.append(ans)
            return answers
        else:
            response = qa_pipeline({"question": question, "context": text})
            ans = response.get("answer", "")
            return f"**{question}** → {ans}" if verbosity.lower() == "detailed" else ans
    except Exception as e:
        logging.warning(f"Follow-up error: {e}")
        return "Sorry, I couldn't generate a follow-up answer."
# === Fast follow-up (used for direct /followup route) ===
def answer_only(text, question):
    try:
        if not question:
            return "No question provided."
        return qa_pipeline({"question": question, "context": text}).get("answer", "No answer found.")
    except Exception as e:
        logging.warning(f"Answer-only failed: {e}")
        return "Q&A failed."
# === Optional Explanation Generator ===
def generate_explanation(text):
    try:
        explanation = summarizer(text, max_length=60, min_length=20, do_sample=False)[0]["summary_text"]
        return f"🧠 This review can be explained as: {explanation}"
    except Exception as e:
        logging.warning(f"Explanation failed: {e}")
        return "⚠️ Explanation could not be generated."
# === Industry Detector ===
def detect_industry(text):
    text = text.lower()
    # Note: these are plain substring checks, so short keywords can misfire
    # (e.g. "med" matches "recommended", "order" matches "ordered").
    if any(k in text for k in ["doctor", "hospital", "health", "pill", "med"]):
        return "Healthcare"
    if any(k in text for k in ["flight", "hotel", "trip", "booking"]):
        return "Travel"
    if any(k in text for k in ["bank", "loan", "credit", "payment"]):
        return "Banking"
    if any(k in text for k in ["gym", "trainer", "fitness", "workout"]):
        return "Fitness"
    if any(k in text for k in ["movie", "series", "stream", "video"]):
        return "Entertainment"
    if any(k in text for k in ["game", "gaming", "console"]):
        return "Gaming"
    if any(k in text for k in ["food", "delivery", "restaurant", "order"]):
        return "Food Delivery"
    if any(k in text for k in ["school", "university", "teacher", "course"]):
        return "Education"
    if any(k in text for k in ["insurance", "policy", "claim"]):
        return "Insurance"
    if any(k in text for k in ["property", "rent", "apartment", "house"]):
        return "Real Estate"
    if any(k in text for k in ["shop", "buy", "product", "phone", "amazon", "flipkart"]):
        return "E-commerce"
    return "Generic"
# === Product Category Detector ===
def detect_product_category(text):
    text = text.lower()
    if any(k in text for k in ["mobile", "smartphone", "iphone", "samsung", "phone"]):
        return "Mobile Devices"
    if any(k in text for k in ["laptop", "macbook", "notebook", "chromebook"]):
        return "Laptops"
    if any(k in text for k in ["tv", "refrigerator", "microwave", "washer"]):
        return "Home Appliances"
    if any(k in text for k in ["watch", "band", "fitbit", "wearable"]):
        return "Wearables"
    if any(k in text for k in ["app", "portal", "site", "website"]):
        return "Web App"
    return "General"