Spaces:
Running
Running
File size: 4,081 Bytes
c06888f cd098b8 a6cc94f 80cb61e 494f3c9 a6cc94f b5ad127 494f3c9 b5ad127 c06888f 1a7ffb3 b5ad127 1a7ffb3 b5ad127 1a7ffb3 e547dd2 ef42ebc b5ad127 a6cc94f 494f3c9 b5ad127 a6cc94f b5ad127 494f3c9 a6cc94f b5ad127 a6cc94f 356f40d a6cc94f 1a7ffb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os

# Redirect the Hugging Face cache locations to /tmp. These assignments come
# before `transformers` is imported further down.
# NOTE(review): presumably the order is deliberate so the library picks the
# values up at import time, and /tmp is used because it is writable in the
# deployment environment — confirm both.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"
import nltk

# Download the "punkt" resource into /tmp/nltk_data at import time.
# NOTE(review): presumably this is the data sent_tokenize relies on; verify
# that /tmp/nltk_data is on NLTK's data search path (e.g. via NLTK_DATA),
# otherwise the tokenizer may not find what was downloaded here.
nltk.download("punkt", download_dir="/tmp/nltk_data")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import numpy as np

# Build the summarization and question-answering pipelines once, at module
# import, so the functions below share the same two module-level objects.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
# --- Summarization Functions ---
def summarize_review(text, max_len=130, min_len=30):
    """Summarize a review with the module-level summarization pipeline.

    Args:
        text: Review text to summarize.
        max_len: Value forwarded to the pipeline as ``max_length``.
        min_len: Value forwarded to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` field of the first pipeline result, or an empty
        string when ``text`` is empty or whitespace-only.
    """
    # Nothing to summarize: return early instead of invoking the model.
    if not text or not text.strip():
        return ""

    # The length bounds are explicit parameters so the call no longer depends
    # on names that are not defined inside this function.
    results = summarizer(
        text,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
    )
    return results[0]["summary_text"]
def smart_summarize(text, n_clusters=1):
    """Build an extractive summary by clustering sentences.

    The text is split into sentences, the sentences are TF-IDF vectorized
    (English stop words removed) and grouped with KMeans. From every
    non-empty cluster the sentence with the highest cosine similarity to the
    cluster's mean TF-IDF vector is kept, and the kept sentences are joined
    with single spaces in their original document order.

    Args:
        text: Text to summarize.
        n_clusters: Number of KMeans clusters, i.e. the maximum number of
            sentences in the summary.

    Returns:
        ``text`` unchanged when it is empty, has at most one sentence, or
        yields no TF-IDF vocabulary; all sentences joined by spaces when
        there are no more sentences than clusters; otherwise the joined
        representative sentences.
    """
    # Empty input has nothing to tokenize; hand it back untouched.
    if not text or not text.strip():
        return text

    sentences = sent_tokenize(text)
    if len(sentences) <= 1:
        return text
    # Checked before vectorizing so the short-input path does no TF-IDF work.
    if len(sentences) <= n_clusters:
        return " ".join(sentences)

    try:
        tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(sentences)
    except ValueError:
        # NOTE(review): scikit-learn raises ValueError for an empty
        # vocabulary (e.g. every sentence is only stop words). There is
        # nothing to cluster on in that case, so fall back to the input.
        return text

    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix)

    chosen = []
    for label in range(n_clusters):
        members = np.where(kmeans.labels_ == label)[0]
        if not len(members):
            continue
        cluster_rows = tfidf_matrix[members]
        # reshape keeps the centroid 2-D (1 x n_features) for cosine_similarity.
        centroid = np.asarray(cluster_rows.mean(axis=0)).reshape(1, -1)
        similarity = cosine_similarity(centroid, cluster_rows.toarray())
        chosen.append(int(members[np.argmax(similarity)]))

    # Sort the selected positions directly rather than looking sentences up
    # again by value, which is ambiguous when a sentence occurs twice.
    return " ".join(sentences[position] for position in sorted(chosen))
# --- Rule-based Category Detectors ---
def detect_industry(text):
    """Guess the industry a review refers to from keyword matches.

    The text is lower-cased and split into alphanumeric words. A keyword
    matches when some word *starts with* it, so inflected forms still match
    ("flights" -> "flight", "medical" -> "med") while unrelated words that
    merely contain the keyword do not ("performed" no longer matches "med",
    "current" no longer matches "rent"). A keyword that only appears in the
    middle of a compound word is not matched.

    Rules are evaluated in the order listed and the first match wins.

    Args:
        text: Review text.

    Returns:
        The industry label of the first matching rule, or ``"Generic"`` when
        no rule matches.
    """
    # (label, keyword prefixes) in priority order.
    rules = (
        ("Healthcare", ("doctor", "hospital", "health", "pill", "med")),
        ("Travel", ("flight", "hotel", "trip", "booking")),
        ("Banking", ("bank", "loan", "credit", "payment")),
        ("Fitness", ("gym", "trainer", "fitness", "workout")),
        ("Entertainment", ("movie", "series", "stream", "video")),
        ("Gaming", ("game", "gaming", "console")),
        ("Food Delivery", ("food", "delivery", "restaurant", "order")),
        ("Education", ("school", "university", "teacher", "course")),
        ("Insurance", ("insurance", "policy", "claim")),
        ("Real Estate", ("property", "rent", "apartment", "house")),
        ("E-commerce", ("shop", "buy", "product", "phone", "amazon", "flipkart")),
    )

    # Replace every non-alphanumeric character with a space, then split, so
    # punctuation never glues two words together.
    words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()

    for label, prefixes in rules:
        # str.startswith accepts a tuple and checks all prefixes in one call.
        if any(word.startswith(prefixes) for word in words):
            return label
    return "Generic"
def detect_product_category(text):
    """Guess the product category a review refers to from keyword matches.

    The text is lower-cased and split into alphanumeric words. A keyword
    matches when some word *starts with* it, so "phones" still matches
    "phone" and "apps" still matches "app", while words that merely contain
    a keyword do not ("happy" and "disappointed" no longer match "app",
    "husband" no longer matches "band"). A keyword that only appears in the
    middle of a compound word is not matched.

    Rules are evaluated in the order listed and the first match wins.

    Args:
        text: Review text.

    Returns:
        The category label of the first matching rule, or ``"General"`` when
        no rule matches.
    """
    # (label, keyword prefixes) in priority order.
    rules = (
        ("Mobile Devices", ("mobile", "smartphone", "iphone", "samsung", "phone")),
        ("Laptops", ("laptop", "macbook", "notebook", "chromebook")),
        ("Home Appliances", ("tv", "refrigerator", "microwave", "washer")),
        ("Wearables", ("watch", "band", "fitbit", "wearable")),
        ("Web App", ("app", "portal", "site", "website")),
    )

    # Replace every non-alphanumeric character with a space, then split, so
    # punctuation never glues two words together.
    words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()

    for label, prefixes in rules:
        # str.startswith accepts a tuple and checks all prefixes in one call.
        if any(word.startswith(prefixes) for word in words):
            return label
    return "General"
# --- Follow-up Q&A ---
def answer_followup(text, question, verbosity="brief"):
    """Answer a follow-up question about a review using the QA pipeline.

    Args:
        text: Review text passed to the pipeline as the ``context``.
        question: The question to answer.
        verbosity: ``"detailed"`` (case-insensitive) wraps the answer in an
            explanatory sentence; any other value returns the bare answer.

    Returns:
        The pipeline's ``answer`` field (empty string if the field is
        missing), optionally wrapped for detailed verbosity. If anything in
        the lookup raises, a fixed apology string is returned instead; this
        function does not propagate exceptions.
    """
    import logging

    try:
        response = qa_pipeline({"question": question, "context": text})
        answer = response.get("answer", "")
        if verbosity.lower() == "detailed":
            return f"Based on the review, the answer is: **{answer}**"
        return answer
    except Exception:
        # Deliberately best-effort: the caller always gets a string back.
        # The failure is logged with its traceback so it is not lost.
        logging.getLogger(__name__).exception("Follow-up question answering failed")
        return "Sorry, I couldn't generate a follow-up answer."
|