import os
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"

import nltk
nltk.download("punkt", download_dir="/tmp/nltk_data")
nltk.data.path.append("/tmp/nltk_data")  # register the custom download dir so sent_tokenize can find punkt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import numpy as np

# Load summarizer and Q&A pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# --- Summarization Functions ---
def summarize_review(text, max_len=60, min_len=20):
    """Standard transformer-based summarization."""
    # max_len/min_len defaults are reasonable starting values; tune them for your data
    return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]

def smart_summarize(text, n_clusters=1):
    """
    Clustering + cosine similarity-based summarization
    Selects most representative sentence(s) from each cluster
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= 1:
        return text

    tfidf = TfidfVectorizer(stop_words="english")
    tfidf_matrix = tfidf.fit_transform(sentences)

    if len(sentences) <= n_clusters:
        return " ".join(sentences)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix)
    summary_sentences = []

    for i in range(n_clusters):
        idx = np.where(kmeans.labels_ == i)[0]
        if not len(idx):
            continue
        avg_vector = np.asarray(tfidf_matrix[idx].mean(axis=0))
        sim = cosine_similarity(avg_vector, tfidf_matrix[idx].toarray())
        most_representative = sentences[idx[np.argmax(sim)]]
        summary_sentences.append(most_representative)

    return " ".join(sorted(summary_sentences, key=sentences.index))

# --- Rule-based Category Detectors ---
def detect_industry(text):
    text = text.lower()
    if any(k in text for k in ["doctor", "hospital", "health", "pill", "med"]):
        return "Healthcare"
    if any(k in text for k in ["flight", "hotel", "trip", "booking"]):
        return "Travel"
    if any(k in text for k in ["bank", "loan", "credit", "payment"]):
        return "Banking"
    if any(k in text for k in ["gym", "trainer", "fitness", "workout"]):
        return "Fitness"
    if any(k in text for k in ["movie", "series", "stream", "video"]):
        return "Entertainment"
    if any(k in text for k in ["game", "gaming", "console"]):
        return "Gaming"
    if any(k in text for k in ["food", "delivery", "restaurant", "order"]):
        return "Food Delivery"
    if any(k in text for k in ["school", "university", "teacher", "course"]):
        return "Education"
    if any(k in text for k in ["insurance", "policy", "claim"]):
        return "Insurance"
    if any(k in text for k in ["property", "rent", "apartment", "house"]):
        return "Real Estate"
    if any(k in text for k in ["shop", "buy", "product", "phone", "amazon", "flipkart"]):
        return "E-commerce"
    return "Generic"

def detect_product_category(text):
    text = text.lower()
    if any(k in text for k in ["mobile", "smartphone", "iphone", "samsung", "phone"]):
        return "Mobile Devices"
    if any(k in text for k in ["laptop", "macbook", "notebook", "chromebook"]):
        return "Laptops"
    if any(k in text for k in ["tv", "refrigerator", "microwave", "washer"]):
        return "Home Appliances"
    if any(k in text for k in ["watch", "band", "fitbit", "wearable"]):
        return "Wearables"
    if any(k in text for k in ["app", "portal", "site", "website"]):
        return "Web App"
    return "General"

# --- Follow-up Q&A ---
def answer_followup(text, question, verbosity="brief"):
    try:
        response = qa_pipeline({"question": question, "context": text})
        answer = response.get("answer", "")
        if verbosity.lower() == "detailed":
            return f"Based on the review, the answer is: **{answer}**"
        return answer
    except Exception:
        return "Sorry, I couldn't generate a follow-up answer."