Hasitha16 committed
Commit 494f3c9 · verified
1 Parent(s): 672778d

Update model.py

Files changed (1):
  1. model.py +11 -101
model.py CHANGED
@@ -2,124 +2,34 @@ import os
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
 os.environ["HF_HOME"] = "/tmp/hf-home"
 import nltk
-os.environ["NLTK_DATA"] = "/tmp/nltk_data"
-nltk.download("punkt", download_dir="/tmp/nltk_data")
-from typing import List, Optional
-from pydantic import BaseModel
-from transformers import pipeline
-
-# ✅ Extra: Smart Summarization Imports
+nltk.download("punkt", download_dir="/tmp/nltk_data")
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
-from nltk.tokenize import sent_tokenize
 from sklearn.metrics.pairwise import cosine_similarity
+from nltk.tokenize import sent_tokenize
+from transformers import pipeline
 import numpy as np
 
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-sentiment_analyzer = pipeline("sentiment-analysis")
 
-# 🧠 Basic Summarization (Abstractive)
 def summarize_review(text):
-    return summarizer(text, max_length=60, min_length=10, do_sample=False, no_repeat_ngram_size=3)[0]["summary_text"]
+    return summarizer(text, max_length=60, min_length=10, do_sample=False)[0]["summary_text"]
 
-# 🧠 Smart Summarization (Clustered Key Sentences)
 def smart_summarize(text, n_clusters=1):
-    """Improved summarization using clustering on sentence embeddings"""
-    tokenizer = nltk.tokenize.PunktSentenceTokenizer()  # ✅ Use default trained Punkt tokenizer
-    sentences = tokenizer.tokenize(text)
-
+    sentences = sent_tokenize(text)
     if len(sentences) <= 1:
         return text
 
-    vectorizer = TfidfVectorizer(stop_words="english")
-    tfidf_matrix = vectorizer.fit_transform(sentences)
-
+    tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(sentences)
     if len(sentences) <= n_clusters:
         return " ".join(sentences)
 
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    kmeans.fit(tfidf_matrix)
-
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix)
     avg = []
    for i in range(n_clusters):
         idx = np.where(kmeans.labels_ == i)[0]
-        if len(idx) == 0:
-            continue
-        avg_vector = tfidf_matrix[idx].mean(axis=0).A1.reshape(1, -1)  # Convert np.matrix to ndarray
+        if not len(idx): continue
+        avg_vector = tfidf_matrix[idx].mean(axis=0)
         sim = cosine_similarity(avg_vector, tfidf_matrix[idx])
-        most_representative_idx = idx[np.argmax(sim)]
-        avg.append(sentences[most_representative_idx])
-
-    return " ".join(sorted(avg, key=sentences.index))
-
-# 📊 Sentiment Detection
-def analyze_sentiment(text):
-    result = sentiment_analyzer(text)[0]
-    label = result["label"]
-    score = result["score"]
-
-    if "star" in label:
-        stars = int(label[0])
-        if stars <= 2:
-            label = "NEGATIVE"
-        elif stars == 3:
-            label = "NEUTRAL"
-        else:
-            label = "POSITIVE"
-
-    return {
-        "label": label,
-        "score": score
-    }
-
-# 🔥 Emotion Detection (heuristic-based)
-def detect_emotion(text):
-    text_lower = text.lower()
-    if "angry" in text_lower or "hate" in text_lower:
-        return "anger"
-    elif "happy" in text_lower or "love" in text_lower:
-        return "joy"
-    elif "sad" in text_lower or "disappointed" in text_lower:
-        return "sadness"
-    elif "confused" in text_lower or "unclear" in text_lower:
-        return "confusion"
-    else:
-        return "neutral"
-
-# 🧩 Aspect-Based Sentiment (mock)
-def extract_aspect_sentiment(text, aspects: list):
-    results = {}
-    text_lower = text.lower()
-    for asp in aspects:
-        label = "positive" if asp in text_lower and "not" not in text_lower else "neutral"
-        results[asp] = {
-            "label": label,
-            "confidence": 0.85
-        }
-    return results
-
-# ✅ Pydantic Schemas for FastAPI
-class ReviewInput(BaseModel):
-    text: str
-    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
-    industry: str = "Generic"
-    aspects: bool = False
-    follow_up: Optional[str] = None
-    product_category: Optional[str] = None
-    device: Optional[str] = None
-
-class BulkReviewInput(BaseModel):
-    reviews: List[str]
-    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
-    industry: str = "Generic"
-    aspects: bool = False
-    product_category: Optional[str] = None
-    device: Optional[str] = None
-
-class TranslationInput(BaseModel):
-    text: str
-    target_lang: str = "fr"
-
-class ChatInput(BaseModel):
-    question: str
-    context: str
+        avg.append(sentences[idx[np.argmax(sim)]])
+    return " ".join(sorted(avg, key=sentences.index))
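A note on the punkt setup in the new version: the commit drops the os.environ["NLTK_DATA"] line while still downloading punkt into /tmp/nltk_data, a directory NLTK does not search by default, so sent_tokenize may fail with a LookupError at runtime. A minimal sketch of one way to make the download discoverable, assuming the same cache location:

    import nltk

    # Download punkt to the writable /tmp cache, then tell NLTK to
    # search that directory when resolving tokenizer data.
    nltk.download("punkt", download_dir="/tmp/nltk_data")
    nltk.data.path.append("/tmp/nltk_data")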
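Another caveat: .mean(axis=0) on a SciPy sparse TF-IDF matrix returns a numpy.matrix, which recent scikit-learn releases reject in cosine_similarity (the .A1.reshape(1, -1) conversion removed here existed for that reason). A hedged sketch of the defensive variant, reusing tfidf_matrix and idx from the diff:

    # np.asarray converts the np.matrix produced by sparse .mean(axis=0)
    # into the plain 2-D ndarray that cosine_similarity accepts.
    avg_vector = np.asarray(tfidf_matrix[idx].mean(axis=0))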
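For quick verification, a minimal usage sketch of the two helpers that survive this commit; the sample review text and cluster count are illustrative, not part of the repo:

    from model import summarize_review, smart_summarize

    review = (
        "The battery lasts all day and the screen is gorgeous. "
        "Shipping took two weeks, which was frustrating. "
        "Overall I would still recommend this laptop to a friend."
    )

    print(summarize_review(review))               # abstractive (distilbart-cnn-12-6)
    print(smart_summarize(review, n_clusters=2))  # extractive (TF-IDF + KMeans)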