Hasitha16 committed on
Commit
b5ad127
·
verified ·
1 Parent(s): 5053036

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +121 -120
model.py CHANGED
@@ -1,121 +1,122 @@
1
- from typing import List, Optional
2
- from pydantic import BaseModel
3
- from transformers import pipeline
4
- import nltk.data
5
-
6
- # ✅ Extra: Smart Summarization Imports
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.cluster import KMeans
9
- from nltk.tokenize import sent_tokenize
10
- from sklearn.metrics.pairwise import cosine_similarity
11
- import numpy as np
12
-
13
- # 📄 Load HuggingFace Pipelines
14
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
15
- sentiment_analyzer = pipeline("sentiment-analysis")
16
-
17
- # 🧠 Basic Summarization (Abstractive)
18
- def summarize_review(text):
19
- return summarizer(text, max_length=60, min_length=10, do_sample=False, no_repeat_ngram_size=3)[0]["summary_text"]
20
-
21
- # 🧠 Smart Summarization (Clustered Key Sentences)
22
- def smart_summarize(text, n_clusters=1):
23
- """Improved summarization using clustering on sentence embeddings"""
24
- tokenizer = nltk.tokenize.PunktSentenceTokenizer() # ✅ Use default trained Punkt tokenizer
25
- sentences = tokenizer.tokenize(text)
26
-
27
- if len(sentences) <= 1:
28
- return text
29
-
30
- vectorizer = TfidfVectorizer(stop_words="english")
31
- tfidf_matrix = vectorizer.fit_transform(sentences)
32
-
33
- if len(sentences) <= n_clusters:
34
- return " ".join(sentences)
35
-
36
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
37
- kmeans.fit(tfidf_matrix)
38
-
39
- avg = []
40
- for i in range(n_clusters):
41
- idx = np.where(kmeans.labels_ == i)[0]
42
- if len(idx) == 0:
43
- continue
44
- avg_vector = tfidf_matrix[idx].mean(axis=0).A1.reshape(1, -1) # Convert np.matrix to ndarray
45
- sim = cosine_similarity(avg_vector, tfidf_matrix[idx])
46
- most_representative_idx = idx[np.argmax(sim)]
47
- avg.append(sentences[most_representative_idx])
48
-
49
- return " ".join(sorted(avg, key=sentences.index))
50
-
51
- # 📊 Sentiment Detection
52
- def analyze_sentiment(text):
53
- result = sentiment_analyzer(text)[0]
54
- label = result["label"]
55
- score = result["score"]
56
-
57
- if "star" in label:
58
- stars = int(label[0])
59
- if stars <= 2:
60
- label = "NEGATIVE"
61
- elif stars == 3:
62
- label = "NEUTRAL"
63
- else:
64
- label = "POSITIVE"
65
-
66
- return {
67
- "label": label,
68
- "score": score
69
- }
70
-
71
- # 🔥 Emotion Detection (heuristic-based)
72
- def detect_emotion(text):
73
- text_lower = text.lower()
74
- if "angry" in text_lower or "hate" in text_lower:
75
- return "anger"
76
- elif "happy" in text_lower or "love" in text_lower:
77
- return "joy"
78
- elif "sad" in text_lower or "disappointed" in text_lower:
79
- return "sadness"
80
- elif "confused" in text_lower or "unclear" in text_lower:
81
- return "confusion"
82
- else:
83
- return "neutral"
84
-
85
- # 🧩 Aspect-Based Sentiment (mock)
86
- def extract_aspect_sentiment(text, aspects: list):
87
- results = {}
88
- text_lower = text.lower()
89
- for asp in aspects:
90
- label = "positive" if asp in text_lower and "not" not in text_lower else "neutral"
91
- results[asp] = {
92
- "label": label,
93
- "confidence": 0.85
94
- }
95
- return results
96
-
97
- # ✅ Pydantic Schemas for FastAPI
98
- class ReviewInput(BaseModel):
99
- text: str
100
- model: str = "distilbert-base-uncased-finetuned-sst-2-english"
101
- industry: str = "Generic"
102
- aspects: bool = False
103
- follow_up: Optional[str] = None
104
- product_category: Optional[str] = None
105
- device: Optional[str] = None
106
-
107
- class BulkReviewInput(BaseModel):
108
- reviews: List[str]
109
- model: str = "distilbert-base-uncased-finetuned-sst-2-english"
110
- industry: str = "Generic"
111
- aspects: bool = False
112
- product_category: Optional[str] = None
113
- device: Optional[str] = None
114
-
115
- class TranslationInput(BaseModel):
116
- text: str
117
- target_lang: str = "fr"
118
-
119
- class ChatInput(BaseModel):
120
- question: str
 
121
  context: str
 
1
+ from typing import List, Optional
2
+ from pydantic import BaseModel
3
+ from transformers import pipeline
4
+ import nltk.data
5
+
6
+ # ✅ Extra: Smart Summarization Imports
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.cluster import KMeans
9
+ from nltk.tokenize import sent_tokenize
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import numpy as np
12
+ import os
13
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
14
+ # 📄 Load HuggingFace Pipelines
15
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
16
+ sentiment_analyzer = pipeline("sentiment-analysis")
17
+
18
+ # 🧠 Basic Summarization (Abstractive)
19
+ def summarize_review(text):
20
+ return summarizer(text, max_length=60, min_length=10, do_sample=False, no_repeat_ngram_size=3)[0]["summary_text"]
21
+
22
+ # 🧠 Smart Summarization (Clustered Key Sentences)
23
+ def smart_summarize(text, n_clusters=1):
24
+ """Improved summarization using clustering on sentence embeddings"""
25
+ tokenizer = nltk.tokenize.PunktSentenceTokenizer() # ✅ Use default trained Punkt tokenizer
26
+ sentences = tokenizer.tokenize(text)
27
+
28
+ if len(sentences) <= 1:
29
+ return text
30
+
31
+ vectorizer = TfidfVectorizer(stop_words="english")
32
+ tfidf_matrix = vectorizer.fit_transform(sentences)
33
+
34
+ if len(sentences) <= n_clusters:
35
+ return " ".join(sentences)
36
+
37
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
38
+ kmeans.fit(tfidf_matrix)
39
+
40
+ avg = []
41
+ for i in range(n_clusters):
42
+ idx = np.where(kmeans.labels_ == i)[0]
43
+ if len(idx) == 0:
44
+ continue
45
+ avg_vector = tfidf_matrix[idx].mean(axis=0).A1.reshape(1, -1) # Convert np.matrix to ndarray
46
+ sim = cosine_similarity(avg_vector, tfidf_matrix[idx])
47
+ most_representative_idx = idx[np.argmax(sim)]
48
+ avg.append(sentences[most_representative_idx])
49
+
50
+ return " ".join(sorted(avg, key=sentences.index))
51
+
52
+ # 📊 Sentiment Detection
53
+ def analyze_sentiment(text):
54
+ result = sentiment_analyzer(text)[0]
55
+ label = result["label"]
56
+ score = result["score"]
57
+
58
+ if "star" in label:
59
+ stars = int(label[0])
60
+ if stars <= 2:
61
+ label = "NEGATIVE"
62
+ elif stars == 3:
63
+ label = "NEUTRAL"
64
+ else:
65
+ label = "POSITIVE"
66
+
67
+ return {
68
+ "label": label,
69
+ "score": score
70
+ }
71
+
72
+ # 🔥 Emotion Detection (heuristic-based)
73
+ def detect_emotion(text):
74
+ text_lower = text.lower()
75
+ if "angry" in text_lower or "hate" in text_lower:
76
+ return "anger"
77
+ elif "happy" in text_lower or "love" in text_lower:
78
+ return "joy"
79
+ elif "sad" in text_lower or "disappointed" in text_lower:
80
+ return "sadness"
81
+ elif "confused" in text_lower or "unclear" in text_lower:
82
+ return "confusion"
83
+ else:
84
+ return "neutral"
85
+
86
+ # 🧩 Aspect-Based Sentiment (mock)
87
+ def extract_aspect_sentiment(text, aspects: list):
88
+ results = {}
89
+ text_lower = text.lower()
90
+ for asp in aspects:
91
+ label = "positive" if asp in text_lower and "not" not in text_lower else "neutral"
92
+ results[asp] = {
93
+ "label": label,
94
+ "confidence": 0.85
95
+ }
96
+ return results
97
+
98
+ # ✅ Pydantic Schemas for FastAPI
99
+ class ReviewInput(BaseModel):
100
+ text: str
101
+ model: str = "distilbert-base-uncased-finetuned-sst-2-english"
102
+ industry: str = "Generic"
103
+ aspects: bool = False
104
+ follow_up: Optional[str] = None
105
+ product_category: Optional[str] = None
106
+ device: Optional[str] = None
107
+
108
+ class BulkReviewInput(BaseModel):
109
+ reviews: List[str]
110
+ model: str = "distilbert-base-uncased-finetuned-sst-2-english"
111
+ industry: str = "Generic"
112
+ aspects: bool = False
113
+ product_category: Optional[str] = None
114
+ device: Optional[str] = None
115
+
116
+ class TranslationInput(BaseModel):
117
+ text: str
118
+ target_lang: str = "fr"
119
+
120
+ class ChatInput(BaseModel):
121
+ question: str
122
  context: str