analytics-jiten committed on
Commit
60ded96
·
1 Parent(s): 01ad901

Create aspects_extraction.py

Browse files
Files changed (1) hide show
  1. aspects_extraction.py +261 -0
aspects_extraction.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def has_vectors(doc):
    """Report whether every token in ``doc`` exposes a word vector.

    Reads the ``has_vector`` attribute of each token and reduces the
    collected flags with ``numpy.all``; a ``doc`` that yields no tokens
    therefore produces a true result.
    """
    vector_flags = [tok.has_vector for tok in doc]
    return np.all(vector_flags)
6
+
7
def _amod_pair(token):
    """Rule 1: ``token`` is a non-stop ``amod`` child of the aspect noun.

    Returns ``(aspect, modifier)`` or ``None`` when the rule does not apply.
    """
    if token.dep_ != "amod" or token.is_stop:
        return None

    modifier = token.text

    # Fold in the first adverbial modifier of the adjective
    # (e.g. 'most comfortable headphones' -> 'most comfortable').
    for child in token.children:
        if child.dep_ == "advmod":
            modifier = child.text + " " + modifier
            break

    # Negation expressed through a 'no' determiner on the noun
    # (e.g. 'no interesting characters' -> 'not interesting').
    for sibling in token.head.children:
        if sibling.dep_ == "det" and sibling.text == "no":
            modifier = "not " + modifier
            break

    return token.head.text, modifier


def _subject_modifier_pair(token, modifier_dep, subject_deps=("nsubj",),
                           modifier_pos=None, modal_negates=False,
                           expand_adverb=False):
    """Rules 2, 3, 4 and 7: aspect and modifier are both children of ``token``.

    The aspect is the last non-stop child whose dependency is in
    ``subject_deps``; the modifier is the last non-stop child whose
    dependency equals ``modifier_dep`` (and whose ``pos_`` equals
    ``modifier_pos`` when that is given).  A ``neg`` child supplies a
    negation prefix.  With ``modal_negates`` a modal auxiliary (tag ``MD``)
    also negates, e.g. 'this could have been better' -> 'not better'.  With
    ``expand_adverb`` the first ``advmod`` child of the modifier is prepended
    to it.

    Returns ``(aspect, modifier)`` or ``None`` when either part is missing.
    """
    aspect = None
    modifier = None
    negation = None

    for child in token.children:
        if child.dep_ in subject_deps and not child.is_stop:
            aspect = child.text

        if (child.dep_ == modifier_dep
                and (modifier_pos is None or child.pos_ == modifier_pos)
                and not child.is_stop):
            modifier = child.text
            if expand_adverb:
                for grandchild in child.children:
                    if grandchild.dep_ == "advmod":
                        modifier = grandchild.text + " " + child.text
                        break

        if modal_negates and child.dep_ == "aux" and child.tag_ == "MD":
            negation = "not"

        # Checked after the modal so that, for one child sequence, whichever
        # negating child comes last decides the prefix.
        if child.dep_ == "neg":
            negation = child.text

    if aspect is None or modifier is None:
        return None
    if negation is not None:
        modifier = negation + " " + modifier
    return aspect, modifier


def _copula_pair(token):
    """Rule 5: ``token`` is the modifier; it has an ``nsubj`` and a ``cop`` child.

    Returns ``(aspect, token.text)`` or ``None`` when either child is missing.
    """
    aspect = None
    has_copula = False
    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text
        if child.dep_ == "cop" and not child.is_stop:
            has_copula = True

    if aspect is None or not has_copula:
        return None
    return aspect, token.text


def _interjection_pair(token):
    """Rule 6: ``token`` is a non-stop interjection with an ``nsubj`` child.

    Example from the original comments: "It ok", where "ok" is tagged INTJ.
    Returns ``(aspect, token.text)`` or ``None``.
    """
    if token.pos_ != "INTJ" or token.is_stop:
        return None

    aspect = None
    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text

    if aspect is None:
        return None
    return aspect, token.text


def extract_doc_aspects(doc):
    """Extract (aspect, sentiment modifier) pairs from a dependency-parsed doc.

    Each token of ``doc`` is tested against seven dependency-parse rules
    (M = sentiment modifier, A = aspect):

    1. M is an ``amod`` child of A.
    2. A is ``nsubj`` and M is an adjectival ``dobj`` of the same token.
    3. A is ``nsubj`` and M is ``acomp`` of the same token; a modal
       auxiliary or ``neg`` child negates M.
    4. A is ``nsubjpass``/``nsubj`` and M is ``advmod`` of the same token.
    5. A is ``nsubj`` of M, and M also has a ``cop`` child.
    6. M is an interjection (``INTJ``) with A as ``nsubj`` child.
    7. A is ``nsubj`` and M is ``attr`` of the same token
       (e.g. 'this is garbage').

    Tokens whose text is exactly ``'product'`` are skipped, and an aspect
    that is one of the pronouns 'it', 'this', 'they', 'these' is reported
    as ``'product'``.

    Missing aspects/modifiers are tracked with ``None`` rather than a magic
    string, so a token whose text happens to be ``'999999'`` is no longer
    mistaken for "nothing found".

    Args:
        doc: iterable of tokens exposing ``text``, ``dep_``, ``pos_``,
            ``tag_``, ``is_stop``, ``head`` and ``children`` (presumably a
            spaCy ``Doc`` - confirm against the caller).

    Returns:
        A list of dicts ``{"noun": A, "adj": M, "rule": n}``, ordered by
        rule number first and by token order within each rule.
    """
    prod_pronouns = ('it', 'this', 'they', 'these')

    # Rule number -> callable returning (aspect, modifier) or None.
    matchers = (
        (1, _amod_pair),
        (2, lambda tok: _subject_modifier_pair(tok, "dobj",
                                               modifier_pos="ADJ")),
        (3, lambda tok: _subject_modifier_pair(tok, "acomp",
                                               modal_negates=True)),
        (4, lambda tok: _subject_modifier_pair(
            tok, "advmod", subject_deps=("nsubjpass", "nsubj"),
            expand_adverb=True)),
        (5, _copula_pair),
        (6, _interjection_pair),
        (7, lambda tok: _subject_modifier_pair(tok, "attr")),
    )
    pairs_by_rule = {rule_id: [] for rule_id, _ in matchers}

    for token in doc:
        if token.text == 'product':
            continue

        for rule_id, matcher in matchers:
            found = matcher(token)
            if found is None:
                continue
            aspect, modifier = found
            if aspect in prod_pronouns:
                aspect = "product"
            pairs_by_rule[rule_id].append(
                {"noun": aspect, "adj": modifier, "rule": rule_id}
            )

    # Results are grouped per rule (all rule-1 pairs, then rule-2, ...).
    aspects = []
    for rule_id, _ in matchers:
        aspects.extend(pairs_by_rule[rule_id])
    return aspects
238
+
239
def extract_aspects(nlp, reviews):
    """Extract aspect/opinion pairs for every review into a DataFrame.

    Each entry of ``reviews['text_cleaned']`` is parsed with ``nlp.pipe``
    and passed to ``extract_doc_aspects``.  Aspects whose noun starts with
    'product' (case-insensitive) are discarded, as are aspects whose noun
    contains an out-of-vocabulary token according to ``has_vectors``.

    Args:
        nlp: callable language pipeline that also offers
            ``pipe(..., as_tuples=True)`` (presumably a spaCy ``Language``
            object - confirm against the caller).
        reviews: object whose ``'text_cleaned'`` entry is a pandas Series of
            review texts; the Series index is used as the review id.

    Returns:
        A ``pandas.DataFrame`` with columns ``review_id``, ``aspect``,
        ``opinion`` and ``rule``, one row per retained aspect.
    """
    # (text, context) tuples as expected by ``pipe(as_tuples=True)``.
    # ``Series.items()`` pairs every text with its index label directly, so
    # no positional assumptions about ``reset_index()`` columns are needed.
    texts_with_ids = [
        (text, review_id)
        for review_id, text in reviews['text_cleaned'].items()
    ]

    # Parsing a noun only to look up its vectors is expensive and the same
    # nouns recur across reviews, so remember the verdict per distinct noun.
    noun_has_vectors = {}

    rows = []
    for doc, review_id in nlp.pipe(texts_with_ids, as_tuples=True):
        for aspect in extract_doc_aspects(doc):
            noun = aspect['noun']
            if noun.lower().startswith('product'):
                continue

            # Filter aspects with out-of-vocabulary nouns.
            if noun not in noun_has_vectors:
                noun_has_vectors[noun] = has_vectors(nlp(noun))
            if noun_has_vectors[noun]:
                rows.append([review_id, noun, aspect['adj'], aspect['rule']])

    return pd.DataFrame(
        rows, columns=['review_id', 'aspect', 'opinion', 'rule']
    )