reviews-insights / aspects_extraction.py
analytics-jiten's picture
Create aspects_extraction.py
60ded96
raw
history blame
8.79 kB
import pandas as pd
import numpy as np
def has_vectors(doc):
    """Return True when every token in *doc* reports a word vector.

    Args:
        doc: An iterable of tokens exposing a ``has_vector`` attribute
            (presumably a spaCy ``Doc`` -- confirm against callers).

    Returns:
        bool: True if ``has_vector`` is truthy for all tokens. An empty
        *doc* yields True, since there is no token without a vector.
    """
    # Builtin all() stops at the first token without a vector and returns a
    # plain bool; no intermediate list or NumPy array is needed.
    return all(token.has_vector for token in doc)
# Pronouns treated as referring to the reviewed item itself; an aspect noun
# equal to one of these is reported as "product".
_PRODUCT_PRONOUNS = frozenset({'it', 'this', 'they', 'these'})


def _make_pair(noun, modifier, rule):
    """Build one aspect record, mapping product pronouns to "product"."""
    if noun in _PRODUCT_PRONOUNS:
        noun = "product"
    return {"noun": noun, "adj": modifier, "rule": rule}


def _with_intensifier(token):
    """Return token.text, prefixed by its first 'advmod' child when one exists.

    Example: 'most comfortable' for the adjective in 'most comfortable headphones'.
    """
    for child in token.children:
        if child.dep_ == "advmod":
            return child.text + " " + token.text
    return token.text


def _last_child_text(token, dep):
    """Return the text of the last non-stop-word child with dependency *dep*, or None."""
    found = None
    for child in token.children:
        if child.dep_ == dep and not child.is_stop:
            found = child.text
    return found


def _amod_pair(token):
    """Rule 1: the modifier is an 'amod' child of the aspect noun.

    Returns (noun, modifier) or None. A 'no' determiner on the noun negates
    the modifier (e.g. 'no interesting characters' -> 'not interesting').
    """
    if token.dep_ != "amod" or token.is_stop:
        return None
    modifier = _with_intensifier(token)
    for sibling in token.head.children:
        if sibling.dep_ == "det" and sibling.text == 'no':
            modifier = "not " + modifier
            break
    return token.head.text, modifier


def _subject_and_modifier(token, modifier_dep, modifier_pos=None,
                          subject_deps=("nsubj",), intensify=False,
                          modal_negates=False):
    """Find a subject child and a modifier child that share *token* as head.

    Shared scan behind rules 2, 3, 4 and 7. Stop-word children are ignored as
    subject or modifier. When several children qualify, the last one wins.

    Args:
        token: The candidate head token.
        modifier_dep: Dependency label the modifier child must carry.
        modifier_pos: If given, the modifier child must also have this pos_.
        subject_deps: Dependency labels accepted for the subject child.
        intensify: Prefix the modifier with its own first 'advmod' child.
        modal_negates: Treat a modal auxiliary (dep 'aux', tag 'MD') as an
            implicit negation, e.g. 'could be better' -> 'not better'.

    Returns:
        (noun, modifier) when both were found, otherwise None. The modifier
        is prefixed with the negation text when a 'neg' child (or a modal,
        if enabled) is present.
    """
    noun = None
    modifier = None
    negation = None
    for child in token.children:
        if child.dep_ in subject_deps and not child.is_stop:
            noun = child.text
        if (child.dep_ == modifier_dep
                and (modifier_pos is None or child.pos_ == modifier_pos)
                and not child.is_stop):
            modifier = _with_intensifier(child) if intensify else child.text
        if modal_negates and child.dep_ == "aux" and child.tag_ == "MD":
            negation = "not"
        if child.dep_ == "neg":
            negation = child.text
    if noun is None or modifier is None:
        return None
    if negation is not None:
        modifier = negation + " " + modifier
    return noun, modifier


def extract_doc_aspects(doc):
    """Extract (aspect, opinion) pairs from a parsed document using seven dependency rules.

    Each token is tested as a potential anchor for every rule; tokens whose
    text is exactly 'product' are skipped. In the rules below, A is the
    aspect and M is the sentiment modifier:

        1. M is an 'amod' child of A.
        2. A is 'nsubj' and M is an adjectival 'dobj' of the same head.
        3. A is 'nsubj' and M is 'acomp' of the same head; a modal auxiliary
           counts as negation.
        4. A is 'nsubj'/'nsubjpass' and M is 'advmod' of the same head.
        5. A is 'nsubj' of M, and M has a 'cop' child.
        6. M is an interjection (pos_ 'INTJ') with an 'nsubj' child A.
        7. A is 'nsubj' and M is 'attr' of the same head.

    Args:
        doc: Iterable of dependency-parsed tokens. The code reads ``text``,
            ``dep_``, ``pos_``, ``tag_``, ``is_stop``, ``head`` and
            ``children`` (presumably a spaCy ``Doc`` -- confirm against callers).

    Returns:
        list[dict]: Records with keys ``"noun"``, ``"adj"`` and ``"rule"``
        (1-7), ordered by rule number and, within a rule, by token order.
    """
    # One bucket per rule so the result stays grouped by rule number.
    pairs_by_rule = [[] for _ in range(7)]

    def record(rule, found):
        if found is not None:
            noun, modifier = found
            pairs_by_rule[rule - 1].append(_make_pair(noun, modifier, rule))

    for token in doc:
        if token.text == 'product':
            continue

        record(1, _amod_pair(token))
        record(2, _subject_and_modifier(token, "dobj", modifier_pos="ADJ"))
        record(3, _subject_and_modifier(token, "acomp", modal_negates=True))
        record(4, _subject_and_modifier(token, "advmod",
                                        subject_deps=("nsubj", "nsubjpass"),
                                        intensify=True))

        # Rules 5 and 6 use the anchor token itself as the modifier.
        subject = _last_child_text(token, "nsubj")
        if subject is not None:
            if _last_child_text(token, "cop") is not None:
                record(5, (subject, token.text))
            if token.pos_ == "INTJ" and not token.is_stop:
                record(6, (subject, token.text))

        record(7, _subject_and_modifier(token, "attr"))

    return [pair for pairs in pairs_by_rule for pair in pairs]
def extract_aspects(nlp, reviews):
    """Extract aspect/opinion pairs from every review and return them as a DataFrame.

    Args:
        nlp: Callable language pipeline that also provides
            ``pipe(data, as_tuples=True)`` (presumably a spaCy ``Language``
            object -- confirm against callers).
        reviews: pandas DataFrame with a ``'text_cleaned'`` column; its index
            labels are used as review ids.

    Returns:
        pandas.DataFrame: Columns ``review_id``, ``aspect``, ``opinion`` and
        ``rule``, one row per retained aspect. Aspects whose noun starts with
        'product' (case-insensitive) are dropped, as are aspects whose noun
        fails ``has_vectors``.
    """
    # pipe(as_tuples=True) takes (text, context) pairs; the context carried
    # alongside each text is the review's index label.
    data = (
        (text, review_id)
        for review_id, text in reviews['text_cleaned'].items()
    )

    # noun -> has_vectors(nlp(noun)). The same nouns recur across reviews, so
    # the pipeline is run only once per distinct noun.
    in_vocabulary = {}

    rows = []
    for doc, review_id in nlp.pipe(data, as_tuples=True):
        for aspect in extract_doc_aspects(doc):
            noun = aspect['noun']
            if noun.lower().startswith('product'):
                continue
            # filter aspects with out-of-vocabulary nouns
            if noun not in in_vocabulary:
                in_vocabulary[noun] = has_vectors(nlp(noun))
            if in_vocabulary[noun]:
                rows.append([review_id, noun, aspect['adj'], aspect['rule']])

    return pd.DataFrame(rows, columns=['review_id', 'aspect', 'opinion', 'rule'])