Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
def has_vectors(doc):
    """Return whether every token in *doc* reports ``has_vector``.

    The flags are reduced with ``np.all``, so the result is a NumPy boolean
    and an empty *doc* yields a true value.
    """
    vector_flags = [tok.has_vector for tok in doc]
    return np.all(vector_flags)
def _first_advmod_text(token):
    """Return the text of the first ``advmod`` child of *token*, or None."""
    for child in token.children:
        if child.dep_ == "advmod":
            return child.text
    return None


def _rule_amod(token):
    """Rule 1: *token* is an ``amod`` modifier; its head is the aspect."""
    if token.dep_ != "amod" or token.is_stop:
        return None, None
    modifier = token.text
    # Keep an adverbial intensifier (e.g. 'most comfortable headphones').
    adverb = _first_advmod_text(token)
    if adverb is not None:
        modifier = adverb + " " + modifier
    # "no" as determiner of the noun negates the adjective
    # (e.g. 'no interesting characters' -> 'not interesting').
    for sibling in token.head.children:
        if sibling.dep_ == "det" and sibling.text == 'no':
            modifier = "not " + modifier
            break
    return token.head.text, modifier


def _subject_and_modifier(token, modifier_dep, *, subject_deps=("nsubj",),
                          adjective_only=False, with_intensifier=False,
                          modal_negates=False):
    """Scan the children of *token* for a subject and a sentiment modifier.

    Shared by rules 2, 3, 4 and 7. When several children match, the last one
    wins for the subject, the modifier and the negation prefix alike.
    """
    aspect = modifier = neg_prefix = None
    for child in token.children:
        dep = child.dep_
        if dep in subject_deps and not child.is_stop:
            aspect = child.text
        if (dep == modifier_dep and not child.is_stop
                and (not adjective_only or child.pos_ == "ADJ")):
            modifier = child.text
            if with_intensifier:
                adverb = _first_advmod_text(child)
                if adverb is not None:
                    modifier = adverb + " " + modifier
        # A modal auxiliary is read as an implicit negation:
        # 'this could have been better' -> (this, not better).
        if modal_negates and dep == "aux" and child.tag_ == "MD":
            neg_prefix = "not"
        if dep == "neg":
            neg_prefix = child.text
    if modifier is not None and neg_prefix is not None:
        modifier = neg_prefix + " " + modifier
    return aspect, modifier


def _rule_copula(token):
    """Rule 5: *token* is the opinion when it has an ``nsubj`` and a ``cop`` child."""
    aspect = None
    has_copula = False
    for child in token.children:
        if child.is_stop:
            continue
        if child.dep_ == "nsubj":
            aspect = child.text
        if child.dep_ == "cop":
            has_copula = True
    return aspect, (token.text if has_copula else None)


def _rule_interjection(token):
    """Rule 6: *token* is an interjection (e.g. 'ok') with an ``nsubj`` child."""
    if token.pos_ != "INTJ" or token.is_stop:
        return None, None
    aspect = None
    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text
    return aspect, token.text


def extract_doc_aspects(doc):
    """Extract aspect/opinion pairs from *doc* with seven dependency rules.

    *doc* is iterated token by token; each token must expose ``text``,
    ``dep_``, ``pos_``, ``tag_``, ``is_stop``, ``head`` and ``children``
    (presumably a parsed spaCy ``Doc`` -- confirm against the caller).
    Tokens whose text is exactly ``'product'`` are skipped.

    Rules (M = sentiment modifier, A = aspect):
      1. M is an ``amod`` child of A.
      2. A is ``nsubj`` and M is an adjectival ``dobj`` of the same head.
      3. A is ``nsubj`` and M is ``acomp`` of the same head.
      4. A is ``nsubjpass``/``nsubj`` and M is ``advmod`` of the same head.
      5. A is ``nsubj`` of M, and M has a ``cop`` child.
      6. M is an interjection with A as its ``nsubj``.
      7. A is ``nsubj`` and M is ``attr`` of the same head
         (e.g. 'this is garbage').

    Returns:
        A list of dicts ``{"noun": A, "adj": M, "rule": n}``, grouped by rule
        number (1 to 7) and, within a rule, in token order. An aspect equal
        to one of 'it', 'this', 'they', 'these' is reported as ``"product"``.
    """
    prod_pronouns = ('it', 'this', 'they', 'these')
    # One bucket per rule so the output keeps its rule-by-rule grouping.
    pairs_by_rule = {rule: [] for rule in range(1, 8)}

    def record(rule, aspect, modifier):
        # None marks "not found"; unlike a magic string it can never collide
        # with the text of a real token.
        if aspect is None or modifier is None:
            return
        if aspect in prod_pronouns:
            aspect = "product"
        pairs_by_rule[rule].append({"noun": aspect, "adj": modifier, "rule": rule})

    for token in doc:
        if token.text == 'product':
            continue
        record(1, *_rule_amod(token))
        record(2, *_subject_and_modifier(token, "dobj", adjective_only=True))
        record(3, *_subject_and_modifier(token, "acomp", modal_negates=True))
        record(4, *_subject_and_modifier(token, "advmod",
                                         subject_deps=("nsubjpass", "nsubj"),
                                         with_intensifier=True))
        record(5, *_rule_copula(token))
        record(6, *_rule_interjection(token))
        record(7, *_subject_and_modifier(token, "attr"))

    aspects = []
    for rule in range(1, 8):
        aspects.extend(pairs_by_rule[rule])
    return aspects
def extract_aspects(nlp, reviews):
    """Extract aspect/opinion pairs for every review text.

    Args:
        nlp: object providing ``nlp.pipe(pairs, as_tuples=True)`` and
            ``nlp(text)`` (presumably a spaCy ``Language`` pipeline --
            confirm against the caller).
        reviews: object where ``reviews['text_cleaned']`` is a pandas Series
            of review texts; its index labels become the ``review_id``.

    Returns:
        A DataFrame with columns ``review_id``, ``aspect``, ``opinion`` and
        ``rule``. Aspects whose noun starts with 'product' (case-insensitive)
        and aspects whose noun fails ``has_vectors`` are left out.
    """
    # Series.items() pairs each label with its text directly, so this does
    # not depend on the index flattening to exactly one column.
    data = [
        (text, review_id)
        for review_id, text in reviews['text_cleaned'].items()
    ]

    # The same noun recurs across many reviews; run the pipeline on each
    # distinct noun only once.
    noun_in_vocabulary = {}
    aspects = []
    for doc, review_id in nlp.pipe(data, as_tuples=True):
        for aspect in extract_doc_aspects(doc):
            noun = aspect['noun']
            if noun.lower().startswith('product'):
                continue
            # Filter aspects with out-of-vocabulary nouns.
            if noun not in noun_in_vocabulary:
                noun_in_vocabulary[noun] = has_vectors(nlp(noun))
            if noun_in_vocabulary[noun]:
                aspects.append([review_id, noun, aspect['adj'], aspect['rule']])

    return pd.DataFrame(aspects, columns=['review_id', 'aspect', 'opinion', 'rule'])