import pandas as pd
import numpy as np

# Subject words that stand in for the reviewed product itself; an aspect
# noun equal to one of these is reported as the literal noun "product".
_PRODUCT_PRONOUNS = frozenset(('it', 'this', 'they', 'these'))

# Dependency labels accepted as the aspect (subject) by each child-scan rule.
_ACTIVE_SUBJECT = ("nsubj",)
_ANY_SUBJECT = ("nsubjpass", "nsubj")


def has_vectors(doc):
    """Return whether every token in *doc* reports ``has_vector``.

    Args:
        doc: Iterable of token-like objects exposing a ``has_vector``
            attribute.

    Returns:
        The result of ``np.all`` over the per-token flags. An empty *doc*
        therefore yields a true value.
    """
    return np.all([token.has_vector for token in doc])


def _make_pair(aspect, opinion, rule):
    """Build one result record, mapping product pronouns to "product"."""
    if aspect in _PRODUCT_PRONOUNS:
        aspect = "product"
    return {"noun": aspect, "adj": opinion, "rule": rule}


def _adjective_object(child):
    """Rule 2 modifier: a non-stop direct object tagged as an adjective."""
    if child.dep_ == "dobj" and child.pos_ == "ADJ" and not child.is_stop:
        return child.text
    return None


def _adjectival_complement(child):
    """Rule 3 modifier: a non-stop adjectival complement."""
    if child.dep_ == "acomp" and not child.is_stop:
        return child.text
    return None


def _adverbial_modifier(child):
    """Rule 4 modifier: a non-stop adverb, prefixed by its own first adverb."""
    if child.dep_ != "advmod" or child.is_stop:
        return None
    for grandchild in child.children:
        if grandchild.dep_ == "advmod":
            return grandchild.text + " " + child.text
    return child.text


def _attribute(child):
    """Rule 7 modifier: a non-stop attribute (complement of be/seem/...)."""
    if child.dep_ == "attr" and not child.is_stop:
        return child.text
    return None


def _scan_children(children, subject_deps, find_opinion, modal_negates=False):
    """Find an (aspect, opinion) pair among the children of one token.

    Shared by rules 2, 3, 4 and 7. The last matching child wins for each of
    the aspect, the opinion and the negation prefix.

    Args:
        children: The token's children, already materialised.
        subject_deps: Dependency labels that mark the aspect.
        find_opinion: Callable returning the opinion text for a child, or
            ``None`` when the child is not a modifier for this rule.
        modal_negates: When true, a modal auxiliary (``aux`` tagged ``MD``)
            sets the prefix "not", e.g. 'this could have been better' ->
            (this, not better).

    Returns:
        ``(aspect, opinion)`` or ``None`` when either part is missing.
    """
    aspect = None
    opinion = None
    neg_prefix = None
    for child in children:
        if child.dep_ in subject_deps and not child.is_stop:
            aspect = child.text
        found = find_opinion(child)
        if found is not None:
            opinion = found
        if modal_negates and child.dep_ == "aux" and child.tag_ == "MD":
            neg_prefix = "not"
        if child.dep_ == "neg":
            neg_prefix = child.text

    if aspect is None or opinion is None:
        return None
    if neg_prefix is not None:
        opinion = neg_prefix + " " + opinion
    return aspect, opinion


def _amod_pair(token, children):
    """Rule 1: the token is an ``amod`` modifier of its head (the aspect)."""
    if token.dep_ != "amod" or token.is_stop:
        return None
    opinion = token.text
    aspect = token.head.text

    # Adverbial modifier of the adjective (e.g. 'most comfortable headphones').
    for child in children:
        if child.dep_ == "advmod":
            opinion = child.text + " " + opinion
            break

    # Negation: the "no" keyword is a 'det' of the noun
    # (e.g. 'no interesting characters').
    for sibling in token.head.children:
        if sibling.dep_ == "det" and sibling.text == 'no':
            opinion = "not " + opinion
            break

    return aspect, opinion


def _copula_pair(token, children):
    """Rule 5: the token has a non-stop ``nsubj`` and a non-stop ``cop`` child."""
    aspect = None
    has_copula = False
    for child in children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text
        if child.dep_ == "cop" and not child.is_stop:
            has_copula = True
    if aspect is None or not has_copula:
        return None
    return aspect, token.text


def _interjection_pair(token, children):
    """Rule 6: the token is a non-stop interjection with an ``nsubj`` child.

    Example: in "It ok", "ok" is tagged INTJ.
    """
    if token.pos_ != "INTJ" or token.is_stop:
        return None
    aspect = None
    for child in children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text
    if aspect is None:
        return None
    return aspect, token.text


def extract_doc_aspects(doc):
    """Extract (aspect, opinion) pairs from a parsed document.

    Seven dependency-parse rules are tried on every token except those whose
    text is exactly 'product'. In each rule A is the aspect and M is the
    sentiment modifier:

    1. M is an ``amod`` child of A.
    2. A is ``nsubj`` and M is an adjective ``dobj`` of the same token.
    3. A is ``nsubj`` and M is ``acomp`` of the same token; a modal
       auxiliary counts as a negation.
    4. A is ``nsubjpass``/``nsubj`` and M is ``advmod`` of the same token.
    5. A is ``nsubj`` of M, and M also has a ``cop`` child.
    6. M is an interjection with A as its ``nsubj``.
    7. A is ``nsubj`` and M is ``attr`` of the same token
       ('this is garbage' -> (this, garbage)).

    Args:
        doc: Iterable of token-like objects exposing ``text``, ``dep_``,
            ``pos_``, ``tag_``, ``is_stop``, ``head`` and ``children``
            (presumably a spaCy ``Doc``).

    Returns:
        A list of dicts with keys ``"noun"``, ``"adj"`` and ``"rule"``. All
        rule 1 pairs come first, then rule 2, and so on; within a rule the
        pairs follow token order.
    """
    pairs_by_rule = {rule: [] for rule in range(1, 8)}

    for token in doc:
        if token.text == 'product':
            continue

        # Read the children once; every rule below walks the same sequence.
        children = list(token.children)

        candidates = (
            (1, _amod_pair(token, children)),
            (2, _scan_children(children, _ACTIVE_SUBJECT, _adjective_object)),
            (3, _scan_children(children, _ACTIVE_SUBJECT,
                               _adjectival_complement, modal_negates=True)),
            (4, _scan_children(children, _ANY_SUBJECT, _adverbial_modifier)),
            (5, _copula_pair(token, children)),
            (6, _interjection_pair(token, children)),
            (7, _scan_children(children, _ACTIVE_SUBJECT, _attribute)),
        )
        for rule, found in candidates:
            if found is not None:
                aspect, opinion = found
                pairs_by_rule[rule].append(_make_pair(aspect, opinion, rule))

    return [pair for rule in range(1, 8) for pair in pairs_by_rule[rule]]


def extract_aspects(nlp, reviews):
    """Extract aspect/opinion pairs for every review into a DataFrame.

    Args:
        nlp: Callable pipeline that also provides
            ``pipe(data, as_tuples=True)`` (presumably a spaCy ``Language``).
        reviews: DataFrame with a ``'text_cleaned'`` column; its index values
            are used as review ids.

    Returns:
        DataFrame with columns ``review_id``, ``aspect``, ``opinion`` and
        ``rule``. Aspects whose noun starts with "product" (case-insensitive)
        are dropped, as are aspects whose noun fails ``has_vectors``.
    """
    # Pair each text with its index value so the id travels through the pipe.
    data = [
        (row[1], row[0])
        for row in reviews['text_cleaned'].reset_index().to_numpy()
    ]

    # The vector check depends only on the noun text, so run it once per noun.
    noun_has_vectors = {}
    rows = []
    for doc, review_id in nlp.pipe(data, as_tuples=True):
        for aspect in extract_doc_aspects(doc):
            noun = aspect['noun']
            if noun.lower().startswith('product'):
                continue
            # Filter aspects with out-of-vocabulary nouns.
            if noun not in noun_has_vectors:
                noun_has_vectors[noun] = has_vectors(nlp(noun))
            if noun_has_vectors[noun]:
                rows.append([review_id, noun, aspect['adj'], aspect['rule']])

    return pd.DataFrame(rows, columns=['review_id', 'aspect', 'opinion', 'rule'])