Spaces:

analytics-jiten
/

reviews-insights

Sleeping

App Files Files Community

analytics-jiten committed on Jan 4, 2024

Commit

60ded96

1 Parent(s): 01ad901

Create aspects_extraction.py

Browse files

Files changed (1) hide show

aspects_extraction.py +261 -0

aspects_extraction.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import pandas as pd
+import numpy as np
def has_vectors(doc):
    """Tell whether every token in ``doc`` reports having a word vector.

    The per-token ``has_vector`` flags are collected and reduced with
    ``np.all``, so a ``doc`` that yields no tokens produces a true value.
    """
    vector_flags = []
    for token in doc:
        vector_flags.append(token.has_vector)
    return np.all(vector_flags)
# Pronouns that are treated as referring to the reviewed product itself.
_PRODUCT_PRONOUNS = frozenset(("it", "this", "they", "these"))


def _make_pair(noun, modifier, rule):
    """Build one aspect record, mapping product pronouns to ``"product"``."""
    if noun in _PRODUCT_PRONOUNS:
        noun = "product"
    return {"noun": noun, "adj": modifier, "rule": rule}


def _first_child_with_dep(children, dep):
    """Return the first child whose dependency label is ``dep``, else ``None``."""
    for child in children:
        if child.dep_ == dep:
            return child
    return None


def _with_prefix(prefix, text):
    """Prepend ``prefix`` and a space to ``text``; ``None`` means no prefix."""
    if prefix is None:
        return text
    return prefix + " " + text


def _scan_children(children, modifier_dep, subject_deps=("nsubj",),
                   modifier_pos=None, modal_negates=False):
    """Pick the subject, modifier and negation among a token's children.

    Args:
        children: the children of the token being examined.
        modifier_dep: dependency label the sentiment modifier must carry
            (``None`` matches nothing).
        subject_deps: dependency labels accepted for the aspect (subject).
        modifier_pos: when given, the modifier must also have this ``pos_``.
        modal_negates: when true, an ``aux`` child tagged ``MD`` sets the
            negation prefix to ``"not"``.

    Returns:
        ``(subject, modifier, negation)``. ``subject`` and ``modifier`` are
        the LAST matching non-stop-word children (or ``None``); ``negation``
        is the prefix text set by the last ``neg``/modal child (or ``None``).
    """
    subject = modifier = negation = None
    for child in children:
        dep = child.dep_
        if dep in subject_deps and not child.is_stop:
            subject = child
        if (dep == modifier_dep
                and (modifier_pos is None or child.pos_ == modifier_pos)
                and not child.is_stop):
            modifier = child
        if modal_negates and dep == "aux" and child.tag_ == "MD":
            negation = "not"
        if dep == "neg":
            negation = child.text
    return subject, modifier, negation


def extract_doc_aspects(doc):
    """Extract (aspect, opinion) pairs from a parsed document.

    Each token is tested against seven dependency-parse rules, where
    A is the aspect and M the sentiment modifier:

    1. M is an ``amod`` child of A (e.g. 'most comfortable headphones').
    2. A is ``nsubj`` and M an adjectival ``dobj`` of the same token.
    3. A is ``nsubj`` and M an ``acomp`` of the same token; a modal
       auxiliary negates M (e.g. 'could have been better' -> 'not better').
    4. A is ``nsubj``/``nsubjpass`` and M an ``advmod`` of the same token.
    5. A is ``nsubj`` of the token and the token has a ``cop`` child;
       the token itself is M.
    6. The token is an interjection (``INTJ``) with an ``nsubj`` child A;
       the token itself is M (e.g. 'It ok').
    7. A is ``nsubj`` and M an ``attr`` of the same token.

    Tokens whose text is exactly ``'product'`` are skipped. Aspects that are
    one of 'it', 'this', 'they', 'these' are rewritten to ``'product'``.

    Args:
        doc: iterable of tokens exposing ``text``, ``dep_``, ``pos_``,
            ``tag_``, ``is_stop``, ``head`` and ``children``.

    Returns:
        A list of dicts with keys ``"noun"``, ``"adj"`` and ``"rule"``,
        grouped by rule number (1..7) and, within a rule, in token order.
    """
    # One bucket per rule so the result stays grouped by rule number.
    pairs_by_rule = [[] for _ in range(7)]

    def emit(rule, noun, modifier):
        pairs_by_rule[rule - 1].append(_make_pair(noun, modifier, rule))

    for token in doc:
        if token.text == 'product':
            continue

        # Materialise once: the children are scanned by several rules.
        children = list(token.children)

        # Rule 1: adjectival modifier of a noun.
        if token.dep_ == "amod" and not token.is_stop:
            # Keep an adverbial intensifier of the adjective ('most comfortable').
            intensifier = _first_child_with_dep(children, "advmod")
            modifier = _with_prefix(
                None if intensifier is None else intensifier.text, token.text)
            # 'no' is a determiner of the noun ('no interesting characters').
            if any(sibling.dep_ == "det" and sibling.text == 'no'
                   for sibling in token.head.children):
                modifier = "not " + modifier
            emit(1, token.head.text, modifier)

        # Rule 2: subject + adjectival direct object.
        subject, modifier, negation = _scan_children(
            children, "dobj", modifier_pos="ADJ")
        if subject is not None and modifier is not None:
            emit(2, subject.text, _with_prefix(negation, modifier.text))

        # Rule 3: subject + adjectival complement; modal auxiliaries negate.
        subject, modifier, negation = _scan_children(
            children, "acomp", modal_negates=True)
        if subject is not None and modifier is not None:
            emit(3, subject.text, _with_prefix(negation, modifier.text))

        # Rule 4: (passive) subject + adverbial modifier of the verb.
        subject, modifier, negation = _scan_children(
            children, "advmod", subject_deps=("nsubj", "nsubjpass"))
        if subject is not None and modifier is not None:
            # Keep an adverb modifying the adverb itself.
            inner = _first_child_with_dep(modifier.children, "advmod")
            text = _with_prefix(None if inner is None else inner.text,
                                modifier.text)
            emit(4, subject.text, _with_prefix(negation, text))

        # Rule 5: token is the complement of a copular verb.
        subject, copula, _ = _scan_children(children, "cop")
        if subject is not None and copula is not None:
            emit(5, subject.text, token.text)

        # Rule 6: interjection with a subject.
        if token.pos_ == "INTJ" and not token.is_stop:
            subject, _, _ = _scan_children(children, None)
            if subject is not None:
                emit(6, subject.text, token.text)

        # Rule 7: subject + attribute of a verb like be/seem/appear.
        subject, modifier, negation = _scan_children(children, "attr")
        if subject is not None and modifier is not None:
            emit(7, subject.text, _with_prefix(negation, modifier.text))

    return [pair for bucket in pairs_by_rule for pair in bucket]
def extract_aspects(nlp, reviews):
    """Run aspect extraction over every review and tabulate the results.

    Args:
        nlp: language pipeline; it is called as ``nlp.pipe(data,
            as_tuples=True)`` on ``(text, review_id)`` tuples and as
            ``nlp(noun)`` on individual aspect nouns.
        reviews: object whose ``'text_cleaned'`` entry is a pandas Series of
            review texts; the Series index is used as the review id.

    Returns:
        A DataFrame with columns ``review_id``, ``aspect``, ``opinion`` and
        ``rule``. Aspects whose noun starts with 'product' (case-insensitive)
        or whose noun fails ``has_vectors`` are left out.
    """
    # (text, context) tuples are the shape nlp.pipe(..., as_tuples=True) takes.
    # Series.items() pairs each text with its index label directly, so the
    # text is never confused with an extra index level.
    data = [
        (text, review_id)
        for review_id, text in reviews['text_cleaned'].items()
    ]

    # noun -> vocabulary verdict; avoids re-running the pipeline for a noun
    # that has already been checked.
    in_vocabulary = {}

    rows = []
    for doc, review_id in nlp.pipe(data, as_tuples=True):
        for aspect in extract_doc_aspects(doc):
            noun = aspect['noun']
            if noun.lower().startswith('product'):
                continue
            # filter aspects with out-of-vocabulary nouns
            if noun not in in_vocabulary:
                in_vocabulary[noun] = has_vectors(nlp(noun))
            if in_vocabulary[noun]:
                rows.append([review_id, noun, aspect['adj'], aspect['rule']])

    return pd.DataFrame(rows, columns=['review_id', 'aspect', 'opinion', 'rule'])