Spaces:

DanielSc4
/

DataAnalyticsNLP

Runtime error

App Files Files Community

DanielSc4 committed on Sep 9, 2023

Commit

5affbbc

1 Parent(s): 9cbeac4

Update w/ LDA

Browse files

Files changed (2) hide show

app.py +136 -11
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -3,8 +3,10 @@ import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.feature_extraction.text import CountVectorizer
 def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
     format_s = "{s}\n{c}"
@@ -12,15 +14,138 @@ def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
         format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
     ]
-def main(button, chose_context):
     df = pd.read_csv('./data/results.csv', index_col=0)
-    print(chose_context)
-    data = concat_comments(df.sup_comment, df.comment)
     subreddits = df.subreddit.value_counts().index[:22]
     weight_counts = {
@@ -55,7 +180,7 @@ def main(button, chose_context):
     ax.legend(loc="upper right")
     plt.xticks(rotation=70)
-    plt.show()
 with gr.Blocks() as demo:
@@ -63,12 +188,12 @@ with gr.Blocks() as demo:
         label="Plot type",
         choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
     )
-    chose_context = gr.Radio(
         label="Context LDA",
-        choices=['comment', 'sup comment', 'sup comment + comment'], value='scatter_plot'
     )
     plot = gr.Plot(label="Plot")
-    button.change(main, inputs=[button, chose_context], outputs=[plot])
     demo.load(main, inputs=[button], outputs=[plot])

 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk, spacy, gensim
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.feature_extraction.text import CountVectorizer
+from pprint import pprint
 def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
     format_s = "{s}\n{c}"
         format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
     ]
+def sent_to_words(sentences):
+    """Lazily tokenize each item of ``sentences``.
+
+    Every item is converted with ``str()`` and passed to
+    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the result of
+    each call is yielded in input order, one per input item.
+    """
+    tokenize = gensim.utils.simple_preprocess
+    for raw_sentence in sentences:
+        tokens = tokenize(str(raw_sentence), deacc=True)
+        yield tokens
+def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp=None):
+    """Lemmatize tokenized documents, keeping only selected parts of speech.
+
+    Args:
+        texts: Iterable of token sequences, one per document. Each sequence
+            is joined with single spaces before being handed to ``nlp``.
+        allowed_postags: Tags compared against ``token.pos_``; only matching
+            tokens are kept.
+        nlp: Callable that maps a string to an iterable of tokens exposing
+            ``lemma_`` and ``pos_``. When omitted, the ``en_core_web_sm``
+            spaCy pipeline is loaded with the parser and NER disabled.
+
+    Returns:
+        A list with one string per input document: the kept lemmas joined
+        by single spaces.
+    """
+    # Resolve the pipeline here instead of relying on a module-level ``nlp``:
+    # no such global exists, so callers that do not pass one would otherwise
+    # hit a NameError.
+    if nlp is None:
+        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+    keep = frozenset(allowed_postags)
+    texts_out = []
+    for sent in texts:
+        doc = nlp(" ".join(sent))
+        # Drop '-PRON-' lemmas entirely rather than emitting empty strings,
+        # which would leave stray spaces in the joined output.
+        texts_out.append(" ".join(
+            token.lemma_ for token in doc
+            if token.pos_ in keep and token.lemma_ != '-PRON-'
+        ))
+    return texts_out
+def main(button, choose_context):
     df = pd.read_csv('./data/results.csv', index_col=0)
+    if choose_context == 'comment':
+        data = df.comment
+    elif choose_context == 'sup comment':
+        data = df.sup_comment
+    elif choose_context == 'sup comment + comment':
+        data = concat_comments(df.sup_comment, df.comment)
+    data_words = list(sent_to_words(data))
+    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+    data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"]) #select noun and verb
+    vectorizer = CountVectorizer(
+    analyzer='word',
+    min_df=10,
+    stop_words='english',
+    lowercase=True,
+    token_pattern='[a-zA-Z0-9]{3,}'
+)
+    data_vectorized = vectorizer.fit_transform(data_lemmatized)
+    lda_model = LatentDirichletAllocation(
+        n_components=5,
+        max_iter=10,
+        learning_method='online',
+        random_state=100,
+        batch_size=128,
+        evaluate_every = -1,
+        n_jobs = -1,
+    )
+    lda_output = lda_model.fit_transform(data_vectorized)
+    print(lda_model)    # Model attributes
+    # Log-likelihood: higher is better
+    print("Log Likelihood: ", lda_model.score(data_vectorized))
+    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
+    print("Perplexity: ", lda_model.perplexity(data_vectorized))
+    # See model parameters
+    pprint(lda_model.get_params())
+    best_lda_model = lda_model
+    lda_output = best_lda_model.transform(data_vectorized)
+    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
+    docnames = ["Doc" + str(i) for i in range(len(data))]
+    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
+    dominant_topic = np.argmax(df_document_topic.values, axis=1)
+    df_document_topic["dominant_topic"] = dominant_topic
+    # Topic-Keyword Matrix
+    df_topic_keywords = pd.DataFrame(best_lda_model.components_)
+    df_topic_keywords
+    # Assign Column and Index
+    df_topic_keywords.columns = vectorizer.get_feature_names_out()
+    df_topic_keywords.index = topicnames
+    # View
+    df_topic_keywords
+    # Show top n keywords for each topic
+    def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
+        keywords = np.array(vectorizer.get_feature_names_out())
+        topic_keywords = []
+        for topic_weights in lda_model.components_:
+            top_keyword_locs = (-topic_weights).argsort()[:n_words]
+            topic_keywords.append(keywords.take(top_keyword_locs))
+        return topic_keywords
+    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
+    # Topic - Keywords Dataframe
+    df_topic_keywords = pd.DataFrame(topic_keywords)
+    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
+    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
+    df_topic_keywords
+    topics = [
+        f'Topic {i}' for i in range(len(df_topic_keywords))
+    ]
+    df_topic_keywords["Topics"] = topics
+    df_topic_keywords
+    # # Define function to predict topic for a given text document.
+    # nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
+    # def predict_topic(text, nlp=nlp):
+    #     global sent_to_words
+    #     global lemmatization
+    #     # Step 1: Clean with simple_preprocess
+    #     mytext_2 = list(sent_to_words(text))
+    #     # Step 2: Lemmatize
+    #     mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
+    #     # Step 3: Vectorize transform
+    #     mytext_4 = vectorizer.transform(mytext_3)
+    #     # Step 4: LDA Transform
+    #     topic_probability_scores = best_lda_model.transform(mytext_4)
+    #     topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
+    #     # Step 5: Infer Topic
+    #     infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
+    #     #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
+    #     return infer_topic, topic, topic_probability_scores
+    # # Predict the topic
+    # mytext = ["This is a test of a random topic where I talk about politics"]
+    # infer_topic, topic, prob_scores = predict_topic(text = mytext)
+    def apply_predict_topic(text):
+        text = [text]
+        infer_topic, topic, prob_scores = predict_topic(text = text)
+        return(infer_topic)
+    df["Topic_key_word"] = df['comment'].apply(apply_predict_topic)
+    # plot
     subreddits = df.subreddit.value_counts().index[:22]
     weight_counts = {
     ax.legend(loc="upper right")
     plt.xticks(rotation=70)
+    return fig
 with gr.Blocks() as demo:
         label="Plot type",
         choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
     )
+    choose_context = gr.Radio(
         label="Context LDA",
+        choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
     )
     plot = gr.Plot(label="Plot")
+    button.change(main, inputs=[button, choose_context], outputs=[plot])
     demo.load(main, inputs=[button], outputs=[plot])

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 nltk
 spacy
 gensim
-sklearn

 nltk
 spacy
 gensim
+scikit-learn