awacke1 committed on
Commit 5554539 · 1 Parent(s): 072885d

Update app.py

Files changed (1)
  1. app.py +46 -47
app.py CHANGED
@@ -5,50 +5,49 @@ import plotly.express as px
 
 st.set_page_config(page_title="Topic Modeling with Bertopic")
 
-# Function to read the uploaded file and return a Pandas DataFrame
-def read_file(file):
-    if file.type == 'text/plain':
-        df = pd.read_csv(file, header=None, names=['data'])
-    elif file.type == 'text/csv':
-        df = pd.read_csv(file)
-    else:
-        st.error("Unsupported file format. Please upload a TXT or CSV file.")
-        return None
-    return df
-
-# Sidebar to upload the file
-st.sidebar.title("Upload File")
-file = st.sidebar.file_uploader("Choose a file", type=["txt", "csv"])
-
-# Perform topic modeling when the user clicks the "Visualize" button
-if st.sidebar.button("Visualize"):
-
-    # Read the uploaded file
-    df = read_file(file)
-    if df is None:
-        st.stop()
-
-    # Perform topic modeling using Bertopic
-    model = bertopic.Bertopic()
-    topics, probabilities = model.fit_transform(df['data'])
-
-    # Create a plot of the topic distribution
-    fig = px.histogram(x=topics, nbins=max(topics)+1, color_discrete_sequence=px.colors.qualitative.Pastel)
-    fig.update_layout(
-        title="Distribution of Topics",
-        xaxis_title="Topic",
-        yaxis_title="Count",
-    )
-    st.plotly_chart(fig)
-
-    # Display the top words in each topic
-    st.write("Top words in each topic:")
-    for topic_id in range(max(topics)+1):
-        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")
-
-    # Display the clusters
-    st.write("Clusters:")
-    for cluster_id, docs in model.get_clusters().items():
-        st.write(f"Cluster {cluster_id}:")
-        for doc in docs:
-            st.write(f"\t{doc}")
+from datasets import load_dataset
+
+st.markdown("""
+https://github.com/pinecone-io/examples/tree/master/learn/algos-and-libraries/bertopic
+""")
+
+data = load_dataset('jamescalam/python-reddit')
+data = data.filter(
+    lambda x: True if len(x['selftext']) > 30 else 0
+)
+from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
+
+# we add this to remove stopwords
+vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
+
+model = BERTopic(
+    vectorizer_model=vectorizer_model,
+    language='english', calculate_probabilities=True,
+    verbose=True
+)
+topics, probs = model.fit_transform(text)
+freq = model.get_topic_info()
+freq.head(10)
+
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+model
+
+import numpy as np
+from tqdm.auto import tqdm
+
+batch_size = 16
+
+embeds = np.zeros((n, model.get_sentence_embedding_dimension()))
+
+for i in tqdm(range(0, n, batch_size)):
+    i_end = min(i+batch_size, n)
+    batch = data['selftext'][i:i_end]
+    batch_embed = model.encode(batch)
+    embeds[i:i_end,:] = batch_embed
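As written, the added hunk leaves a few loose ends: `text` (the documents passed to fit_transform) and `n` (the row count used to size `embeds`) are never defined in this diff, and the precomputed sentence embeddings are never handed back to BERTopic or rendered on the Streamlit page. Below is a minimal sketch of one way to close those gaps, assuming the filtered dataset exposes a 'train' split; BERTopic's optional `embeddings=` argument to fit_transform, `visualize_barchart`, `st.dataframe`, and `st.plotly_chart` are standard BERTopic/Streamlit calls, not part of this commit.

    import numpy as np
    import streamlit as st
    from datasets import load_dataset
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from tqdm.auto import tqdm

    st.set_page_config(page_title="Topic Modeling with Bertopic")

    # Load and filter the Reddit dataset used in the diff above.
    data = load_dataset('jamescalam/python-reddit')
    data = data.filter(lambda x: len(x['selftext']) > 30)
    text = data['train']['selftext']  # the `text` the diff feeds to fit_transform (assumed 'train' split)
    n = len(text)                     # the `n` the diff uses to size the embedding array

    # Encode the documents in batches with the same sentence-transformer.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeds = np.zeros((n, embedder.get_sentence_embedding_dimension()))
    batch_size = 16
    for i in tqdm(range(0, n, batch_size)):
        i_end = min(i + batch_size, n)
        embeds[i:i_end, :] = embedder.encode(text[i:i_end])

    # Reuse the precomputed embeddings so the encoder only runs once.
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        language='english', calculate_probabilities=True, verbose=True,
    )
    topics, probs = topic_model.fit_transform(text, embeddings=embeds)

    # Surface the results in the Streamlit page.
    st.dataframe(topic_model.get_topic_info().head(10))
    st.plotly_chart(topic_model.visualize_barchart())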