Spaces:
Runtime error
Runtime error
# Streamlit app: topic modelling of Reddit posts with BERTopic, followed by
# sentence-embedding of the same posts with a SentenceTransformer model.
import bertopic
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from bertopic import BERTopic
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

st.set_page_config(page_title="Topic Modeling with Bertopic")

st.markdown("""
https://github.com/pinecone-io/examples/tree/master/learn/algos-and-libraries/bertopic
""")

# Request a single split so `data` is a Dataset whose columns can be indexed
# by name; without `split`, load_dataset returns a DatasetDict keyed by split
# name and data['selftext'] would not address a column.
# NOTE(review): assumes the dataset exposes a 'train' split -- confirm.
data = load_dataset('jamescalam/python-reddit', split='train')

# Keep only posts whose body is longer than 30 characters.
data = data.filter(lambda x: len(x['selftext']) > 30)

# Read the text column once; it feeds both the topic model and the
# embedding loop below (previously `text` and `n` were never defined, which
# raised NameError).
text = data['selftext']
n = len(text)

# we add this to remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    verbose=True
)
topics, probs = model.fit_transform(text)
freq = model.get_topic_info()
# Render explicitly instead of relying on a bare expression statement.
st.write(freq.head(10))

# NOTE: `model` is rebound here from the BERTopic instance to the sentence
# embedding model; the name is kept as in the original script.
model = SentenceTransformer('all-MiniLM-L6-v2')
st.write(model)

batch_size = 16
# Pre-allocate one row per post and fill it batch by batch.
embeds = np.zeros((n, model.get_sentence_embedding_dimension()))
for i in tqdm(range(0, n, batch_size)):
    # Clamp the end index so the final, possibly shorter, batch stays in range.
    i_end = min(i + batch_size, n)
    batch = text[i:i_end]
    batch_embed = model.encode(batch)
    embeds[i:i_end, :] = batch_embed