Spaces:
Runtime error
Runtime error
File size: 1,327 Bytes
c6b92c7 072885d 5554539 b827309 5554539 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# Streamlit demo script: fits a BERTopic model on one text column of a
# Hugging Face dataset, shows the most frequent topics, then computes
# sentence embeddings for the same documents in fixed-size batches.
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
import bertopic
from bertopic import BERTopic
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

# Kept ahead of every other Streamlit call, as in the original script.
st.set_page_config(page_title="Topic Modeling with Bertopic")

st.markdown("""
https://github.com/pinecone-io/examples/tree/master/learn/algos-and-libraries/bertopic
""")

# Column holding the raw documents, and the length a document must exceed
# (in characters) to be kept.
TEXT_COLUMN = "selftext"
MIN_TEXT_CHARS = 30

# An earlier revision loaded 'jamescalam/python-reddit' instead.
# NOTE(review): confirm that the dataset below really exposes TEXT_COLUMN;
# the guard underneath stops the app with a readable message if it does not.
# split="train" yields a single Dataset rather than a DatasetDict, so that
# data[TEXT_COLUMN] is a column lookup and not a split lookup
# (assumes a "train" split exists -- TODO confirm).
data = load_dataset("awacke1/LOINC-Panels-and-Forms", split="train")

if TEXT_COLUMN not in data.column_names:
    st.error(
        f"Dataset has no {TEXT_COLUMN!r} column; "
        f"available columns: {data.column_names}"
    )
    st.stop()

# Drop documents that are too short to be useful for topic modelling.
data = data.filter(lambda row: len(row[TEXT_COLUMN]) > MIN_TEXT_CHARS)

# Read the column once; both the topic model and the embedding loop reuse it.
text = list(data[TEXT_COLUMN])
n = len(text)

if n == 0:
    st.warning(f"No documents longer than {MIN_TEXT_CHARS} characters were found.")
    st.stop()

# we add this to remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    verbose=True
)
topics, probs = topic_model.fit_transform(text)
freq = topic_model.get_topic_info()
# Render explicitly instead of relying on a bare expression being displayed.
st.write(freq.head(10))

# `model` names the sentence-embedding model from here on, matching the
# binding the original script ended with.
model = SentenceTransformer('all-MiniLM-L6-v2')
st.write(model)

batch_size = 16
# One row per document; filled in batch by batch below.
embeds = np.zeros((n, model.get_sentence_embedding_dimension()))
for i in tqdm(range(0, n, batch_size)):
    # The final batch may be shorter than batch_size.
    i_end = min(i + batch_size, n)
    batch = text[i:i_end]
    batch_embed = model.encode(batch)
    embeds[i:i_end, :] = batch_embed
|