awacke1's picture
Update app.py
072885d
raw
history blame
1.7 kB
import streamlit as st
import pandas as pd
import bertopic
import plotly.express as px
# Set the page title through Streamlit's page configuration.
st.set_page_config(page_title="Topic Modeling with Bertopic")
def read_file(file):
    """Load an uploaded TXT or CSV file into a pandas DataFrame.

    Args:
        file: The object returned by ``st.file_uploader`` (a ``BytesIO``-like
            upload exposing ``type`` and, usually, ``name``), or ``None``
            when nothing has been uploaded yet.

    Returns:
        For TXT uploads, a DataFrame with a single ``data`` column holding one
        non-blank line per row. For CSV uploads, the DataFrame produced by
        ``pd.read_csv`` (columns come from the file's header row). ``None``
        when no file was supplied or the format is unsupported; in both cases
        an error is shown in the app.
    """
    if file is None:
        st.error("No file uploaded. Please upload a TXT or CSV file.")
        return None

    # Browsers do not always report the expected MIME type (CSV files are
    # sometimes sent with a spreadsheet MIME type), so also accept the file
    # name extension when one is available.
    filename = (getattr(file, 'name', '') or '').lower()
    is_text = file.type == 'text/plain' or filename.endswith('.txt')
    is_csv = file.type == 'text/csv' or filename.endswith('.csv')

    if is_text:
        # Treat every line as one document. Parsing plain text with
        # pd.read_csv would split lines on commas and corrupt the 'data'
        # column, so the lines are read directly instead.
        text = file.getvalue().decode('utf-8-sig')
        documents = [line for line in text.splitlines() if line.strip()]
        return pd.DataFrame({'data': documents})

    if is_csv:
        return pd.read_csv(file)

    st.error("Unsupported file format. Please upload a TXT or CSV file.")
    return None
# Build the sidebar: a heading followed by the TXT/CSV upload widget.
# `file` stays None until the user has picked a file.
with st.sidebar:
    st.title("Upload File")
    file = st.file_uploader("Choose a file", type=["txt", "csv"])
# Perform topic modeling when the user clicks the "Visualize" button
if st.sidebar.button("Visualize"):
    # Nothing to analyse until a file has been uploaded.
    if file is None:
        st.error("Please upload a TXT or CSV file first.")
        st.stop()

    # Read the uploaded file; read_file reports unsupported formats itself.
    df = read_file(file)
    if df is None:
        st.stop()

    # CSV uploads keep their own header, so the expected column may be absent.
    if 'data' not in df.columns:
        st.error("The uploaded file must contain a 'data' column.")
        st.stop()

    # BERTopic works on a list of strings: drop missing cells and coerce the
    # remaining values to text.
    docs = df['data'].dropna().astype(str).tolist()
    if not docs:
        st.error("The uploaded file does not contain any text to analyze.")
        st.stop()

    # Perform topic modeling using BERTopic (the class name is `BERTopic`).
    model = bertopic.BERTopic()
    topics, _ = model.fit_transform(docs)

    # Topic ids actually assigned to documents; -1 is BERTopic's outlier topic.
    topic_ids = sorted(set(topics))

    # Create a plot of the topic distribution. The bin count spans the full
    # id range (including -1) so it is never zero.
    fig = px.histogram(
        x=topics,
        nbins=topic_ids[-1] - topic_ids[0] + 1,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title="Distribution of Topics",
        xaxis_title="Topic",
        yaxis_title="Count",
    )
    st.plotly_chart(fig)

    # Display the top words in each topic (the outlier topic is skipped).
    st.write("Top words in each topic:")
    for topic_id in topic_ids:
        if topic_id == -1:
            continue
        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")

    # Display the clusters: group each document under the topic it was
    # assigned to, since `topics` is aligned with `docs`.
    clusters = {}
    for topic_id, doc in zip(topics, docs):
        clusters.setdefault(topic_id, []).append(doc)

    st.write("Clusters:")
    for cluster_id in sorted(clusters):
        st.write(f"Cluster {cluster_id}:")
        for doc in clusters[cluster_id]:
            st.write(f"\t{doc}")