"""Streamlit app: upload a TXT or CSV file and explore its topics with BERTopic."""

from collections import defaultdict

import bertopic
import pandas as pd
import plotly.express as px
import streamlit as st

st.set_page_config(page_title="Topic Modeling with Bertopic")


def read_file(file):
    """Read an uploaded TXT or CSV file into a pandas DataFrame.

    Args:
        file: File-like upload object (as returned by
            ``st.file_uploader``) exposing ``type`` and, optionally,
            ``name``; may be ``None`` when nothing was uploaded.

    Returns:
        For TXT input, a DataFrame with a single ``data`` column holding one
        non-empty line per row. For CSV input, the parsed DataFrame as-is.
        ``None`` (after showing an error in the UI) when no file was given,
        the format is unsupported, or the CSV cannot be parsed.
    """
    if file is None:
        st.error("No file uploaded. Please upload a TXT or CSV file.")
        return None

    # The MIME type is supplied by the browser and is not always reliable,
    # so the file extension is accepted as a fallback.
    filename = (getattr(file, "name", "") or "").lower()
    is_txt = file.type == "text/plain" or filename.endswith(".txt")
    is_csv = file.type == "text/csv" or filename.endswith(".csv")

    if is_txt:
        # Read line by line instead of through the CSV parser: commas and
        # quotes are ordinary characters in free text and must not split or
        # mangle a document.
        raw = file.read()
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8", errors="replace")
        lines = [line.strip() for line in raw.splitlines()]
        return pd.DataFrame({"data": [line for line in lines if line]})

    if is_csv:
        try:
            return pd.read_csv(file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError) as err:
            st.error(f"Could not parse the CSV file: {err}")
            return None

    st.error("Unsupported file format. Please upload a TXT or CSV file.")
    return None


# Sidebar to upload the file
st.sidebar.title("Upload File")
file = st.sidebar.file_uploader("Choose a file", type=["txt", "csv"])

# Perform topic modeling when the user clicks the "Visualize" button
if st.sidebar.button("Visualize"):
    # Read the uploaded file
    df = read_file(file)
    if df is None:
        st.stop()

    if "data" not in df.columns:
        st.error("The uploaded CSV file must contain a 'data' column.")
        st.stop()

    # BERTopic expects a list of strings; drop missing values and coerce the
    # rest so numeric or mixed columns do not break the model.
    docs = df["data"].dropna().astype(str).tolist()
    if not docs:
        st.error("The uploaded file does not contain any documents.")
        st.stop()

    # Perform topic modeling using Bertopic
    model = bertopic.BERTopic()
    topics, probabilities = model.fit_transform(docs)

    # Create a plot of the topic distribution. The bin count is the number of
    # distinct topic ids, which also accounts for the -1 outlier topic.
    fig = px.histogram(
        x=topics,
        nbins=max(len(set(topics)), 1),
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title="Distribution of Topics",
        xaxis_title="Topic",
        yaxis_title="Count",
    )
    st.plotly_chart(fig)

    # Display the top words in each topic (the -1 outlier topic is skipped)
    st.write("Top words in each topic:")
    for topic_id in sorted(t for t in set(topics) if t != -1):
        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")

    # Display the clusters: group every document under its assigned topic id
    clusters = defaultdict(list)
    for topic_id, doc in zip(topics, docs):
        clusters[topic_id].append(doc)

    st.write("Clusters:")
    for cluster_id in sorted(clusters):
        st.write(f"Cluster {cluster_id}:")
        for doc in clusters[cluster_id]:
            st.write(f"\t{doc}")