File size: 1,699 Bytes
c6b92c7
072885d
 
 
 
 
 
 
 
 
 
 
 
d8f9678
072885d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import streamlit as st
import pandas as pd
import bertopic
import plotly.express as px

# Configure the Streamlit page; only the page title is set here.
st.set_page_config(page_title="Topic Modeling with Bertopic")

# Function to read the uploaded file and return a Pandas DataFrame
def read_file(file):
    """Load an uploaded TXT or CSV file into a Pandas DataFrame.

    Args:
        file: The uploaded file object, or ``None`` when nothing has been
            uploaded. It is expected to expose ``type`` (the MIME type) and
            ``read()``; ``name`` is consulted as a fallback when the MIME
            type is not one of the two recognised values.

    Returns:
        A DataFrame, or ``None`` after an error has been reported with
        ``st.error``. Plain-text input produces a single ``data`` column
        holding one row per non-blank line; CSV input is whatever
        ``pd.read_csv`` returns for the file.
    """
    if file is None:
        st.error("No file uploaded. Please upload a TXT or CSV file.")
        return None

    mime = getattr(file, "type", "") or ""
    name = (getattr(file, "name", "") or "").lower()

    # Trust the MIME type first; fall back to the extension because the
    # reported MIME type for the same file differs between browsers/OSes.
    if mime == 'text/plain':
        kind = 'txt'
    elif mime == 'text/csv':
        kind = 'csv'
    elif name.endswith('.txt'):
        kind = 'txt'
    elif name.endswith('.csv'):
        kind = 'csv'
    else:
        st.error("Unsupported file format. Please upload a TXT or CSV file.")
        return None

    if kind == 'csv':
        return pd.read_csv(file)

    # Plain text is one document per line. It must not go through the CSV
    # parser: commas or quotes inside a line would split or corrupt it.
    raw = file.read()
    if isinstance(raw, bytes):
        # utf-8-sig also drops a leading byte-order mark if present.
        text = raw.decode('utf-8-sig', errors='replace')
    else:
        text = raw
    lines = [line for line in text.splitlines() if line.strip()]
    return pd.DataFrame({'data': lines})

# Sidebar to upload the file
st.sidebar.title("Upload File")
file = st.sidebar.file_uploader("Choose a file", type=["txt", "csv"])

# Perform topic modeling when the user clicks the "Visualize" button
if st.sidebar.button("Visualize"):

    # The button can be clicked before anything is uploaded; the uploader
    # returns None in that case.
    if file is None:
        st.error("Please upload a TXT or CSV file first.")
        st.stop()

    # Read the uploaded file (read_file reports its own errors)
    df = read_file(file)
    if df is None:
        st.stop()
    if df.empty:
        st.error("The uploaded file does not contain any documents.")
        st.stop()

    # CSV uploads are not guaranteed to have a 'data' column; fall back to
    # the first column rather than failing with a KeyError.
    text_column = 'data' if 'data' in df.columns else df.columns[0]
    docs = df[text_column].dropna().astype(str).tolist()
    if not docs:
        st.error("The uploaded file does not contain any documents.")
        st.stop()

    # Perform topic modeling using BERTopic (the class is spelled BERTopic)
    model = bertopic.BERTopic()
    topics, _ = model.fit_transform(docs)

    # Count documents per topic, including the -1 outlier topic
    topic_counts = pd.Series(topics).value_counts().sort_index()

    # Create a plot of the topic distribution: one bar per topic id
    fig = px.bar(
        x=topic_counts.index.astype(str),
        y=topic_counts.values,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title="Distribution of Topics",
        xaxis_title="Topic",
        yaxis_title="Count",
    )
    # Keep topic ids as discrete labels instead of a numeric axis
    fig.update_xaxes(type="category")
    st.plotly_chart(fig)

    # Display the top words in each topic that was actually assigned
    st.write("Top words in each topic:")
    for topic_id in topic_counts.index:
        topic_id = int(topic_id)
        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")

    # Display the clusters: documents grouped by their assigned topic
    docs_by_topic = {}
    for doc, topic_id in zip(docs, topics):
        docs_by_topic.setdefault(int(topic_id), []).append(doc)

    st.write("Clusters:")
    for cluster_id in sorted(docs_by_topic):
        st.write(f"Cluster {cluster_id}:")
        for doc in docs_by_topic[cluster_id]:
            st.write(f"\t{doc}")