awacke1 committed on
Commit 5554539 · 1 Parent(s): 072885d

Update app.py

Files changed (1)
  1. app.py +46 -47
app.py CHANGED
@@ -5,50 +5,49 @@ import plotly.express as px
 
 st.set_page_config(page_title="Topic Modeling with Bertopic")
 
-# Function to read the uploaded file and return a Pandas DataFrame
-def read_file(file):
-    if file.type == 'text/plain':
-        df = pd.read_csv(file, header=None, names=['data'])
-    elif file.type == 'text/csv':
-        df = pd.read_csv(file)
-    else:
-        st.error("Unsupported file format. Please upload a TXT or CSV file.")
-        return None
-    return df
-
-# Sidebar to upload the file
-st.sidebar.title("Upload File")
-file = st.sidebar.file_uploader("Choose a file", type=["txt", "csv"])
-
-# Perform topic modeling when the user clicks the "Visualize" button
-if st.sidebar.button("Visualize"):
-
-    # Read the uploaded file
-    df = read_file(file)
-    if df is None:
-        st.stop()
-
-    # Perform topic modeling using Bertopic
-    model = bertopic.Bertopic()
-    topics, probabilities = model.fit_transform(df['data'])
-
-    # Create a plot of the topic distribution
-    fig = px.histogram(x=topics, nbins=max(topics)+1, color_discrete_sequence=px.colors.qualitative.Pastel)
-    fig.update_layout(
-        title="Distribution of Topics",
-        xaxis_title="Topic",
-        yaxis_title="Count",
-    )
-    st.plotly_chart(fig)
-
-    # Display the top words in each topic
-    st.write("Top words in each topic:")
-    for topic_id in range(max(topics)+1):
-        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")
-
-    # Display the clusters
-    st.write("Clusters:")
-    for cluster_id, docs in model.get_clusters().items():
-        st.write(f"Cluster {cluster_id}:")
-        for doc in docs:
-            st.write(f"\t{doc}")
+from datasets import load_dataset
+
+st.markdown("""
+https://github.com/pinecone-io/examples/tree/master/learn/algos-and-libraries/bertopic
+""")
+
+data = load_dataset('jamescalam/python-reddit')
+data = data.filter(
+    lambda x: True if len(x['selftext']) > 30 else 0
+)
+from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
+
+# we add this to remove stopwords
+vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
+
+model = BERTopic(
+    vectorizer_model=vectorizer_model,
+    language='english', calculate_probabilities=True,
+    verbose=True
+)
+topics, probs = model.fit_transform(text)
+freq = model.get_topic_info()
+freq.head(10)
+
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+model
+
+import numpy as np
+from tqdm.auto import tqdm
+
+batch_size = 16
+
+embeds = np.zeros((n, model.get_sentence_embedding_dimension()))
+
+for i in tqdm(range(0, n, batch_size)):
+    i_end = min(i+batch_size, n)
+    batch = data['selftext'][i:i_end]
+    batch_embed = model.encode(batch)
+    embeds[i:i_end,:] = batch_embed
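As written, the added hunk leaves a few loose ends: `text` (the documents passed to fit_transform) and `n` (the row count used to size `embeds`) are never defined in this diff, and the precomputed sentence embeddings are never handed back to BERTopic or rendered on the Streamlit page. Below is a minimal sketch of one way to close those gaps, assuming the filtered dataset exposes a 'train' split; BERTopic's optional `embeddings=` argument to fit_transform, `visualize_barchart`, `st.dataframe`, and `st.plotly_chart` are standard BERTopic/Streamlit calls, not part of this commit.

    import numpy as np
    import streamlit as st
    from datasets import load_dataset
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from tqdm.auto import tqdm

    st.set_page_config(page_title="Topic Modeling with Bertopic")

    # Load and filter the Reddit dataset used in the diff above.
    data = load_dataset('jamescalam/python-reddit')
    data = data.filter(lambda x: len(x['selftext']) > 30)
    text = data['train']['selftext']  # the `text` the diff feeds to fit_transform (assumed 'train' split)
    n = len(text)                     # the `n` the diff uses to size the embedding array

    # Encode the documents in batches with the same sentence-transformer.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeds = np.zeros((n, embedder.get_sentence_embedding_dimension()))
    batch_size = 16
    for i in tqdm(range(0, n, batch_size)):
        i_end = min(i + batch_size, n)
        embeds[i:i_end, :] = embedder.encode(text[i:i_end])

    # Reuse the precomputed embeddings so the encoder only runs once.
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        language='english', calculate_probabilities=True, verbose=True,
    )
    topics, probs = topic_model.fit_transform(text, embeddings=embeds)

    # Surface the results in the Streamlit page.
    st.dataframe(topic_model.get_topic_info().head(10))
    st.plotly_chart(topic_model.visualize_barchart())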