awacke1 committed on
Commit
d8f9678
·
1 Parent(s): f1eb405

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -24
app.py CHANGED
import streamlit as st
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Bag-of-words -> NMF topic-modeling pipeline (10 latent topics).
bow_vectorizer = CountVectorizer()
nmf = NMF(n_components=10)
topic_pipeline = Pipeline(
    [
        ("bow", bow_vectorizer),
        ("nmf", nmf),
    ]
)

st.subheader("Topic Modeling with Topic-Wizard")

# Corpus comes either from an uploaded .txt file or from the text area below.
uploaded_file = st.file_uploader("choose a text file", type=["txt"])
if uploaded_file is not None:
    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150,
)

button = st.button('Get Segments')
if button and input_text != "":
    # One document per newline. Fitting on the raw string would make
    # CountVectorizer iterate it character-by-character (each char treated
    # as a "document"), so we fit on the list of lines instead.
    texts = input_text.split('\n')

    topic_pipeline.fit(texts)

    import topicwizard

    # NOTE(review): topicwizard.visualize launches its own server/app;
    # presumably this is meant to run alongside Streamlit — confirm.
    topicwizard.visualize(pipeline=topic_pipeline, corpus=texts)
import streamlit as st
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from bertopic import BERTopic
import streamlit.components.v1 as components
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Initialize BERTopic model once at module level.
model = BERTopic()

st.subheader("Topic Modeling with Topic-Wizard")
uploaded_file = st.file_uploader("Choose a text file", type=["txt"])

if uploaded_file is not None:
    # NOTE(review): writing to st.session_state["text"] while the text_area
    # below also uses key="text" with value="" triggers a Streamlit warning
    # about double-setting a widget default — confirm intended behavior.
    st.session_state["text"] = uploaded_file.getvalue().decode("utf-8")

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150,
)

button = st.button("Get Segments")

if button and (uploaded_file is not None or input_text != ""):
    # One document per line, from whichever source the user provided.
    if uploaded_file is not None:
        texts = st.session_state["text"].split("\n")
    else:
        texts = input_text.split("\n")

    # Fit BERTopic; `topics[i]` is the topic id assigned to document i
    # (-1 is BERTopic's outlier topic).
    topics, probabilities = model.fit_transform(texts)

    # Create sentence embeddings for the cluster visualization below.
    embeddings_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
    embeddings = embeddings_model.encode(texts)

    # Reduce dimensionality of embeddings using UMAP.
    umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine")
    umap_embeddings = umap_model.fit_transform(embeddings)

    # Cluster the reduced embeddings using HDBSCAN.
    cluster = HDBSCAN(
        min_cluster_size=15, metric="euclidean", cluster_selection_method="eom"
    ).fit(umap_embeddings)

    # Visualize BERTopic results with Streamlit.
    st.title("BERTopic Visualization")

    # Display the top-N topics with their keywords and sample documents.
    # BERTopic.get_topics() takes no arguments and returns
    # {topic_id: [(word, prob), ...]}; the previous
    # get_topics(num_topics=..., with_documents=True) call raised TypeError.
    num_topics = st.sidebar.slider("Select number of topics to display", 1, 20, 5, 1)
    topic_ids = [tid for tid in model.get_topics() if tid != -1][:num_topics]
    for tid in topic_ids:
        keywords = [word for word, _score in model.get_topic(tid)]
        st.write(f"## Topic {tid}")
        st.write("Keywords:", ", ".join(keywords))
        st.write("Documents:")
        # Recover this topic's documents from the fit_transform assignment;
        # show at most five.
        docs = [texts[i] for i, t in enumerate(topics) if t == tid][:5]
        for doc in docs:
            st.write("-", doc)

    # Display HDBSCAN cluster labels. components.html requires an HTML
    # string, not a Python list, so wrap the labels in markup.
    st.write("## Topic Clusters")
    components.html(
        f"<pre>{cluster.labels_.tolist()}</pre>", height=500, width=800
    )