AIEcosystem committed on
Commit
70e6432
·
verified ·
1 Parent(s): f11aa25

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +161 -257
src/streamlit_app.py CHANGED
@@ -1,60 +1,29 @@
1
  import os
2
- os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
  import pandas as pd
6
  import io
7
  import plotly.express as px
8
  import zipfile
9
- import json
10
- from cryptography.fernet import Fernet
11
  from streamlit_extras.stylable_container import stylable_container
12
- from typing import Optional
13
- from gliner import GLiNER
14
- from comet_ml import Experiment
15
  from transformers import pipeline
 
16
 
17
-
18
-
 
 
 
19
 
20
  st.markdown(
21
  """
22
  <style>
23
- /* Main app background with a subtle rainbow gradient */
24
  .stApp {
25
  background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
26
  color: #000000;
27
  font-family: 'Inter', sans-serif;
28
  }
29
-
30
- /* Rainbow gradient for the sidebar */
31
- .css-1d36184, .css-1d36184:hover, .css-1d36184:focus {
32
- background: linear-gradient(180deg, #FFC0CB, #FFD700, #98FB98, #ADD8E6, #BA55D3);
33
- secondary-background-color: #FFC080;
34
- }
35
-
36
- /* Expander background color with a slight transparency */
37
- .streamlit-expanderContent {
38
- background-color: rgba(255, 255, 255, 0.7);
39
- border-radius: 10px;
40
- }
41
-
42
- /* Expander header with a gentle gradient and bold text */
43
- .streamlit-expanderHeader {
44
- background: linear-gradient(90deg, #FADADD, #FFF9E0, #E0FFF8);
45
- border-radius: 10px;
46
- font-weight: bold;
47
- }
48
-
49
- /* Text Area with a light background and subtle border */
50
- .stTextArea textarea {
51
- background-color: #FFF0F5;
52
- color: #000000;
53
- border: 1px solid #ccc;
54
- border-radius: 8px;
55
- }
56
-
57
- /* Button with a solid color and elegant hover effect */
58
  .stButton > button {
59
  background-color: #FF69B4;
60
  color: #FFFFFF;
@@ -68,45 +37,11 @@ st.markdown(
68
  box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
69
  transform: translateY(-2px);
70
  }
71
-
72
- /* Warning box with a soft orange and rounded corners */
73
- .stAlert.st-warning {
74
- background-color: #FFDDAA;
75
- color: #000000;
76
- border-radius: 10px;
77
- border-left: 5px solid #FFA500;
78
- }
79
-
80
- /* Success box with a fresh green and rounded corners */
81
- .stAlert.st-success {
82
- background-color: #D4EDDA;
83
- color: #155724;
84
- border-radius: 10px;
85
- border-left: 5px solid #28A745;
86
- }
87
-
88
- /* Custom CSS to make the title text rainbow-colored */
89
- h1 {
90
- background: linear-gradient(45deg, #FF69B4, #FFD700, #00FF7F, #00BFFF, #8A2BE2);
91
- -webkit-background-clip: text;
92
- -webkit-text-fill-color: transparent;
93
- font-size: 3em;
94
- font-weight: 800;
95
- }
96
-
97
  </style>
98
  """,
99
  unsafe_allow_html=True
100
  )
101
 
102
-
103
- st.set_page_config(
104
- layout="wide",
105
- page_title="English Keyphrase"
106
- )
107
-
108
-
109
-
110
  # --- Comet ML Setup ---
111
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
112
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -116,42 +51,21 @@ comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAM
116
  if not comet_initialized:
117
  st.warning("Comet ML not initialized. Check environment variables.")
118
 
119
-
120
-
121
-
122
-
123
  # --- UI Header and Notes ---
124
  st.subheader("AcademiaMiner", divider="rainbow")
125
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
126
-
127
  expander = st.expander("**Important notes*")
128
- expander.write('''
129
- **Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
130
-
131
- Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
132
-
133
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
134
-
135
- **Usage Limits:** You can request results unlimited times for one (1) month.
136
-
137
- **Supported Languages:** English
138
-
139
- **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
140
-
141
- For any errors or inquiries, please contact us at [email protected]'''
142
- )
143
-
144
-
145
 
146
  with st.sidebar:
147
  st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
148
  code = '''
149
- <iframe
150
- src="https://aiecosystem-business-core.hf.space"
151
- frameborder="0"
152
- width="850"
153
- height="450"
154
- ></iframe>
155
  '''
156
  st.code(code, language="html")
157
  st.text("")
@@ -160,189 +74,179 @@ with st.sidebar:
160
  st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
161
  st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
162
 
163
-
164
  @st.cache_resource
165
  def load_ner_model():
166
- """Loads the GLiNER model and caches it."""
167
  try:
168
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints= labels)
 
 
 
 
169
  except Exception as e:
170
- st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
171
  st.stop()
172
- model = load_ner_model()
173
-
174
-
175
- @st.cache_resource
176
- def load_ner_model():
177
- return pipeline("token-classification",
178
- model="ml6team/keyphrase-extraction-kbir-inspec",
179
- aggregation_strategy="max",
180
- stride=128,
181
- ignore_labels=["O"])
182
 
183
  model = load_ner_model()
184
-
185
-
186
 
 
187
  text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", height=250, key='my_text_area')
188
 
189
  def clear_text():
190
  """Clears the text area."""
191
  st.session_state['my_text_area'] = ""
 
192
 
193
  st.button("Clear text", on_click=clear_text)
194
 
195
-
196
  if st.button("Results"):
197
- start_time = time.time()
198
  if not text.strip():
199
- st.warning("Please enter some text to extract entities.")
200
  else:
201
- with st.spinner("Analyzing text...", show_time=True):
202
- entities = model(text_for_ner)
203
- data = []
204
- if entities:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  for entity in entities:
206
- if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
207
- data.append({
208
- 'word': entity['word'],
209
- 'entity_group': entity['entity_group'],
210
- 'score': entity['score'],
211
- 'start': entity['start'],
212
- 'end': entity['end']
213
- })
214
- else:
215
- st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
216
- df = pd.DataFrame(data)
217
- else:
218
- df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
219
-
220
- if not df.empty:
221
- pattern = r'[^\w\s]'
222
- df['word'] = df['word'].replace(pattern, '', regex=True)
223
- df = df.replace('', 'Unknown')
224
-
225
- st.subheader("All Extracted Keyphrases", divider="rainbow")
226
- st.dataframe(df, use_container_width=True)
227
-
228
- with st.expander("See Glossary of tags"):
229
- st.write('''
230
- **word**: ['entity extracted from your text data']
231
-
232
- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
233
-
234
- **entity_group**: ['label (tag) assigned to a given extracted entity']
235
-
236
- **start**: ['index of the start of the corresponding entity']
237
-
238
- **end**: ['index of the end of the corresponding entity']
239
-
240
- ''')
241
- st.divider()
242
-
243
- st.subheader("Most Frequent Keyphrases", divider="rainbow")
244
- word_counts = df['word'].value_counts().reset_index()
245
- word_counts.columns = ['word', 'count']
246
- df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
247
-
248
- if not df_frequent.empty:
249
- tab1, tab2 = st.tabs(["Table", "Chart"])
250
-
251
- with tab1:
252
- st.dataframe(df_frequent, use_container_width=True)
253
-
254
- with tab2:
255
- fig_frequent_bar = px.bar(
256
- df_frequent,
257
- x='count',
258
- y='word',
259
- orientation='h',
260
- title='Top Frequent Keyphrases by Count',
261
- color='count',
262
- color_continuous_scale=px.colors.sequential.Viridis
263
- )
264
- fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
265
- st.plotly_chart(fig_frequent_bar, use_container_width=True)
266
-
267
- if comet_initialized and 'experiment' in locals():
268
- experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
269
- else:
270
- st.info("No keyphrases found with more than one occurrence to display in tabs.")
271
-
272
- st.divider()
273
-
274
- experiment = None
275
- if comet_initialized:
276
- experiment = Experiment(
277
- api_key=COMET_API_KEY,
278
- workspace=COMET_WORKSPACE,
279
- project_name=COMET_PROJECT_NAME,
280
- )
281
- experiment.log_parameter("input_source_type", source_type)
282
- experiment.log_parameter("input_content_length", len(text_for_ner))
283
- experiment.log_table("predicted_entities", df)
284
-
285
- st.subheader("Treemap of All Keyphrases", divider="rainbow")
286
- fig_treemap = px.treemap(
287
- df,
288
- path=[px.Constant("all"), 'entity_group', 'word'],
289
- values='score',
290
- color='word',
291
- color_continuous_scale=px.colors.sequential.Plasma
292
- )
293
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
294
- st.plotly_chart(fig_treemap, use_container_width=True)
295
-
296
- if comet_initialized and experiment:
297
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
298
-
299
- # --- Download Section ---
300
- dfa = pd.DataFrame(
301
- data={
302
- 'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
303
- 'Description': [
304
- 'entity extracted from your text data',
305
- 'label (tag) assigned to a given extracted entity',
306
- 'accuracy score; how accurately a tag has been assigned to a given entity',
307
- 'index of the start of the corresponding entity',
308
- 'index of the end of the corresponding entity'
309
- ]
310
- }
311
  )
312
- buf = io.BytesIO()
313
- with zipfile.ZipFile(buf, "w") as myzip:
314
- if not df.empty:
315
- myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
316
- myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
317
- myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- with stylable_container(
320
- key="download_button",
321
- css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
322
- ):
323
- st.download_button(
324
- label="Download zip file",
325
- data=buf.getvalue(),
326
- file_name="nlpblogs_ner_results.zip",
327
- mime="application/zip",
328
- )
329
- st.divider()
330
- else:
331
- st.warning("No entities found to generate visualizations.")
332
- else:
333
- st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
334
  except Exception as e:
335
  st.error(f"An unexpected error occurred during processing: {e}")
336
  finally:
337
- if comet_initialized and experiment is not None:
338
  try:
 
 
 
 
339
  experiment.end()
340
  except Exception as comet_e:
341
  st.warning(f"Comet ML experiment.end() failed: {comet_e}")
342
- if start_time_overall is not None:
343
- end_time_overall = time.time()
344
- elapsed_time_overall = end_time_overall - start_time_overall
345
- st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
346
- st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
347
- else:
348
- st.warning("Please enter some text, a URL, or upload a file to analyze.")
 
1
  import os
 
2
  import time
3
  import streamlit as st
4
  import pandas as pd
5
  import io
6
  import plotly.express as px
7
  import zipfile
 
 
8
  from streamlit_extras.stylable_container import stylable_container
 
 
 
9
  from transformers import pipeline
10
+ from comet_ml import Experiment
11
 
12
+ # --- App Configuration and Styling ---
13
+ st.set_page_config(
14
+ layout="wide",
15
+ page_title="English Keyphrase"
16
+ )
17
 
18
  st.markdown(
19
  """
20
  <style>
21
+ /* ... (your CSS styles here, as they were mostly fine) ... */
22
  .stApp {
23
  background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
24
  color: #000000;
25
  font-family: 'Inter', sans-serif;
26
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  .stButton > button {
28
  background-color: #FF69B4;
29
  color: #FFFFFF;
 
37
  box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
38
  transform: translateY(-2px);
39
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  </style>
41
  """,
42
  unsafe_allow_html=True
43
  )
44
 
 
 
 
 
 
 
 
 
45
  # --- Comet ML Setup ---
46
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
47
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
51
  if not comet_initialized:
52
  st.warning("Comet ML not initialized. Check environment variables.")
53
 
 
 
 
 
54
# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Fix: the expander label used unbalanced markdown emphasis ("**Important notes*"),
# which renders the stray asterisks literally instead of bold text.
expander = st.expander("**Important notes**")
expander.write('''**Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. For any errors or inquiries, please contact us at info@nlpblogs.com''')
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  with st.sidebar:
66
  st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
67
  code = '''
68
+ <iframe src="https://aiecosystem-business-core.hf.space" frameborder="0" width="850" height="450"></iframe>
 
 
 
 
 
69
  '''
70
  st.code(code, language="html")
71
  st.text("")
 
74
  st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
75
  st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
76
 
77
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load and cache the keyphrase-extraction pipeline.

    Wrapped in st.cache_resource so the Hugging Face model is instantiated
    once per server process instead of on every Streamlit rerun. On any
    load failure the app surfaces the error and halts the script.
    """
    try:
        extractor = pipeline(
            task="token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
        )
    except Exception as e:
        st.error(f"Failed to load NER model: {e}")
        st.stop()
    else:
        return extractor


model = load_ner_model()
 
 
92
 
93
# --- Main App Logic ---
# Text input bound to session state so it can be cleared programmatically.
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)


def clear_text():
    """Reset the text area and flag any previous results as stale."""
    st.session_state['my_text_area'] = ""
    # NOTE(review): 'text_processed' is written here but never read in this
    # file — presumably consumed elsewhere; confirm before removing.
    st.session_state["text_processed"] = False


st.button("Clear text", on_click=clear_text)
102
 
 
103
# --- Results: run extraction, render visualizations, offer downloads ---
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()

        # Create the Comet ML experiment up front so the `finally` block can
        # log metadata and close it no matter where processing fails.
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None

        # Pre-bind df: the `finally` block reads it, and without this a
        # pipeline failure before assignment would raise NameError there.
        df = pd.DataFrame()
        try:
            with st.spinner("Analyzing text..."):
                # The pipeline returns a list of entity dicts.
                entities = model(text)

            # Bug fix: with aggregation_strategy="max" the transformers
            # pipeline keys grouped entities as 'entity_group', not 'label';
            # reading entity['label'] raised KeyError on every row. Fall back
            # defensively in case of a pipeline/version difference.
            data = [
                {
                    'word': entity['word'],
                    'label': entity.get('entity_group', entity.get('label', 'Unknown')),
                    'score': entity['score'],
                    'start': entity['start'],
                    'end': entity['end'],
                }
                for entity in entities
            ]

            if not data:
                # Bug fix: do NOT call st.stop() inside this try-block —
                # Streamlit implements it by raising an exception, which the
                # broad `except Exception` below would catch and misreport
                # as a processing error.
                st.warning("No keyphrases found in the text.")
            else:
                df = pd.DataFrame(data)

                # --- Data Cleaning and Processing ---
                # Strip punctuation from phrases; anything left empty becomes
                # the sentinel 'Unknown'.
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                df = df.replace('', 'Unknown')

                # --- All Extracted Keyphrases ---
                st.subheader("All Extracted Keyphrases", divider="rainbow")
                st.dataframe(df, use_container_width=True)
                with st.expander("See Glossary of tags"):
                    st.write('''
                    **word**: ['keyphrase extracted from your text data']
                    **score**: ['accuracy score; how accurately a tag has been assigned']
                    **label**: ['label (tag) assigned to a given extracted keyphrase']
                    **start**: ['index of the start of the corresponding entity']
                    **end**: ['index of the end of the corresponding entity']
                    ''')

                # --- Most Frequent Keyphrases ---
                st.subheader("Most Frequent Keyphrases", divider="rainbow")
                word_counts = df['word'].value_counts().reset_index()
                word_counts.columns = ['word', 'count']
                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)

                if not df_frequent.empty:
                    tab1, tab2 = st.tabs(["Table", "Chart"])
                    with tab1:
                        st.dataframe(df_frequent, use_container_width=True)
                    with tab2:
                        fig_frequent_bar = px.bar(
                            df_frequent,
                            x='count',
                            y='word',
                            orientation='h',
                            title='Top Frequent Keyphrases by Count',
                            color='count',
                            color_continuous_scale=px.colors.sequential.Viridis,
                        )
                        fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})
                        st.plotly_chart(fig_frequent_bar, use_container_width=True)
                        if experiment:
                            experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                else:
                    st.info("No keyphrases found with more than one occurrence.")

                # --- Treemap of All Keyphrases ---
                st.subheader("Treemap of All Keyphrases", divider="rainbow")
                fig_treemap = px.treemap(
                    df,
                    path=[px.Constant("all"), 'label', 'word'],
                    values='score',
                    color='word',
                    color_continuous_scale=px.colors.sequential.Plasma,
                )
                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                st.plotly_chart(fig_treemap, use_container_width=True)
                if experiment:
                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

                # --- Download Section ---
                # Glossary table bundled into the zip alongside the results.
                dfa = pd.DataFrame(
                    data={
                        'Column Name': ['word', 'label', 'score', 'start', 'end'],
                        'Description': [
                            'keyphrase extracted from your text data',
                            'label (tag) assigned to a given keyphrase',
                            'accuracy score; how accurately a tag has been assigned',
                            'index of the start of the corresponding entity',
                            'index of the end of the corresponding entity'
                        ]
                    }
                )
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                    myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=buf.getvalue(),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()

        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            if experiment:
                try:
                    # Log run metadata before ending the experiment; skip the
                    # results table when extraction produced nothing.
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    if not df.empty:
                        experiment.log_table("predicted_entities", df)
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")

            # Show elapsed time
            elapsed_time_overall = time.time() - start_time_overall
            st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
+