AIEcosystem committed on
Commit
501f0bd
·
verified ·
1 Parent(s): e16f3db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -162
app.py CHANGED
@@ -223,175 +223,178 @@ else:
223
  st.session_state['uploaded_file_content'] = None
224
  st.session_state['file_uploader_key'] += 1
225
 
226
- # --- Main Processing Logic (triggered by input or refresh) ---
227
- experiment = None
228
- start_time_overall = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
- if st.button("Analyze Text", type="primary") and st.session_state['encrypted_text_to_process']:
231
- try:
232
- start_time_overall = time.time()
 
233
 
234
- if st.session_state['source_type_attempts'] >= max_attempts:
235
- st.error(f"You have requested results {max_attempts} times. You have reached your request limit.")
236
- st.stop()
237
-
238
- st.session_state['source_type_attempts'] += 1
239
- save_persistent_data(st.session_state['source_type_attempts'], st.session_state['file_upload_history'])
240
 
241
- @st.cache_resource
242
- def load_ner_model():
243
- return pipeline("token-classification",
244
- model="ml6team/keyphrase-extraction-kbir-inspec",
245
- aggregation_strategy="max",
246
- stride=128,
247
- ignore_labels=["O"])
248
-
249
- model = load_ner_model()
250
- text_for_ner = decrypt_text(st.session_state['encrypted_text_to_process'])
251
-
252
- if text_for_ner and len(text_for_ner.strip()) > 0:
253
- with st.spinner("Analyzing text...", show_time=True):
254
- entities = model(text_for_ner)
255
- data = []
256
- if entities:
257
- for entity in entities:
258
- if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
259
- data.append({
260
- 'word': entity['word'],
261
- 'entity_group': entity['entity_group'],
262
- 'score': entity['score'],
263
- 'start': entity['start'],
264
- 'end': entity['end']
265
- })
266
- else:
267
- st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
268
- df = pd.DataFrame(data)
269
- else:
270
- df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
271
-
272
- if not df.empty:
273
- pattern = r'[^\w\s]'
274
- df['word'] = df['word'].replace(pattern, '', regex=True)
275
- df = df.replace('', 'Unknown')
276
-
277
- st.subheader("All Extracted Keyphrases", divider="rainbow")
278
- st.dataframe(df, use_container_width=True)
279
-
280
- with st.expander("See Glossary of tags"):
281
- st.write('''
282
- **word**: ['entity extracted from your text data']
283
-
284
- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
285
-
286
- **entity_group**: ['label (tag) assigned to a given extracted entity']
287
-
288
- **start**: ['index of the start of the corresponding entity']
289
-
290
- **end**: ['index of the end of the corresponding entity']
291
-
292
- ''')
293
- st.divider()
294
 
295
- st.subheader("Most Frequent Keyphrases", divider="rainbow")
296
- word_counts = df['word'].value_counts().reset_index()
297
- word_counts.columns = ['word', 'count']
298
- df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
299
 
300
- if not df_frequent.empty:
301
- tab1, tab2 = st.tabs(["Table", "Chart"])
302
 
303
- with tab1:
304
- st.dataframe(df_frequent, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- with tab2:
307
- fig_frequent_bar = px.bar(
308
- df_frequent,
309
- x='count',
310
- y='word',
311
- orientation='h',
312
- title='Top Frequent Keyphrases by Count',
313
- color='count',
314
- color_continuous_scale=px.colors.sequential.Viridis
315
  )
316
- fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
317
- st.plotly_chart(fig_frequent_bar, use_container_width=True)
318
-
319
- if comet_initialized and experiment:
320
- experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
321
- else:
322
- st.info("No keyphrases found with more than one occurrence to display in tabs.")
323
-
324
- st.divider()
325
-
326
- if comet_initialized:
327
- experiment = Experiment(
328
- api_key=COMET_API_KEY,
329
- workspace=COMET_WORKSPACE,
330
- project_name=COMET_PROJECT_NAME,
331
  )
332
- experiment.log_parameter("input_source_type", source_type)
333
- experiment.log_parameter("input_content_length", len(text_for_ner))
334
- experiment.log_table("predicted_entities", df)
335
-
336
- st.subheader("Treemap of All Keyphrases", divider="rainbow")
337
- fig_treemap = px.treemap(
338
- df,
339
- path=[px.Constant("all"), 'entity_group', 'word'],
340
- values='score',
341
- color='word',
342
- color_continuous_scale=px.colors.sequential.Plasma
343
- )
344
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
345
- st.plotly_chart(fig_treemap, use_container_width=True)
346
-
347
- if comet_initialized and experiment:
348
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
349
-
350
- # --- Download Section ---
351
- dfa = pd.DataFrame(
352
- data={
353
- 'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
354
- 'Description': [
355
- 'entity extracted from your text data',
356
- 'label (tag) assigned to a given extracted entity',
357
- 'accuracy score; how accurately a tag has been assigned to a given entity',
358
- 'index of the start of the corresponding entity',
359
- 'index of the end of the corresponding entity'
360
- ]
361
- }
362
- )
363
- buf = io.BytesIO()
364
- with zipfile.ZipFile(buf, "w") as myzip:
365
- if not df.empty:
366
- myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
367
- myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
368
- myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
369
-
370
- with stylable_container(
371
- key="download_button",
372
- css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
373
- ):
374
- st.download_button(
375
- label="Download zip file",
376
- data=buf.getvalue(),
377
- file_name="nlpblogs_ner_results.zip",
378
- mime="application/zip",
379
  )
380
- st.divider()
381
- else:
382
- st.warning("No entities found to generate visualizations.")
383
- else:
384
- st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
385
- except Exception as e:
386
- st.error(f"An unexpected error occurred during processing: {e}")
387
- finally:
388
- if comet_initialized and experiment is not None:
389
- try:
390
- experiment.end()
391
- except Exception as comet_e:
392
- st.warning(f"Comet ML experiment.end() failed: {comet_e}")
393
- if start_time_overall is not None:
394
- end_time_overall = time.time()
395
- elapsed_time_overall = end_time_overall - start_time_overall
396
- st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
397
- st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  st.session_state['uploaded_file_content'] = None
224
  st.session_state['file_uploader_key'] += 1
225
 
226
+ # --- Main Processing Logic (corrected placement) ---
227
+ # The button must be outside the conditional logic that populates the session state
228
+ # so that it is always rendered and can be clicked to trigger the analysis.
229
+ if st.button("Analyze Text", type="primary"):
230
+ if st.session_state['encrypted_text_to_process']:
231
+ try:
232
+ start_time_overall = time.time()
233
+
234
+ if st.session_state['source_type_attempts'] >= max_attempts:
235
+ st.error(f"You have requested results {max_attempts} times. You have reached your request limit.")
236
+ st.stop()
237
+
238
+ st.session_state['source_type_attempts'] += 1
239
+ save_persistent_data(st.session_state['source_type_attempts'], st.session_state['file_upload_history'])
240
+
241
+ @st.cache_resource
242
+ def load_ner_model():
243
+ return pipeline("token-classification",
244
+ model="ml6team/keyphrase-extraction-kbir-inspec",
245
+ aggregation_strategy="max",
246
+ stride=128,
247
+ ignore_labels=["O"])
248
+
249
+ model = load_ner_model()
250
+ text_for_ner = decrypt_text(st.session_state['encrypted_text_to_process'])
251
+
252
+ if text_for_ner and len(text_for_ner.strip()) > 0:
253
+ with st.spinner("Analyzing text...", show_time=True):
254
+ entities = model(text_for_ner)
255
+ data = []
256
+ if entities:
257
+ for entity in entities:
258
+ if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
259
+ data.append({
260
+ 'word': entity['word'],
261
+ 'entity_group': entity['entity_group'],
262
+ 'score': entity['score'],
263
+ 'start': entity['start'],
264
+ 'end': entity['end']
265
+ })
266
+ else:
267
+ st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
268
+ df = pd.DataFrame(data)
269
+ else:
270
+ df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
271
 
272
+ if not df.empty:
273
+ pattern = r'[^\w\s]'
274
+ df['word'] = df['word'].replace(pattern, '', regex=True)
275
+ df = df.replace('', 'Unknown')
276
 
277
+ st.subheader("All Extracted Keyphrases", divider="rainbow")
278
+ st.dataframe(df, use_container_width=True)
 
 
 
 
279
 
280
+ with st.expander("See Glossary of tags"):
281
+ st.write('''
282
+ **word**: ['entity extracted from your text data']
283
+
284
+ **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
285
+
286
+ **entity_group**: ['label (tag) assigned to a given extracted entity']
287
+
288
+ **start**: ['index of the start of the corresponding entity']
289
+
290
+ **end**: ['index of the end of the corresponding entity']
291
+
292
+ ''')
293
+ st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
+ st.subheader("Most Frequent Keyphrases", divider="rainbow")
296
+ word_counts = df['word'].value_counts().reset_index()
297
+ word_counts.columns = ['word', 'count']
298
+ df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
299
 
300
+ if not df_frequent.empty:
301
+ tab1, tab2 = st.tabs(["Table", "Chart"])
302
 
303
+ with tab1:
304
+ st.dataframe(df_frequent, use_container_width=True)
305
+
306
+ with tab2:
307
+ fig_frequent_bar = px.bar(
308
+ df_frequent,
309
+ x='count',
310
+ y='word',
311
+ orientation='h',
312
+ title='Top Frequent Keyphrases by Count',
313
+ color='count',
314
+ color_continuous_scale=px.colors.sequential.Viridis
315
+ )
316
+ fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
317
+ st.plotly_chart(fig_frequent_bar, use_container_width=True)
318
+
319
+ if comet_initialized and 'experiment' in locals():
320
+ experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
321
+ else:
322
+ st.info("No keyphrases found with more than one occurrence to display in tabs.")
323
 
324
+ st.divider()
325
+
326
+ experiment = None
327
+ if comet_initialized:
328
+ experiment = Experiment(
329
+ api_key=COMET_API_KEY,
330
+ workspace=COMET_WORKSPACE,
331
+ project_name=COMET_PROJECT_NAME,
 
332
  )
333
+ experiment.log_parameter("input_source_type", source_type)
334
+ experiment.log_parameter("input_content_length", len(text_for_ner))
335
+ experiment.log_table("predicted_entities", df)
336
+
337
+ st.subheader("Treemap of All Keyphrases", divider="rainbow")
338
+ fig_treemap = px.treemap(
339
+ df,
340
+ path=[px.Constant("all"), 'entity_group', 'word'],
341
+ values='score',
342
+ color='word',
343
+ color_continuous_scale=px.colors.sequential.Plasma
 
 
 
 
344
  )
345
+ fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
346
+ st.plotly_chart(fig_treemap, use_container_width=True)
347
+
348
+ if comet_initialized and experiment:
349
+ experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
350
+
351
+ # --- Download Section ---
352
+ dfa = pd.DataFrame(
353
+ data={
354
+ 'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
355
+ 'Description': [
356
+ 'entity extracted from your text data',
357
+ 'label (tag) assigned to a given extracted entity',
358
+ 'accuracy score; how accurately a tag has been assigned to a given entity',
359
+ 'index of the start of the corresponding entity',
360
+ 'index of the end of the corresponding entity'
361
+ ]
362
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  )
364
+ buf = io.BytesIO()
365
+ with zipfile.ZipFile(buf, "w") as myzip:
366
+ if not df.empty:
367
+ myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
368
+ myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
369
+ myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
370
+
371
+ with stylable_container(
372
+ key="download_button",
373
+ css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
374
+ ):
375
+ st.download_button(
376
+ label="Download zip file",
377
+ data=buf.getvalue(),
378
+ file_name="nlpblogs_ner_results.zip",
379
+ mime="application/zip",
380
+ )
381
+ st.divider()
382
+ else:
383
+ st.warning("No entities found to generate visualizations.")
384
+ else:
385
+ st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
386
+ except Exception as e:
387
+ st.error(f"An unexpected error occurred during processing: {e}")
388
+ finally:
389
+ if comet_initialized and experiment is not None:
390
+ try:
391
+ experiment.end()
392
+ except Exception as comet_e:
393
+ st.warning(f"Comet ML experiment.end() failed: {comet_e}")
394
+ if start_time_overall is not None:
395
+ end_time_overall = time.time()
396
+ elapsed_time_overall = end_time_overall - start_time_overall
397
+ st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
398
+ st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
399
+ else:
400
+ st.warning("Please enter some text, a URL, or upload a file to analyze.")