AIEcosystem committed on
Commit e16f3db · verified · 1 Parent(s): 38d7111

Update app.py

Files changed (1)
  1. app.py +213 -213
app.py CHANGED
@@ -1,3 +1,4 @@
  import streamlit as st
  from bs4 import BeautifulSoup
  import pandas as pd
@@ -8,66 +9,61 @@ import io
  import os
  import zipfile
  import re
- import numpy as np
  import json
- import requests
  from cryptography.fernet import Fernet
  from streamlit_extras.stylable_container import stylable_container
  from comet_ml import Experiment
  
- st.set_page_config(layout="wide", page_title="English Keyphrase TXT & URL Entity Finder")
  
  # --- Persistent Counter & History Configuration ---
- # The counter and history will be stored in a JSON file for persistence across restarts.
- COUNTER_FILE = "counter.json"
- max_attempts = 300
  
  def load_persistent_data():
      """
-     Loads the attempts count and file/URL history from a persistent JSON file.
      Returns default values if the file doesn't exist or is invalid.
      """
-     if os.path.exists(COUNTER_FILE):
          try:
-             with open(COUNTER_FILE, "r") as f:
                  data = json.load(f)
-             return data.get('source_type_attempts', 0), data.get('source_type_history', [])
          except (json.JSONDecodeError, KeyError):
-             # If the file is corrupted or malformed, return defaults
              return 0, []
      return 0, []
  
  def save_persistent_data(attempts, history):
      """
-     Saves the current attempts count and file/URL history to the persistent JSON file.
      """
-     with open(COUNTER_FILE, "w") as f:
-         json.dump({'source_type_attempts': attempts, 'source_type_history': history}, f, indent=4)
  
- def clear_history():
-     """
-     Callback function for the "Clear History" button.
-     Resets the session state variables for the counter and history, then saves the empty state.
-     """
-     st.session_state['source_type_attempts'] = 0
-     st.session_state['source_type_history'] = []
-     save_persistent_data(0, [])
  
- # --- Initialize session state with persistent data ---
  if 'source_type_attempts' not in st.session_state:
      attempts, history = load_persistent_data()
      st.session_state['source_type_attempts'] = attempts
-     st.session_state['source_type_history'] = history
-     # Save the initial state to ensure the file exists on the first run
-     save_persistent_data(st.session_state['source_type_attempts'], st.session_state['source_type_history'])
-
- # --- Configuration for Comet ML ---
- COMET_API_KEY = os.environ.get("COMET_API_KEY")
- COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
- COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
- comet_initialized = False
- if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
-     comet_initialized = True
  
  if 'encrypted_text_to_process' not in st.session_state:
      st.session_state['encrypted_text_to_process'] = None
@@ -86,7 +82,7 @@ def load_encryption_key():
          key_bytes = key_str.encode('utf-8')
          return Fernet(key_bytes)
      except ValueError as ve:
-         st.error(f"Configuration Error: {ve}. Please ensure the 'FERNET_KEY' environment variable is set securely in your deployment environment (e.g., Hugging Face Spaces secrets, Render environment variables) or in a local .env file for development.")
          st.stop()
      except Exception as e:
          st.error(f"An unexpected error occurred while loading encryption key: {e}. Please check your key format and environment settings.")
@@ -117,6 +113,7 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
  expander = st.expander("**Important notes on the English Keyphrase TXT & URL Entity Finder**")
  expander.write('''
  **Named Entities:** This English Keyphrase TXT & URL Entity Finder extracts keyphrases from English academic and scientific papers.
  Results are presented in an easy-to-read table, visualized in an interactive bar chart and tree map, and are available for download along with a Glossary of tags.
  
  **How to Use:**
@@ -124,76 +121,70 @@ expander.write('''
  2. Alternatively, type or paste text directly into the text area and press Ctrl + Enter.
  3. Or, upload your TXT file.
  
- **Usage Limits:** You can request results up to 300 requests within a 30-day period.
  **Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
  For any errors or inquiries, please contact us at [email protected]
- ''')
  
  # --- Sidebar Content ---
  with st.sidebar:
-
-
  
-     # --- New section to display history in the sidebar ---
-     st.subheader("Request History", divider="rainbow")
-     if st.session_state['source_type_history']:
-         history_df = pd.DataFrame(st.session_state['source_type_history'])
          st.dataframe(history_df, use_container_width=True, hide_index=True)
-     else:
-         st.info("No requests have been made yet.")
-
-     st.subheader("Build your own NER Web App in a minute without writing a single line of code.", divider="rainbow")
-     st.link_button("NER File Builder", "https://nlpblogs.com/shop/named-entity-recognition-ner/ner-file-builder/", type="primary")
-
-     # --- New button to clear the history ---
-     st.button("Clear History", on_click=clear_history)
  
  
  # --- Input Fields ---
- def clear_url_input():
-     st.session_state.url = ""
-     st.session_state.encrypted_text_to_process = None
-     st.session_state.uploaded_file_content = None
-     st.session_state.my_text_area = ""
-     st.session_state['file_uploader_key'] += 1
- def clear_text_input():
-     st.session_state.my_text_area = ""
-     st.session_state.encrypted_text_to_process = None
-     st.session_state.uploaded_file_content = None
-     st.session_state.url = ""
-     st.session_state['file_uploader_key'] += 1
- def clear_file_input():
-     st.session_state.uploaded_file_content = None
-     st.session_state.encrypted_text_to_process = None
      st.session_state.url = ""
      st.session_state.my_text_area = ""
      st.session_state['file_uploader_key'] += 1
  
  url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
- st.button("Clear URL", on_click=clear_url_input)
  text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
- st.button("Clear Text", on_click=clear_text_input)
  uploaded_file = st.file_uploader("Or upload a .txt file", type=["txt"], key=f"file_uploader_{st.session_state['file_uploader_key']}")
- st.button("Clear Uploaded File", on_click=clear_file_input)
  
  source_type = None
- input_content = None
  current_run_text = None
  
- if uploaded_file is not None:
      source_type = 'file'
-     input_content = uploaded_file.name
-     string_data = io.StringIO(uploaded_file.getvalue().decode("utf-8")).read()
-     current_run_text = string_data
-     st.session_state['uploaded_file_content'] = current_run_text
-     st.success("TXT file uploaded successfully. File content encrypted and secured. Due to security protocols, the file content is hidden.")
-     st.divider()
-     st.write("**Input text content (from uploaded file)**")
-     st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
  elif url:
      source_type = 'url'
-     input_content = url
      if not url.startswith(("http://", "https://")):
          st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
          current_run_text = None
@@ -204,6 +195,12 @@ elif url:
          f.raise_for_status()
          soup = BeautifulSoup(f.text, 'html.parser')
          current_run_text = soup.get_text(separator=' ', strip=True)
          st.divider()
          st.write("**Input text content (from URL)**")
          st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
@@ -212,141 +209,143 @@ elif url:
          current_run_text = None
  elif text:
      source_type = 'text'
-     input_content = text
      current_run_text = text
      st.divider()
      st.write("**Input text content (from text area)**")
      st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
  
- # Encrypt and store the text in session state if available
  if current_run_text and current_run_text.strip():
-     st.session_state['encrypted_text_to_process'] = encrypt_text(current_run_text)
  else:
      st.session_state['encrypted_text_to_process'] = None
  
  # --- Main Processing Logic (triggered by input or refresh) ---
  experiment = None
  start_time_overall = None
- try:
-     if source_type:
          start_time_overall = time.time()
  
          if st.session_state['source_type_attempts'] >= max_attempts:
-             st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
-             pass
-         else:
-             # Increment the counter and immediately save it
-             st.session_state['source_type_attempts'] += 1
-             st.session_state['source_type_history'].append({
-                 'source': source_type,
-                 'content': input_content if source_type == 'file' else 'URL/Text Area Content',
-                 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
-             })
-             save_persistent_data(st.session_state['source_type_attempts'], st.session_state['source_type_history'])
-
-             @st.cache_resource
-             def load_ner_model():
-                 return pipeline("token-classification", model="ml6team/keyphrase-extraction-kbir-inspec", aggregation_strategy="max", stride=128, ignore_labels=["O"])
-
-             model = load_ner_model()
-
-             text_for_ner = None
-             if st.session_state['encrypted_text_to_process'] is not None:
-                 text_for_ner = decrypt_text(st.session_state['encrypted_text_to_process'])
-
-             if text_for_ner and len(text_for_ner.strip()) > 0:
-                 with st.spinner("Analyzing text...", show_time=True):
-                     entities = model(text_for_ner)
-                     data = []
-                     if entities:
-                         for entity in entities:
-                             if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
-                                 data.append({
-                                     'word': entity['word'],
-                                     'entity_group': entity['entity_group'],
-                                     'score': entity['score'],
-                                     'start': entity['start'],
-                                     'end': entity['end']
-                                 })
-                             else:
-                                 st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
-                         df = pd.DataFrame(data)
-                     else:
-                         df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
-
-                 if not df.empty:
-                     pattern = r'[^\w\s]'
-                     df['word'] = df['word'].replace(pattern, '', regex=True)
-                     df = df.replace('', 'Unknown')
-
-                     st.subheader("All Extracted Keyphrases", divider="rainbow")
-                     st.dataframe(df, use_container_width=True)
-
-                     with st.expander("See Glossary of tags"):
-                         st.write('''
-                         '**word**': ['entity extracted from your text data']
-                         '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']
-                         '**entity_group**': ['label (tag) assigned to a given extracted entity']
-                         '**start**': ['index of the start of the corresponding entity']
-                         '**end**': ['index of the end of the corresponding entity']
-                         ''')
-                     st.divider()
-
-                     st.subheader("Most Frequent Keyphrases", divider="rainbow")
-                     word_counts = df['word'].value_counts().reset_index()
-                     word_counts.columns = ['word', 'count']
-
-                     df_frequent = word_counts[word_counts['count'] > 1].sort_values(by='count', ascending=False).head(15)
-
-                     if not df_frequent.empty:
-                         tab1, tab2 = st.tabs(["Table", "Chart"])
-                         with tab1:
-                             st.dataframe(df_frequent, use_container_width=True)
-                         with tab2:
-                             fig_frequent_bar = px.bar(
-                                 df_frequent,
-                                 x='count',
-                                 y='word',
-                                 orientation='h',
-                                 title='Top Frequent Keyphrases by Count',
-                                 color='count',
-                                 color_continuous_scale=px.colors.sequential.Viridis
-                             )
-                             fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
-                             st.plotly_chart(fig_frequent_bar, use_container_width=True)
-                             if comet_initialized and experiment:
-                                 experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                      else:
-                         st.info("No keyphrases found with more than one occurrence to display in tabs.")
-                     st.divider()
-
-                     if comet_initialized:
-                         experiment = Experiment(
-                             api_key=COMET_API_KEY,
-                             workspace=COMET_WORKSPACE,
-                             project_name=COMET_PROJECT_NAME,
-                         )
-                         experiment.log_parameter("input_source_type", source_type)
-                         experiment.log_parameter("input_content_length", len(input_content) if isinstance(input_content, str) else len(str(input_content)))
-                         if not df.empty:
-                             experiment.log_table("predicted_entities", df)
-                         else:
-                             experiment.log_text("No entities found for logging.")
-
-                     st.subheader("Treemap of All Keyphrases", divider="rainbow")
-                     fig_treemap = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
-                         values='score',
-                         color='word',
-                         color_continuous_scale=px.colors.sequential.Plasma
-                     )
-                     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
-                     st.plotly_chart(fig_treemap, use_container_width=True)
-
-                     if comet_initialized and experiment:
-                         experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
                  else:
-                     st.warning("No entities found to generate visualizations.")
  
                  # --- Download Section ---
                  dfa = pd.DataFrame(
@@ -379,19 +378,20 @@ try:
                      mime="application/zip",
                  )
                  st.divider()
-             else:
-                 st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
- except Exception as e:
-     st.error(f"An unexpected error occurred: {e}")
- finally:
-     if comet_initialized and experiment is not None:
-         try:
-             experiment.end()
-         except Exception as comet_e:
-             st.warning(f"Comet ML experiment.end() failed: {comet_e}")
-     if start_time_overall is not None:
-         end_time_overall = time.time()
-         elapsed_time_overall = end_time_overall - start_time_overall
-         st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
-
- st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
  
+ import requests
  import streamlit as st
  from bs4 import BeautifulSoup
  import pandas as pd
  
  import os
  import zipfile
  import re
  import json
  from cryptography.fernet import Fernet
  from streamlit_extras.stylable_container import stylable_container
  from comet_ml import Experiment
  
+ st.set_page_config(
+     layout="wide",
+     page_title="English Keyphrase TXT & URL Entity Finder"
+ )
+ 
+ # --- Configuration for Comet ML ---
+ COMET_API_KEY = os.environ.get("COMET_API_KEY")
+ COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
+ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
+ comet_initialized = False
+ if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
+     comet_initialized = True
  
  # --- Persistent Counter & History Configuration ---
+ PERSISTENCE_FILE = "app_data.json"
+ max_attempts = 10
  
  def load_persistent_data():
      """
+     Loads the attempts count and file upload history from a persistent JSON file.
      Returns default values if the file doesn't exist or is invalid.
      """
+     if os.path.exists(PERSISTENCE_FILE):
          try:
+             with open(PERSISTENCE_FILE, "r") as f:
                  data = json.load(f)
+             return data.get('source_type_attempts', 0), data.get('file_upload_history', [])
          except (json.JSONDecodeError, KeyError):
+             st.warning("Warning: Could not read persistent data file. Starting with a fresh state.")
              return 0, []
      return 0, []
  
  def save_persistent_data(attempts, history):
      """
+     Saves the current attempts count and file upload history to the persistent JSON file.
      """
+     with open(PERSISTENCE_FILE, "w") as f:
+         json.dump({'source_type_attempts': attempts, 'file_upload_history': history}, f, indent=4)
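For reference, a minimal round-trip sketch of the persistence schema these two helpers share; the sample history records below are illustrative, not taken from the commit:

import json

state = {
    'source_type_attempts': 2,
    'file_upload_history': [
        {'source_type': 'file', 'filename': 'paper.txt', 'timestamp': '2024-01-01 10:00:00'},
        {'source_type': 'url', 'filename': 'https://example.com/article', 'timestamp': '2024-01-01 10:05:00'},
    ],
}
with open("app_data.json", "w") as f:
    json.dump(state, f, indent=4)       # same shape save_persistent_data writes

with open("app_data.json", "r") as f:
    data = json.load(f)                 # same keys load_persistent_data reads back
assert data.get('source_type_attempts', 0) == 2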
  
+ def clear_input_history_and_rerun():
+     """Callback function for the "Clear Input History" button."""
+     st.session_state['file_upload_history'] = []
+     save_persistent_data(st.session_state['source_type_attempts'], [])
+     st.experimental_rerun()
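One caveat on the callback above: st.experimental_rerun() is deprecated in favor of st.rerun() in recent Streamlit releases, and Streamlit already reruns the script once an on_click callback returns, so the explicit rerun call can simply be dropped. A minimal sketch of an equivalent callback, assuming save_persistent_data is defined as above:

import streamlit as st

def clear_input_history():
    # Streamlit reruns the script automatically after an on_click
    # callback finishes, so no explicit rerun call is needed here.
    st.session_state['file_upload_history'] = []
    save_persistent_data(st.session_state['source_type_attempts'], [])

# st.button("Clear Input History", on_click=clear_input_history, type="secondary")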
  
+ # --- Initialize session state for attempts and encrypted text ---
  if 'source_type_attempts' not in st.session_state:
      attempts, history = load_persistent_data()
      st.session_state['source_type_attempts'] = attempts
+     st.session_state['file_upload_history'] = history
  
  if 'encrypted_text_to_process' not in st.session_state:
      st.session_state['encrypted_text_to_process'] = None
  
          key_bytes = key_str.encode('utf-8')
          return Fernet(key_bytes)
      except ValueError as ve:
+         st.error(f"Configuration Error: {ve}. Please ensure the 'FERNET_KEY' environment variable is set securely.")
          st.stop()
      except Exception as e:
          st.error(f"An unexpected error occurred while loading encryption key: {e}. Please check your key format and environment settings.")
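The hunk above assumes a valid Fernet key in the FERNET_KEY environment variable. A minimal sketch of generating one with the cryptography package, with a round-trip check of the kind the app's encrypt_text/decrypt_text helpers depend on:

from cryptography.fernet import Fernet

key = Fernet.generate_key()          # 32 url-safe base64-encoded bytes
print(key.decode("utf-8"))           # store this value as the FERNET_KEY secret

f = Fernet(key)
token = f.encrypt("sample text".encode("utf-8"))
assert f.decrypt(token).decode("utf-8") == "sample text"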
  
  expander = st.expander("**Important notes on the English Keyphrase TXT & URL Entity Finder**")
  expander.write('''
  **Named Entities:** This English Keyphrase TXT & URL Entity Finder extracts keyphrases from English academic and scientific papers.
+ 
  Results are presented in an easy-to-read table, visualized in an interactive bar chart and tree map, and are available for download along with a Glossary of tags.
  
  **How to Use:**
  
  2. Alternatively, type or paste text directly into the text area and press Ctrl + Enter.
  3. Or, upload your TXT file.
  
+ **Usage Limits:** You can request results up to 10 times.
+ 
  **Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.
+ 
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
+ 
  For any errors or inquiries, please contact us at [email protected]
+ ''')
  
  # --- Sidebar Content ---
  with st.sidebar:
+     container = st.container(border=True)
+     container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
+ 
+     st.subheader("Persistent Data", divider="rainbow")
+     st.info(f"Requests remaining today: **{max_attempts - st.session_state['source_type_attempts']}**")
  
+     if st.session_state['file_upload_history']:
+         st.subheader("File & URL History", divider="rainbow")
+         history_df = pd.DataFrame(st.session_state['file_upload_history'])
          st.dataframe(history_df, use_container_width=True, hide_index=True)
+         st.button("Clear Input History", on_click=clear_input_history_and_rerun, type="secondary")
  
+     st.subheader("Related NER Web Apps", divider="rainbow")
+     st.link_button("Scandinavian JSON Entity Finder", "https://nlpblogs.com/shop/named-entity-recognition-ner/scandinavian-json-entity-finder/", type="primary")
  
  # --- Input Fields ---
+ def clear_inputs():
      st.session_state.url = ""
      st.session_state.my_text_area = ""
+     st.session_state['uploaded_file_content'] = None
+     st.session_state['encrypted_text_to_process'] = None
      st.session_state['file_uploader_key'] += 1
+     st.experimental_rerun()
  
  url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
  text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
  uploaded_file = st.file_uploader("Or upload a .txt file", type=["txt"], key=f"file_uploader_{st.session_state['file_uploader_key']}")
+ st.button("Clear All Inputs", on_click=clear_inputs)
  
  source_type = None
  current_run_text = None
  
+ if uploaded_file is not None and st.session_state.get('uploaded_file_content') is None:
      source_type = 'file'
+     try:
+         string_data = io.StringIO(uploaded_file.getvalue().decode("utf-8")).read()
+         current_run_text = string_data
+         st.session_state['uploaded_file_content'] = current_run_text
+         st.session_state['file_upload_history'].append({
+             'source_type': 'file',
+             'filename': uploaded_file.name,
+             'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
+         })
+         save_persistent_data(st.session_state['source_type_attempts'], st.session_state['file_upload_history'])
+         st.success("TXT file uploaded successfully. File content encrypted and secured. Due to security protocols, the file content is hidden.")
+         st.divider()
+         st.write("**Input text content (from uploaded file)**")
+         st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
+     except Exception as e:
+         st.error(f"Error processing uploaded file: {e}")
+         current_run_text = None
  elif url:
      source_type = 'url'
      if not url.startswith(("http://", "https://")):
          st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
          current_run_text = None
  
          f.raise_for_status()
          soup = BeautifulSoup(f.text, 'html.parser')
          current_run_text = soup.get_text(separator=' ', strip=True)
+         st.session_state['file_upload_history'].append({
+             'source_type': 'url',
+             'filename': url,
+             'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
+         })
+         save_persistent_data(st.session_state['source_type_attempts'], st.session_state['file_upload_history'])
          st.divider()
          st.write("**Input text content (from URL)**")
          st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
  
          current_run_text = None
  elif text:
      source_type = 'text'
      current_run_text = text
      st.divider()
      st.write("**Input text content (from text area)**")
      st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
  
  if current_run_text and current_run_text.strip():
+     if st.session_state.get('encrypted_text_to_process') is None:
+         st.session_state['encrypted_text_to_process'] = encrypt_text(current_run_text)
  else:
      st.session_state['encrypted_text_to_process'] = None
+     if uploaded_file is None:
+         st.session_state['uploaded_file_content'] = None
+         st.session_state['file_uploader_key'] += 1
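The requests.get call itself sits outside this hunk; the surrounding context (f.raise_for_status(), BeautifulSoup(f.text, 'html.parser')) suggests a fetch roughly like the sketch below. The helper name and the timeout value are assumptions, not taken from the diff:

import requests
from bs4 import BeautifulSoup

def fetch_visible_text(url: str) -> str:
    # Fetch the page and strip the HTML down to its visible text,
    # mirroring the hunk context above. The timeout is an assumption.
    f = requests.get(url, timeout=10)
    f.raise_for_status()
    soup = BeautifulSoup(f.text, 'html.parser')
    return soup.get_text(separator=' ', strip=True)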
  
  # --- Main Processing Logic (triggered by input or refresh) ---
  experiment = None
  start_time_overall = None
+ 
+ if st.button("Analyze Text", type="primary") and st.session_state['encrypted_text_to_process']:
+     try:
          start_time_overall = time.time()
  
          if st.session_state['source_type_attempts'] >= max_attempts:
+             st.error(f"You have requested results {max_attempts} times. You have reached your request limit.")
+             st.stop()
+ 
+         st.session_state['source_type_attempts'] += 1
+         save_persistent_data(st.session_state['source_type_attempts'], st.session_state['file_upload_history'])
+ 
+         @st.cache_resource
+         def load_ner_model():
+             return pipeline("token-classification",
+                             model="ml6team/keyphrase-extraction-kbir-inspec",
+                             aggregation_strategy="max",
+                             stride=128,
+                             ignore_labels=["O"])
+ 
+         model = load_ner_model()
+         text_for_ner = decrypt_text(st.session_state['encrypted_text_to_process'])
+ 
+         if text_for_ner and len(text_for_ner.strip()) > 0:
+             with st.spinner("Analyzing text...", show_time=True):
+                 entities = model(text_for_ner)
+                 data = []
+                 if entities:
+                     for entity in entities:
+                         if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
+                             data.append({
+                                 'word': entity['word'],
+                                 'entity_group': entity['entity_group'],
+                                 'score': entity['score'],
+                                 'start': entity['start'],
+                                 'end': entity['end']
+                             })
                          else:
+                             st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
+                     df = pd.DataFrame(data)
+                 else:
+                     df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
+ 
+             if not df.empty:
+                 pattern = r'[^\w\s]'
+                 df['word'] = df['word'].replace(pattern, '', regex=True)
+                 df = df.replace('', 'Unknown')
+ 
+                 st.subheader("All Extracted Keyphrases", divider="rainbow")
+                 st.dataframe(df, use_container_width=True)
+ 
+                 with st.expander("See Glossary of tags"):
+                     st.write('''
+                     **word**: ['entity extracted from your text data']
+ 
+                     **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+ 
+                     **entity_group**: ['label (tag) assigned to a given extracted entity']
+ 
+                     **start**: ['index of the start of the corresponding entity']
+ 
+                     **end**: ['index of the end of the corresponding entity']
+ 
+                     ''')
+                 st.divider()
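For readers unfamiliar with the pipeline used above: a transformers token-classification pipeline with an aggregation strategy returns one dict per aggregated span, carrying exactly the keys the loop checks for. A minimal offline sketch of the same post-processing; the sample entities (and the 'KEY' entity_group, typical of keyphrase-extraction models) are illustrative:

import pandas as pd

# Illustrative output shape of a token-classification pipeline with
# an aggregation strategy: one dict per aggregated entity span.
entities = [
    {'word': 'keyphrase extraction', 'entity_group': 'KEY', 'score': 0.98, 'start': 0, 'end': 20},
    {'word': 'token classification', 'entity_group': 'KEY', 'score': 0.95, 'start': 34, 'end': 54},
]

rows = [e for e in entities if all(k in e for k in ('word', 'entity_group', 'score', 'start', 'end'))]
df = pd.DataFrame(rows)
df['word'] = df['word'].replace(r'[^\w\s]', '', regex=True)  # same punctuation strip as above
print(df)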
  
+             st.subheader("Most Frequent Keyphrases", divider="rainbow")
+             word_counts = df['word'].value_counts().reset_index()
+             word_counts.columns = ['word', 'count']
+             df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
+ 
+             if not df_frequent.empty:
+                 tab1, tab2 = st.tabs(["Table", "Chart"])
+ 
+                 with tab1:
+                     st.dataframe(df_frequent, use_container_width=True)
+ 
+                 with tab2:
+                     fig_frequent_bar = px.bar(
+                         df_frequent,
+                         x='count',
+                         y='word',
+                         orientation='h',
+                         title='Top Frequent Keyphrases by Count',
+                         color='count',
+                         color_continuous_scale=px.colors.sequential.Viridis
+                     )
+                     fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
+                     st.plotly_chart(fig_frequent_bar, use_container_width=True)
+ 
+                     if comet_initialized and experiment:
+                         experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
              else:
+                 st.info("No keyphrases found with more than one occurrence to display in tabs.")
+ 
+             st.divider()
+ 
+             if comet_initialized:
+                 experiment = Experiment(
+                     api_key=COMET_API_KEY,
+                     workspace=COMET_WORKSPACE,
+                     project_name=COMET_PROJECT_NAME,
+                 )
+                 experiment.log_parameter("input_source_type", source_type)
+                 experiment.log_parameter("input_content_length", len(text_for_ner))
+                 experiment.log_table("predicted_entities", df)
+ 
+             st.subheader("Treemap of All Keyphrases", divider="rainbow")
+             fig_treemap = px.treemap(
+                 df,
+                 path=[px.Constant("all"), 'entity_group', 'word'],
+                 values='score',
+                 color='word',
+                 color_continuous_scale=px.colors.sequential.Plasma
+             )
+             fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+             st.plotly_chart(fig_treemap, use_container_width=True)
+ 
+             if comet_initialized and experiment:
+                 experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
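One ordering quirk in the added code: the bar chart's "comet_initialized and experiment" guard runs before the Experiment is created further down, so experiment is still None there and that first log_figure never fires. A sketch of creating the experiment up front instead, using the same names and env-driven settings as in the diff:

from comet_ml import Experiment

experiment = None
if comet_initialized:
    # Create the experiment before any plotting so the
    # `comet_initialized and experiment` guards can actually fire.
    experiment = Experiment(
        api_key=COMET_API_KEY,
        workspace=COMET_WORKSPACE,
        project_name=COMET_PROJECT_NAME,
    )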
  
              # --- Download Section ---
              dfa = pd.DataFrame(
  
                  mime="application/zip",
              )
              st.divider()
+             else:
+                 st.warning("No entities found to generate visualizations.")
+         else:
+             st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
+     except Exception as e:
+         st.error(f"An unexpected error occurred during processing: {e}")
+     finally:
+         if comet_initialized and experiment is not None:
+             try:
+                 experiment.end()
+             except Exception as comet_e:
+                 st.warning(f"Comet ML experiment.end() failed: {comet_e}")
+         if start_time_overall is not None:
+             end_time_overall = time.time()
+             elapsed_time_overall = end_time_overall - start_time_overall
+             st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
+         st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
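The Download Section is mostly elided by the diff, but the mime="application/zip" context plus the io and zipfile imports point to an in-memory zip handed to st.download_button. A hypothetical sketch under those assumptions; the archive file names are illustrative:

import io
import zipfile

def build_zip_bytes(results_df, glossary_df) -> bytes:
    # Bundle the results table and the Glossary of tags into one in-memory zip.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("keyphrases.csv", results_df.to_csv(index=False))
        zf.writestr("glossary.csv", glossary_df.to_csv(index=False))
    return buf.getvalue()

# st.download_button("Download results", data=build_zip_bytes(df, dfa),
#                    file_name="results.zip", mime="application/zip")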