Maria Tsilimos committed on
Commit 6b00951 · unverified · 1 Parent(s): 2300708

Update app.py

Files changed (1)
  1. app.py +142 -75
app.py CHANGED
@@ -2,8 +2,6 @@ import time
import streamlit as st
import pandas as pd
import io
-
- from streamlit_extras.stylable_container import stylable_container
import plotly.express as px
import zipfile
import os
@@ -15,6 +13,7 @@ from gliner import GLiNER
from PyPDF2 import PdfReader
import docx
from comet_ml import Experiment
+ from streamlit_extras.stylable_container import stylable_container

st.set_page_config(layout="wide", page_title="Named Entity Recognition App")

@@ -34,32 +33,89 @@ if 'file_upload_attempts' not in st.session_state:
if 'encrypted_extracted_text' not in st.session_state:
    st.session_state['encrypted_extracted_text'] = None

-
-
max_attempts = 10


- GLINER_LABELS = ["Person", "Organization", "Phone number", "Address", "Passport number",
-     "Email", "Credit card number", "Social security number", "Health insurance ID number",
-     "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF",
-     "Driver license number", "Tax identification number", "Medical condition",
-     "Identity card number", "National ID number", "IP address", "IBAN",
-     "Credit card expiration date", "Username", "Health insurance number",
-     "Registration number", "Student ID number", "Insurance number", "Flight number",
-     "Landline phone number", "Blood type", "CVV", "Reservation number",
-     "Digital signature", "Social media handle", "License plate number",
-     "CNPJ", "Postal code", "Passport_number", "Serial number", "Vehicle registration number",
-     "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number",
-     "Transaction number", "National health insurance number", "CVC", "Birth certificate number",
-     "Train ticket number", "Passport expiration date", "Social_security_number"]
-
-
-
+ GLINER_LABELS_CATEGORIZED = {
+     "Personal Identifiers": [
+         "Person",
+         "Date of birth",
+         "Blood type",
+         "Digital signature",
+         "Social media handle",
+         "Username",
+         "Birth certificate number",
+     ],
+     "Contact Details": [
+         "Address",
+         "Phone number",
+         "Mobile phone number",
+         "Landline phone number",
+         "Email",
+         "Fax number",
+         "Postal code",
+     ],
+     "Financial & Payment": [
+         "Credit card number",
+         "Credit card expiration date",
+         "CVV",
+         "CVC",
+         "Bank account number",
+         "IBAN",
+         "Transaction number",
+         "Credit card brand",
+     ],
+     "Government & Official IDs": [
+         "Passport number",
+
+         "Social security number",
+
+         "CPF",
+         "Driver license number",
+         "Tax identification number",
+         "Identity card number",
+         "National ID number",
+         "Identity document number",
+         "Visa number",
+         "License plate number",
+         "CNPJ",
+         "Registration number",
+         "Student ID number",
+         "Passport expiration date",
+     ],
+     "Medical & Health": [
+         "Medication",
+         "Medical condition",
+         "Health insurance ID number",
+         "Health insurance number",
+         "National health insurance number",
+     ],
+     "Travel & Transport": [
+         "Flight number",
+         "Reservation number",
+         "Train ticket number",
+         "Vehicle registration number",
+     ],
+     "General Business & Other": [
+         "Organization",
+         "Insurance company",
+         "IP address",
+         "Serial number",
+         "Insurance number",
+     ]
+ }
+
+ # Flatten the categorized labels into a single list for GLiNER model input
+ GLINER_LABELS_FLAT = [label for category_labels in GLINER_LABELS_CATEGORIZED.values() for label in category_labels]
+
+ # Create a mapping from each specific label to its category for DataFrame processing
+ LABEL_TO_CATEGORY_MAP = {label: category for category, labels in GLINER_LABELS_CATEGORIZED.items() for label in labels}

@st.cache_resource
def load_ner_model():
    """
-     Loads the pre-trained GLiNER NER model (urchade/gliner_multi_pii-v1) and caches it.
+     Loads the pre-trained GLiNER NER model (urchade/gliner_multi_pii-v1) and
+     caches it.
    This model is suitable for a wide range of custom entity types.
    """
    try:
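The two comprehensions added in this hunk flatten the categories into the single label list the model call needs, and invert them into a label-to-category lookup for tagging DataFrame rows later. A minimal standalone sketch of the same pattern, using a made-up two-category subset (names here are illustrative only):

LABELS_BY_CATEGORY = {
    "Contact Details": ["Email", "Phone number"],
    "Financial & Payment": ["IBAN", "Credit card number"],
}

# Single flat list, since the model expects one list of label strings.
FLAT_LABELS = [label for labels in LABELS_BY_CATEGORY.values() for label in labels]

# Reverse lookup: label -> category (a label listed under two categories would keep the last one).
LABEL_TO_CATEGORY = {label: cat for cat, labels in LABELS_BY_CATEGORY.items() for label in labels}

print(FLAT_LABELS)                # ['Email', 'Phone number', 'IBAN', 'Credit card number']
print(LABEL_TO_CATEGORY["IBAN"])  # Financial & Payment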
@@ -116,34 +172,32 @@ def decrypt_text(encrypted_bytes: bytes) -> str | None:
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

- expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**") # Updated title
+
+
+
+ expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
expander.write(f'''
- **Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: {", ".join([f'"{label}"' for label in GLINER_LABELS])}.
-
- Results are presented in an easy-to-read table, visualized in an interactive tree map,
- pie chart, and bar chart, and are available for download along with a Glossary of tags.
+ **Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"

+ Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags.
**Supported languages:** English, French, German, Spanish, Portuguese, Italian

- **How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button
- to extract and tag entities in your text data.
+ **How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.

**Usage Limits:** You can request results up to 10 times.

- **Language settings:** Please check and adjust the language settings in
- your computer, so the French, German, Spanish, Portuguese and Italian
- characters are handled properly in your downloaded file.
+ **Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.

- **Customization:** To change the app's background color to white or
- black, click the three-dot menu on the right-hand side of your app, go to
- Settings and then Choose app theme, colors and fonts.
+ **Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.

- **Technical issues:** If your connection times out, please refresh the
- page or reopen the app's URL.
+ **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.

For any errors or inquiries, please contact us at [email protected]
''')

+
+
+
with st.sidebar:
    container = st.container(border=True)
    container.write("**Named Entity Recognition (NER)** is the task of "
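The notes above describe the upload flow; the extraction step itself sits in unchanged code outside this diff. With the libraries the app already imports (PyPDF2 and python-docx), the usual pattern looks roughly like this hypothetical sketch:

from PyPDF2 import PdfReader
import docx

def extract_text(path: str) -> str:
    # Hypothetical helper; the app's real extraction logic is not part of this commit.
    if path.lower().endswith(".pdf"):
        reader = PdfReader(path)
        # extract_text() can return None for image-only pages, hence the "or ''".
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if path.lower().endswith(".docx"):
        document = docx.Document(path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    raise ValueError("Only PDF and DOCX files are supported.")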
@@ -222,8 +276,8 @@ if st.button("Results"):

# Measure NER model processing time
start_time_ner = time.time()
- # Use GLiNER's predict_entities method with the defined labels
- text_entities = model.predict_entities(text_for_ner, GLINER_LABELS)
+ # Use GLiNER's predict_entities method with the defined flat list of labels
+ text_entities = model.predict_entities(text_for_ner, GLINER_LABELS_FLAT)
end_time_ner = time.time()
ner_processing_time = end_time_ner - start_time_ner

@@ -235,18 +289,18 @@ if st.button("Results"):
else:
    st.error("Unexpected GLiNER output structure. Please check the model's output format.")
    st.stop()
-

-
# Replace empty strings with 'Unknown' and drop rows with NaN after cleaning
df = df.replace('', 'Unknown').dropna()

if df.empty:
    st.warning("No entities were extracted from the uploaded text.")
    st.stop()
-
-

+ # --- Add 'category' column to the DataFrame based on the grouped labels ---
+ df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
+ # Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
+ df['category'] = df['category'].fillna('Uncategorized')

if comet_initialized:
    experiment = Experiment(
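The map/fillna pair added above behaves like this on a toy frame (all values below are made up): any entity_group missing from the lookup ends up in an explicit 'Uncategorized' bucket instead of remaining NaN.

import pandas as pd

df = pd.DataFrame({
    "entity_group": ["Email", "IBAN", "Some unmapped label"],
    "word": ["jane.doe@example.com", "DE89 3704 0044 0532 0130 00", "???"],
})
label_to_category = {"Email": "Contact Details", "IBAN": "Financial & Payment"}

# map() yields NaN for labels not in the lookup; fillna() makes that case visible.
df["category"] = df["entity_group"].map(label_to_category).fillna("Uncategorized")
print(df[["entity_group", "category"]])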
@@ -266,7 +320,7 @@ if st.button("Results"):
st.dataframe(df_styled, use_container_width=True)

with st.expander("See Glossary of tags"):
-     st.write('''
+     st.write("""
    '**word**': ['entity extracted from your text data']

    '**score**': ['accuracy score; how accurately a tag has been assigned to
@@ -277,40 +331,42 @@ if st.button("Results"):
    '**start**': ['index of the start of the corresponding entity']

    '**end**': ['index of the end of the corresponding entity']
-     ''')
+     '**category**': ['the broader category this entity belongs to']
+     """)


- st.subheader("Grouped entities", divider = "orange")
-
- entity_items = [(label, label.replace('_', ' ').title()) for label in GLINER_LABELS]
- tabs_per_row = 5
- for i in range(0, len(entity_items), tabs_per_row):
-     current_row_entities = entity_items[i : i + tabs_per_row]
-     tab_titles = [item[1] for item in current_row_entities]
-
-     tabs = st.tabs(tab_titles)
-     for j, (entity_group_key, tab_title) in enumerate(current_row_entities):
-         with tabs[j]:
-             if entity_group_key in df["entity_group"].unique():
-                 df_filtered = df[df["entity_group"] == entity_group_key]
-                 st.dataframe(df_filtered, use_container_width=True)
-             else:
-                 st.info(f"No '{tab_title}' entities found in the text.")
-                 # Display an empty DataFrame for consistency if no entities are found
-                 st.dataframe(pd.DataFrame({
-                     'entity_group': [entity_group_key],
-                     'score': [np.nan],
-                     'word': [np.nan],
-                     'start': [np.nan],
-                     'end': [np.nan]
-                 }), hide_index=True)
+ st.subheader("Grouped Entities by Category", divider = "orange")
+
+ # Create tabs for each category
+ category_names = list(GLINER_LABELS_CATEGORIZED.keys())
+ category_tabs = st.tabs(category_names)
+
+ for i, category_name in enumerate(category_names):
+     with category_tabs[i]:
+
+
+         # Filter the main DataFrame for the current category
+         df_category_filtered = df[df['category'] == category_name]
+
+         if not df_category_filtered.empty:
+             # Sort entities within the category by their specific type for better display
+             for entity_type in GLINER_LABELS_CATEGORIZED[category_name]:
+                 df_entity_type_filtered = df_category_filtered[df_category_filtered['entity_group'] == entity_type]
+                 if not df_entity_type_filtered.empty:
+                     st.markdown(f"***{entity_type}***")
+                     st.dataframe(df_entity_type_filtered.drop(columns=['category']), use_container_width=True)
+                 else:
+                     st.info(f"No '{entity_type}' entities found for this category.")
+         else:
+             st.info(f"No entities found for the '{category_name}' category.")

st.divider()

# --- Visualizations ---
st.subheader("Tree map", divider="orange")
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'], # Changed path for better visual grouping
-     values='score', color='entity_group')
+ # Update treemap path to include category
+ fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
+     values='score', color='category') # Color by category for better visual distinction
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
st.plotly_chart(fig_treemap)
if comet_initialized:
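One point worth noting about the treemap call above: with values='score', Plotly sizes each sector by the sum of confidence scores inside its branch, so the category/entity_group/word hierarchy is weighted by total confidence rather than by entity count. A small self-contained sketch with made-up rows:

import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    "category": ["Contact Details", "Contact Details", "Financial & Payment"],
    "entity_group": ["Email", "Phone number", "IBAN"],
    "word": ["jane.doe@example.com", "+44 20 7946 0958", "DE89 3704 0044 0532 0130 00"],
    "score": [0.98, 0.91, 0.95],
})

# Same path/values/color pattern as the hunk above.
fig = px.treemap(df, path=[px.Constant("all"), "category", "entity_group", "word"],
                 values="score", color="category")
fig.show()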
@@ -321,32 +377,43 @@ if st.button("Results"):

col1, col2 = st.columns(2)
with col1:
-     st.subheader("Pie Chart", divider="orange")
+     st.subheader("Pie Chart (by Entity Type)", divider="orange")
    fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
-         hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted labels')
+         hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels (Entity Types)')
    fig_pie.update_traces(textposition='inside', textinfo='percent+label')
    st.plotly_chart(fig_pie)
    if comet_initialized:
        experiment.log_figure(figure=fig_pie, figure_name="label_pie_chart")

with col2:
-     st.subheader("Bar Chart", divider="orange")
+     st.subheader("Bar Chart (by Entity Type)", divider="orange")
    fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
-         title='Occurrences of predicted labels')
+         title='Occurrences of Predicted Labels (Entity Types)', orientation='h')
+     fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
    st.plotly_chart(fig_bar)
    if comet_initialized:
        experiment.log_figure(figure=fig_bar, figure_name="label_bar_chart")

+ # Add a chart for categories
+ st.subheader("Entity Counts by Category", divider="orange")
+ category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
+ fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
+     title='Occurrences of Entities by Category', orientation='h')
+ fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
+ st.plotly_chart(fig_cat_bar)
+
+
# --- Downloadable Content ---
dfa = pd.DataFrame(
    data={
-         'Column Name': ['word', 'entity_group','score', 'start', 'end'],
+         'Column Name': ['word', 'entity_group', 'score', 'start', 'end', 'category'],
        'Description': [
            'entity extracted from your text data',
            'label (tag) assigned to a given extracted entity',
            'accuracy score; how accurately a tag has been assigned to a given entity',
            'index of the start of the corresponding entity',
            'index of the end of the corresponding entity',
+             'the broader category this entity belongs to',
        ]
    }
)
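A note on the category_counts line added above: the column names produced by value_counts().reset_index() depend on the pandas version, so the rename is only a no-op on recent releases. A quick check, with the behaviour described for pandas 2.x (on 1.x the columns come back as ['index', 'category'] and the counts sit in 'category', so the rename mapping would need to differ):

import pandas as pd

s = pd.Series(["Contact Details", "Contact Details", "Medical & Health"], name="category")
counts = s.value_counts().reset_index()

# pandas >= 2.0 prints ['category', 'count'], which is what the px.bar call expects.
print(counts.columns.tolist())
print(counts)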
 