Maria Tsilimos committed on
Commit
67a9d8b
·
unverified ·
1 Parent(s): c4395b7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +379 -0
app.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import io
5
+
6
+ from streamlit_extras.stylable_container import stylable_container
7
+ import plotly.express as px
8
+ import zipfile
9
+ import os
10
+ import re
11
+ import numpy as np
12
+
13
+ from cryptography.fernet import Fernet
14
+ from gliner import GLiNER
15
+ from PyPDF2 import PdfReader
16
+ import docx
17
+ from comet_ml import Experiment
18
+
19
st.set_page_config(page_title="Named Entity Recognition App", layout="wide")

# --- Configuration ---
# Comet ML settings are read from the environment; logging is optional.
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")

# Comet logging is switched on only when all three settings are non-empty.
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)

# --- Initialize session state ---
# Counter of "Results" requests made so far.
if 'file_upload_attempts' not in st.session_state:
    st.session_state['file_upload_attempts'] = 0

# Encrypted bytes of the most recently extracted document text (or None).
if 'encrypted_extracted_text' not in st.session_state:
    st.session_state['encrypted_extracted_text'] = None

# Maximum number of "Results" requests allowed.
max_attempts = 10
40
+
41
+
42
# Entity labels passed to the GLiNER model; the order also drives the order
# of the "Grouped entities" tabs.
GLINER_LABELS = [
    "Person",
    "Organization",
    "Phone number",
    "Address",
    "Passport number",
    "Email",
    "Credit card number",
    "Social security number",
    "Health insurance ID number",
    "Date of birth",
    "Mobile phone number",
    "Bank account number",
    "Medication",
    "CPF",
    "Driver license number",
    "Tax identification number",
    "Medical condition",
    "Identity card number",
    "National ID number",
    "IP address",
    "IBAN",
    "Credit card expiration date",
    "Username",
    "Health insurance number",
    "Registration number",
    "Student ID number",
    "Insurance number",
    "Flight number",
    "Landline phone number",
    "Blood type",
    "CVV",
    "Reservation number",
    "Digital signature",
    "Social media handle",
    "License plate number",
    "CNPJ",
    "Postal code",
    "Passport_number",
    "Serial number",
    "Vehicle registration number",
    "Credit card brand",
    "Fax number",
    "Visa number",
    "Insurance company",
    "Identity document number",
    "Transaction number",
    "National health insurance number",
    "CVC",
    "Birth certificate number",
    "Train ticket number",
    "Passport expiration date",
    "Social_security_number",
]
55
+
56
+
57
+
58
+
59
@st.cache_resource
def load_ner_model():
    """Return the GLiNER model used for entity extraction.

    Loads the ``urchade/gliner_multi_pii-v1`` checkpoint through
    ``GLiNER.from_pretrained``. The function is wrapped in
    ``st.cache_resource``. If loading raises, the error is shown in the UI
    and the script run is halted with ``st.stop()``.
    """
    model_name = "urchade/gliner_multi_pii-v1"
    try:
        ner_model = GLiNER.from_pretrained(model_name)
    except Exception as load_error:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {load_error}")
        st.stop()
    else:
        return ner_model
70
+
71
@st.cache_resource
def load_encryption_key():
    """Build the Fernet cipher from the ``FERNET_KEY`` environment variable.

    Returns:
        A ``Fernet`` instance constructed from the UTF-8 encoded key string.

    A missing/empty key, or any ``ValueError`` raised while constructing
    the cipher, is reported as a configuration error; any other exception
    is reported as unexpected. In both cases the script run is halted with
    ``st.stop()``. The function is wrapped in ``st.cache_resource``.
    """
    try:
        raw_key = os.environ.get("FERNET_KEY")
        if not raw_key:
            raise ValueError("FERNET_KEY environment variable not set. Cannot perform encryption/decryption.")
        # Fernet is given the key as bytes, hence the UTF-8 encoding.
        cipher = Fernet(raw_key.encode('utf-8'))
    except ValueError as config_error:
        st.error(f"Configuration Error: {config_error}. Please ensure the 'FERNET_KEY' environment variable is set securely in your deployment environment (e.g., Hugging Face Spaces secrets, Render environment variables) or in a local .env file for development.")
        st.stop()  # Without a usable key nothing can be encrypted, so do not continue.
    except Exception as unexpected_error:
        st.error(f"An unexpected error occurred while loading encryption key: {unexpected_error}. Please check your key format and environment settings.")
        st.stop()
    else:
        return cipher


# Module-level cipher used by encrypt_text() and decrypt_text().
fernet = load_encryption_key()
96
+
97
def encrypt_text(text_content: str) -> bytes:
    """Return the Fernet token for ``text_content``.

    The string is UTF-8 encoded and then encrypted with the module-level
    ``fernet`` cipher.
    """
    plaintext_bytes = text_content.encode('utf-8')
    return fernet.encrypt(plaintext_bytes)
103
+
104
def decrypt_text(encrypted_bytes: bytes) -> str | None:
    """Decrypt a Fernet token back into text.

    Returns:
        The UTF-8 decoded plaintext, or ``None`` when decryption or
        decoding raises; in that case the error is also shown in the UI.
    """
    try:
        plaintext_bytes = fernet.decrypt(encrypted_bytes)
        return plaintext_bytes.decode('utf-8')
    except Exception as decrypt_error:
        st.error(f"Decryption failed. This might indicate data tampering or an incorrect encryption key. Error: {decrypt_error}")
        return None
114
+
115
# --- UI Elements ---
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Every supported label, double-quoted and comma-separated, for the notes text.
quoted_labels = ", ".join(f'"{label}"' for label in GLINER_LABELS)

notes_expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
notes_expander.write(f'''
**Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: {quoted_labels}.

Results are presented in an easy-to-read table, visualized in an interactive tree map,
pie chart, and bar chart, and are available for download along with a Glossary of tags.
**Supported languages** English, French, German, Spanish, Portuguese, Italian

**How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button
to extract and tag entities in your text data.

**Usage Limits:** You can request results up to 10 times.

**Language settings:** Please check and adjust the language settings in
your computer, so the French, German, Spanish, Portuguese and Italian
characters are handled properly in your downloaded file.

**Customization:** To change the app's background color to white or
black, click the three-dot menu on the right-hand side of your app, go to
Settings and then Choose app theme, colors and fonts.

**Technical issues:** If your connection times out, please refresh the
page or reopen the app's URL.

For any errors or inquiries, please contact us at [email protected]
''')

# Sidebar: short NER explainer plus a link to a related app.
with st.sidebar:
    ner_intro = (
        "**Named Entity Recognition (NER)** is the task of "
        "extracting and tagging entities in text data. Entities can be persons, "
        "organizations, locations, countries, products, events etc."
    )
    about_box = st.container(border=True)
    about_box.write(ner_intro)
    st.subheader("Related NER Web Apps", divider="orange")
    st.link_button(
        "Scandinavian JSON Entity Finder",
        "https://nlpblogs.com/shop/named-entity-recognition-ner/scandinavian-json-entity-finder/",
        type="primary",
    )
155
+
156
# --- File Upload (PDF/DOCX) ---
# Reads the uploaded document, extracts its text, and keeps only an encrypted
# copy of that text in st.session_state for the "Results" step.
uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])

# Text extracted during this script run; stays None unless extraction succeeds.
current_run_text = None

if uploaded_file is not None:
    file_extension = uploaded_file.name.split('.')[-1].lower()
    if file_extension == 'pdf':
        try:
            pdf_reader = PdfReader(uploaded_file)
            # `or ""` keeps the join safe should a page yield no text.
            current_run_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        except Exception as e:
            st.error(f"An error occurred while reading PDF: {e}")
            current_run_text = None
    elif file_extension == 'docx':
        try:
            doc = docx.Document(uploaded_file)
            current_run_text = "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            st.error(f"An error occurred while reading DOCX: {e}")
            current_run_text = None
    else:
        st.warning("Unsupported file type. Please upload a .pdf or .docx file.")
        current_run_text = None

    if current_run_text and current_run_text.strip():
        # --- ENCRYPT THE EXTRACTED TEXT BEFORE STORING IN SESSION STATE ---
        st.session_state['encrypted_extracted_text'] = encrypt_text(current_run_text)
        # Report success only once the text really has been encrypted and
        # stored, so the message can never be followed by the error below.
        st.success(f"{file_extension.upper()} file uploaded successfully. File content encrypted and secured. Due to security protocols, the file content is hidden.")

        st.divider()
    else:
        st.session_state['encrypted_extracted_text'] = None
        st.error("Could not extract meaningful text from the uploaded file.")
197
+
198
# --- Results Button and Processing Logic ---
# On click: enforce the request cap, decrypt the stored text, run GLiNER,
# show the entities (table, per-label tabs, charts), offer a zip download,
# and optionally log everything to Comet ML.
if st.button("Results"):
    start_time_overall = time.time()  # Start time for overall processing
    if not comet_initialized:
        st.warning("Comet ML not initialized. Check environment variables if you wish to log data.")

    # The request counter lives in st.session_state; stop once the cap is hit.
    if st.session_state['file_upload_attempts'] >= max_attempts:
        st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
        st.stop()

    # --- DECRYPT THE TEXT BEFORE PASSING TO NER MODEL ---
    text_for_ner = None
    if st.session_state['encrypted_extracted_text'] is not None:
        text_for_ner = decrypt_text(st.session_state['encrypted_extracted_text'])

    if text_for_ner is None or not text_for_ner.strip():
        st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
        st.stop()

    st.session_state['file_upload_attempts'] += 1

    with st.spinner("Analyzing text...", show_time=True):
        model = load_ner_model()

        # Measure NER model processing time
        start_time_ner = time.time()
        text_entities = model.predict_entities(text_for_ner, GLINER_LABELS)
        ner_processing_time = time.time() - start_time_ner

        # An empty prediction list produces a DataFrame without columns, so
        # it must be handled before the 'label' column check below; otherwise
        # "no entities" is misreported as an unexpected output structure.
        if not text_entities:
            st.warning("No entities were extracted from the uploaded text.")
            st.stop()

        df = pd.DataFrame(text_entities)

        # Rename 'label' to 'entity_group' and 'text' to 'word' for consistency
        if 'label' in df.columns:
            df.rename(columns={'label': 'entity_group', 'text': 'word'}, inplace=True)
        else:
            st.error("Unexpected GLiNER output structure. Please check the model's output format.")
            st.stop()

        # Replace empty strings with 'Unknown' and drop rows with NaN after cleaning
        df = df.replace('', 'Unknown').dropna()

        if df.empty:
            st.warning("No entities were extracted from the uploaded text.")
            st.stop()

        if comet_initialized:
            experiment = Experiment(
                api_key=COMET_API_KEY,
                workspace=COMET_WORKSPACE,
                project_name=COMET_PROJECT_NAME,
            )
            experiment.log_parameter("input_text_length", len(text_for_ner))
            experiment.log_table("predicted_entities", df)
            experiment.log_metric("ner_processing_time_seconds", ner_processing_time)

        # --- Display Results ---
        st.subheader("Extracted Entities", divider="rainbow")
        properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
        df_styled = df.style.set_properties(**properties)
        st.dataframe(df_styled, use_container_width=True)

        with st.expander("See Glossary of tags"):
            st.write('''
'**word**': ['entity extracted from your text data']

'**score**': ['accuracy score; how accurately a tag has been assigned to
a given entity']

'**entity_group**': ['label (tag) assigned to a given extracted entity']

'**start**': ['index of the start of the corresponding entity']

'**end**': ['index of the end of the corresponding entity']
''')

        st.subheader("Grouped entities", divider = "orange")

        # Labels actually present in the results, computed once for all tabs.
        found_groups = set(df["entity_group"])
        entity_items = [(label, label.replace('_', ' ').title()) for label in GLINER_LABELS]
        tabs_per_row = 5
        for i in range(0, len(entity_items), tabs_per_row):
            current_row_entities = entity_items[i : i + tabs_per_row]
            tabs = st.tabs([tab_title for _, tab_title in current_row_entities])
            for tab, (entity_group_key, tab_title) in zip(tabs, current_row_entities):
                with tab:
                    if entity_group_key in found_groups:
                        df_filtered = df[df["entity_group"] == entity_group_key]
                        st.dataframe(df_filtered, use_container_width=True)
                    else:
                        st.info(f"No '{tab_title}' entities found in the text.")
                        # Show a one-row placeholder table so every tab has the same layout
                        st.dataframe(pd.DataFrame({
                            'entity_group': [entity_group_key],
                            'score': [np.nan],
                            'word': [np.nan],
                            'start': [np.nan],
                            'end': [np.nan]
                        }), hide_index=True)

        st.divider()

        # --- Visualizations ---
        st.subheader("Tree map", divider="orange")
        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
                                 values='score', color='entity_group')
        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
        st.plotly_chart(fig_treemap)
        if comet_initialized:
            experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

        # Name the index and the counts explicitly so the frame has the columns
        # 'entity_group' and 'count' regardless of the installed pandas version.
        final_df_counts = (
            df['entity_group']
            .value_counts()
            .rename_axis("entity_group")
            .reset_index(name="count")
        )

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Pie Chart", divider="orange")
            fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
                             hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted labels')
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig_pie)
            if comet_initialized:
                experiment.log_figure(figure=fig_pie, figure_name="label_pie_chart")

        with col2:
            st.subheader("Bar Chart", divider="orange")
            fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
                             title='Occurrences of predicted labels')
            st.plotly_chart(fig_bar)
            if comet_initialized:
                experiment.log_figure(figure=fig_bar, figure_name="label_bar_chart")

        # --- Downloadable Content ---
        dfa = pd.DataFrame(
            data={
                'Column Name': ['word', 'entity_group','score', 'start', 'end'],
                'Description': [
                    'entity extracted from your text data',
                    'label (tag) assigned to a given extracted entity',
                    'accuracy score; how accurately a tag has been assigned to a given entity',
                    'index of the start of the corresponding entity',
                    'index of the end of the corresponding entity',
                ]
            }
        )

        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w") as myzip:
            myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
            myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
        # Materialize the archive once; it is used for both download and logging.
        zip_bytes = buf.getvalue()

        with stylable_container(
            key="download_button",
            css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
        ):
            st.download_button(
                label="Download zip file",
                data=zip_bytes,
                file_name="nlpblogs_ner_results.zip",
                mime="application/zip",
            )
        if comet_initialized:
            # Hand Comet a file-like object rather than raw bytes.
            experiment.log_asset(io.BytesIO(zip_bytes), file_name="downloadable_results.zip")

        st.divider()
        if comet_initialized:
            experiment.end()

    end_time_overall = time.time()  # End time for overall processing
    elapsed_time_overall = end_time_overall - start_time_overall
    st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")

    st.write(f"Number of times you requested results: **{st.session_state['file_upload_attempts']}/{max_attempts}**")