Maria Tsilimos committed on
Commit 17f7ba5 · unverified · 1 Parent(s): a92dba4

Create app.py

Files changed (1)
  1. app.py +369 -0
app.py ADDED
@@ -0,0 +1,369 @@
import requests
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline
import plotly.express as px
import time
import io
import os
import zipfile
import re
import numpy as np
from cryptography.fernet import Fernet
from streamlit_extras.stylable_container import stylable_container
from comet_ml import Experiment

st.set_page_config(layout="wide", page_title="English Keyphrase TXT & URL Entity Finder")

# --- Configuration for Comet ML ---
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = False
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
    comet_initialized = True

# --- Initialize session state for attempts and encrypted text ---
if 'source_type_attempts' not in st.session_state:
    st.session_state['source_type_attempts'] = 0
if 'encrypted_text_to_process' not in st.session_state:
    st.session_state['encrypted_text_to_process'] = None
if 'uploaded_file_content' not in st.session_state:
    st.session_state['uploaded_file_content'] = None  # To store content of uploaded file
if 'file_uploader_key' not in st.session_state:
    st.session_state['file_uploader_key'] = 0  # To reset the file uploader

max_attempts = 10

# --- Fernet Encryption Setup ---
@st.cache_resource
def load_encryption_key():
    try:
        key_str = os.environ.get("FERNET_KEY")
        if not key_str:
            raise ValueError("FERNET_KEY environment variable not set. Cannot perform encryption/decryption.")
        key_bytes = key_str.encode('utf-8')
        return Fernet(key_bytes)
    except ValueError as ve:
        st.error(f"Configuration Error: {ve} Please ensure the 'FERNET_KEY' environment variable is set securely in your deployment environment (e.g., Hugging Face Spaces secrets, Render environment variables) or in a local .env file for development.")
        st.stop()
    except Exception as e:
        st.error(f"An unexpected error occurred while loading encryption key: {e}. Please check your key format and environment settings.")
        st.stop()

# Initialize the Fernet cipher instance globally (cached)
fernet = load_encryption_key()

def encrypt_text(text_content: str) -> bytes:
    """Encrypts a string using the loaded Fernet cipher."""
    return fernet.encrypt(text_content.encode('utf-8'))

def decrypt_text(encrypted_bytes: bytes) -> str | None:
    """
    Decrypts bytes using the loaded Fernet cipher.
    Returns the decrypted string, or None if decryption fails.
    """
    try:
        return fernet.decrypt(encrypted_bytes).decode('utf-8')
    except Exception as e:
        st.error(f"Decryption failed. This might indicate data tampering or an incorrect encryption key. Error: {e}")
        return None

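# Note: the app assumes the FERNET_KEY secret already exists. A minimal sketch for
# generating a compatible key once, locally, with the standard cryptography API:
#
#     from cryptography.fernet import Fernet
#     print(Fernet.generate_key().decode("utf-8"))   # paste the output into the FERNET_KEY secret
#
# Keep the generated key out of version control; anyone holding it can decrypt the stored text.
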
# --- UI Header and Notes ---
st.subheader("English Keyphrase TXT & URL Entity Finder", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

expander = st.expander("**Important notes on the English Keyphrase TXT & URL Entity Finder**")
expander.write('''
**Named Entities:** This English Keyphrase TXT & URL Entity Finder extracts keyphrases from English academic and scientific papers.

Results are presented in an easy-to-read table, visualized in an interactive bar chart and tree map, and are available for download along with a Glossary of tags.

**How to Use:**
1. Paste a URL and press Enter.
2. Alternatively, type or paste text directly into the text area and press Ctrl + Enter.
3. Or, upload your TXT file.

**Usage Limits:** You can request results up to 10 times.

**Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings, and then choose the app theme, colors and fonts.

**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.

For any errors or inquiries, please contact us at [email protected]
''')

# --- Sidebar Content ---
with st.sidebar:
    container = st.container(border=True)
    container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events, etc.")
    st.subheader("Related NER Web Apps", divider="rainbow")
    st.link_button("Scandinavian JSON Entity Finder", "https://nlpblogs.com/shop/named-entity-recognition-ner/scandinavian-json-entity-finder/", type="primary")

# --- Input Fields ---
def clear_url_input():
    st.session_state.url = ""
    st.session_state.encrypted_text_to_process = None
    st.session_state.uploaded_file_content = None  # Clear file content as well
    st.session_state.my_text_area = ""  # Clear text area
    st.session_state['file_uploader_key'] += 1  # Increment key to reset the file uploader

def clear_text_input():
    st.session_state.my_text_area = ""
    st.session_state.encrypted_text_to_process = None
    st.session_state.uploaded_file_content = None  # Clear file content as well
    st.session_state.url = ""  # Clear URL
    st.session_state['file_uploader_key'] += 1  # Increment key to reset the file uploader

def clear_file_input():
    st.session_state.uploaded_file_content = None
    st.session_state.encrypted_text_to_process = None
    st.session_state.url = ""  # Clear URL
    st.session_state.my_text_area = ""  # Clear text area
    st.session_state['file_uploader_key'] += 1  # Increment key to reset the file uploader

url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
st.button("Clear URL", on_click=clear_url_input)

text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
st.button("Clear Text", on_click=clear_text_input)

uploaded_file = st.file_uploader("Or upload a .txt file", type=["txt"], key=f"file_uploader_{st.session_state['file_uploader_key']}")
st.button("Clear Uploaded File", on_click=clear_file_input)

source_type = None
input_content = None
current_run_text = None  # Holds the plain text for the current run before encryption

# --- Logic to determine input source and content ---
if uploaded_file is not None:
    source_type = 'file'
    input_content = uploaded_file.name  # Store the filename for logging
    # Read the content of the uploaded file
    string_data = io.StringIO(uploaded_file.getvalue().decode("utf-8")).read()
    current_run_text = string_data
    st.session_state['uploaded_file_content'] = current_run_text  # Store in session state for re-runs
    st.success("TXT file uploaded successfully. The file content is encrypted and secured before processing.")
    st.divider()
    st.write("**Input text content (from uploaded file)**")
    st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
elif url:
    source_type = 'url'
    input_content = url
    # Fetch the URL content; it is encrypted below before processing
    if not url.startswith(("http://", "https://")):
        st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
        current_run_text = None
    else:
        try:
            with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True):
                f = requests.get(url, timeout=10)
                f.raise_for_status()
                soup = BeautifulSoup(f.text, 'html.parser')
                current_run_text = soup.get_text(separator=' ', strip=True)
            st.divider()
            st.write("**Input text content (from URL)**")
            st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)
        except Exception as e:
            st.error(f"Error fetching or parsing URL: {e}")
            current_run_text = None
elif text:
    source_type = 'text'
    input_content = text
    current_run_text = text
    st.divider()
    st.write("**Input text content (from text area)**")
    st.write(current_run_text[:500] + "..." if len(current_run_text) > 500 else current_run_text)

# Encrypt and store the text in session state if available
if current_run_text and current_run_text.strip():
    st.session_state['encrypted_text_to_process'] = encrypt_text(current_run_text)
else:
    st.session_state['encrypted_text_to_process'] = None

# --- Main Processing Logic (triggered by input or refresh) ---
# Initialize these before the try block so they are always defined in `finally`
experiment = None
start_time_overall = None

try:  # Outer try block for general error handling and `finally` cleanup
    if source_type:  # Only proceed if there is an input source
        start_time_overall = time.time()

        if st.session_state['source_type_attempts'] >= max_attempts:
            st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
        else:
            st.session_state['source_type_attempts'] += 1

            # Create the Comet ML experiment before analysis so the figures and tables below can be logged to it
            if comet_initialized:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
                experiment.log_parameter("input_source_type", source_type)
                experiment.log_parameter("input_content_length", len(input_content) if isinstance(input_content, str) else len(str(input_content)))

            @st.cache_resource
            def load_ner_model():
                return pipeline(
                    "token-classification",
                    model="ml6team/keyphrase-extraction-kbir-inspec",
                    aggregation_strategy="max",
                    stride=128,
                    ignore_labels=["O"],
                )

            model = load_ner_model()

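            # With aggregation_strategy="max", the token-classification pipeline is expected to
            # return a list of dicts roughly of the form
            #   {'entity_group': ..., 'score': ..., 'word': ..., 'start': ..., 'end': ...}
            # (illustrative shape, not captured output), which is why those five keys are
            # validated before building the DataFrame below.
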
            # Decrypt text from session state before processing
            text_for_ner = None
            if st.session_state['encrypted_text_to_process'] is not None:
                text_for_ner = decrypt_text(st.session_state['encrypted_text_to_process'])

            if text_for_ner and len(text_for_ner.strip()) > 0:
                with st.spinner("Analyzing text...", show_time=True):
                    entities = model(text_for_ner)
                    data = []
                    if entities:
                        for entity in entities:
                            if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
                                data.append({
                                    'word': entity['word'],
                                    'entity_group': entity['entity_group'],
                                    'score': entity['score'],
                                    'start': entity['start'],
                                    'end': entity['end']
                                })
                            else:
                                st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
                        df = pd.DataFrame(data)
                    else:
                        df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])

                if not df.empty:
                    pattern = r'[^\w\s]'
                    df['word'] = df['word'].replace(pattern, '', regex=True)
                    df = df.replace('', 'Unknown')

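                    # Illustrative example: the regex above strips punctuation, so a keyphrase
                    # like "state-of-the-art!" becomes "stateoftheart", and a value reduced to
                    # an empty string is relabelled 'Unknown' by the line above.
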
                    st.subheader("All Extracted Keyphrases", divider="rainbow")
                    st.dataframe(df, use_container_width=True)  # Full dataframe of all extracted entities

                    # Glossary section shown in an expander
                    with st.expander("See Glossary of tags"):
                        st.write('''
'**word**': ['entity extracted from your text data']

'**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']

'**entity_group**': ['label (tag) assigned to a given extracted entity']

'**start**': ['index of the start of the corresponding entity']

'**end**': ['index of the end of the corresponding entity']
''')
                    st.divider()

                    # --- Most Frequent Keyphrases Section with Tabs ---
                    st.subheader("Most Frequent Keyphrases", divider="rainbow")
                    # Calculate the frequency of each keyphrase
                    word_counts = df['word'].value_counts().reset_index()
                    word_counts.columns = ['word', 'count']

                    # Keep keyphrases that appear more than once and show the top 15 for readability
                    df_frequent = word_counts[word_counts['count'] > 1].sort_values(by='count', ascending=False).head(15)

                    if not df_frequent.empty:
                        tab1, tab2 = st.tabs(["Table", "Chart"])

                        with tab1:
                            st.dataframe(df_frequent, use_container_width=True)

                        with tab2:
                            # Bar chart for frequent keyphrases
                            fig_frequent_bar = px.bar(
                                df_frequent,
                                x='count',
                                y='word',
                                orientation='h',
                                title='Top Frequent Keyphrases by Count',
                                color='count',  # Color bars based on count
                                color_continuous_scale=px.colors.sequential.Viridis,
                            )
                            fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})  # Sort bars by count
                            st.plotly_chart(fig_frequent_bar, use_container_width=True)

                            if comet_initialized and experiment:
                                experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                    else:
                        st.info("No keyphrases found with more than one occurrence to display in tabs.")

                    st.divider()

                    if comet_initialized and experiment:
                        experiment.log_table("predicted_entities", df)

                    # Treemap
                    st.subheader("Treemap of All Keyphrases", divider="rainbow")
                    fig_treemap = px.treemap(
                        df,
                        path=[px.Constant("all"), 'entity_group', 'word'],
                        values='score',
                        color='word',  # Color by 'word' so each keyphrase gets its own color
                        color_continuous_scale=px.colors.sequential.Plasma,  # Example color scale
                    )
                    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                    st.plotly_chart(fig_treemap, use_container_width=True)

                    if comet_initialized and experiment:
                        experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
                else:
                    st.warning("No entities found to generate visualizations.")

                # --- Download Section ---
                dfa = pd.DataFrame(
                    data={
                        'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
                        'Description': [
                            'entity extracted from your text data',
                            'label (tag) assigned to a given extracted entity',
                            'accuracy score; how accurately a tag has been assigned to a given entity',
                            'index of the start of the corresponding entity',
                            'index of the end of the corresponding entity'
                        ]
                    }
                )
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    if not df.empty:
                        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                        myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

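                # Note: buf.getvalue() is read only after the `with zipfile.ZipFile(...)` block
                # has exited, because the zip central directory is written when the archive is
                # closed; reading the buffer earlier would yield an incomplete zip file.
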
                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=buf.getvalue(),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()
            else:
                st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
except Exception as e:
    st.error(f"An unexpected error occurred: {e}")
finally:
    if comet_initialized and experiment is not None:
        try:
            experiment.end()
        except Exception as comet_e:
            st.warning(f"Comet ML experiment.end() failed: {comet_e}")
    if start_time_overall is not None:
        end_time_overall = time.time()
        elapsed_time_overall = end_time_overall - start_time_overall
        st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
    st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")