nlpblogs committed on
Commit
5e0a566
·
verified ·
1 Parent(s): 47a5fb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -0
app.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import io
5
+ from transformers import pipeline
6
+ import plotly.express as px
7
+ import zipfile
8
+ import re
9
+ import numpy as np
10
+ import json
11

# --- Page Configuration ---
# Wide layout gives the result tables and charts the full viewport width.
st.set_page_config(
    layout="wide",
    page_title="Named Entity Recognition App",
)

# --- Session state ---
# No per-user usage limit: the former 'text_analysis_attempts'/'max_attempts'
# counters were removed, so no session-state initialization is needed here.

18
# Human-readable category -> list of model entity labels belonging to it.
ENTITY_LABELS_CATEGORIZED = {
    "Persons": ["PER"],
    "Locations": ["LOC"],
    "Organizations": ["ORG"],
    "Miscellaneous": ["MISC"],
    # "O" (non-entity) is listed for completeness; the pipeline is configured
    # to ignore it, so it normally never appears in results.
    "Other": ["O"],
}

# Invert the table above: each individual label maps to its category name.
LABEL_TO_CATEGORY_MAP = {}
for _category, _labels in ENTITY_LABELS_CATEGORIZED.items():
    for _label in _labels:
        LABEL_TO_CATEGORY_MAP[_label] = _category
31
+
32
@st.cache_resource
def load_ner_model():
    """Load and cache the pre-trained Ancient Greek NER pipeline.

    Returns a Hugging Face token-classification pipeline backed by the
    "UGARIT/grc-ner-bert" model. `st.cache_resource` ensures the model is
    downloaded/instantiated only once per server process. On any failure
    (no network, model unavailable), an error is shown in the UI and the
    Streamlit script run is halted.
    """
    try:
        ner_pipeline = pipeline(
            "token-classification",
            model="UGARIT/grc-ner-bert",
            aggregation_strategy="max",
            ignore_labels=["O"],  # drop non-entity tokens at the source
            stride=128,           # sliding-window overlap for long inputs
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()
    else:
        return ner_pipeline
48
+
49
# --- UI Elements ---
st.subheader("Free Ancient Greek Entity Finder", divider="orange")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Collapsible usage notes shown above the input box.
notes = st.expander("**Important notes on the Free Ancient Greek Entity Finder**")
notes.write('''
**Named Entities:** This Free Ancient Greek Entity Finder predicts four
(4) labels (“PER: person”, “LOC: location”, “ORG: organization”, “MISC:
miscellaneous”). Results are presented in an easy-to-read table, visualized in
an interactive tree map, pie chart, and bar chart, and are available for
download along with a Glossary of tags.

**How to Use:** Type or paste your Ancient Greek text into the input box. Then, click the 'Analyze Text' button
to extract and tag entities.

**Technical issues:** If your connection times out, please refresh the
page or reopen the app's URL.

For any errors or inquiries, please contact us at [email protected]
''')

with st.sidebar:
    # Short NER primer in a bordered card.
    info_card = st.container(border=True)
    info_card.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
    st.subheader("Related NER Web Apps", divider="orange")
    st.link_button(
        "Multilingual PDF & DOCX Entity Finder",
        "https://nlpblogs.com/shop/named-entity-recognition-ner/multilingual-pdf-docx-entity-finder/",
        type="primary",
    )

text_input = st.text_area("Type or paste your Ancient Greek text here:")
82
# --- Results Button and Processing Logic ---
if st.button("Analyze Text"):
    # Wall-clock timer for the whole request; reported at the bottom of the page.
    start_time_overall = time.time()

    # Guard clause: nothing to analyze.
    if not text_input.strip():
        st.warning("Please enter some text for analysis.")
        st.stop()

    with st.spinner("Analyzing text...", show_time=True):
        model = load_ner_model()

        # Time the NER inference itself (measured but not currently displayed).
        start_time_ner = time.time()
        text_entities = model(text_input)
        end_time_ner = time.time()
        ner_processing_time = end_time_ner - start_time_ner

        df = pd.DataFrame(text_entities)

        # --- Clean the extracted surface forms ---
        if 'word' in df.columns:
            # Ensure 'word' column is string type before applying regex.
            if df['word'].dtype == 'object':
                # Keep only Unicode letters, digits, whitespace, and periods.
                # BUG FIX: the previous pattern r'[^\p{L}\p{N}\s.]+' used
                # \p{...} property escapes, which Python's stdlib `re` module
                # (used by pandas for regex=True) does not support — it raised
                # re.error at runtime. In Python 3, \w is Unicode-aware
                # (letters + digits + underscore); the `|_+` alternative also
                # strips underscores, matching the original intent exactly.
                pattern = r'[^\w\s.]+|_+'
                df['word'] = df['word'].astype(str).replace(pattern, '', regex=True)
            else:
                st.warning("The 'word' column is not of string type; skipping character cleaning.")
        else:
            st.error("The 'word' column does not exist in the DataFrame. Cannot perform cleaning.")
            st.stop()  # Cannot continue without the 'word' column.

        # Words emptied by the cleaning step become 'Unknown'; drop NaN rows.
        df = df.replace('', 'Unknown').dropna()

        if df.empty:
            st.warning("No entities were extracted from the provided text.")
            st.stop()

        # --- Add 'category' column mapping each label to its broad category ---
        df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
        # Labels missing from the map fall back to 'Uncategorized'.
        df['category'] = df['category'].fillna('Uncategorized')

        # --- Display Results ---
        st.subheader("Extracted Entities", divider="rainbow")
        properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
        df_styled = df.style.set_properties(**properties)
        st.dataframe(df_styled, use_container_width=True)

        with st.expander("See Glossary of tags"):
            st.write('''
            '**word**': ['entity extracted from your text data']

            '**score**': ['accuracy score; how accurately a tag has been assigned to
            a given entity']

            '**entity_group**': ['label (tag) assigned to a given extracted entity']

            '**start**': ['index of the start of the corresponding entity']

            '**end**': ['index of the end of the corresponding entity']

            '**category**': ['the broader category the entity belongs to']
            ''')

        st.subheader("Grouped entities", divider="orange")

        # One tab per category, laid out in rows of up to `tabs_per_row` tabs.
        unique_categories = sorted(df['category'].unique())
        tabs_per_row = 4  # Adjust as needed for better layout

        for i in range(0, len(unique_categories), tabs_per_row):
            current_row_categories = unique_categories[i : i + tabs_per_row]
            tabs = st.tabs(current_row_categories)

            for j, category in enumerate(current_row_categories):
                with tabs[j]:
                    df_filtered = df[df["category"] == category]
                    if not df_filtered.empty:
                        st.dataframe(df_filtered, use_container_width=True)
                    else:
                        # Defensive branch: categories come from df itself, so
                        # this is normally unreachable; kept for robustness.
                        st.info(f"No '{category}' entities found in the text.")
                        # Placeholder frame so every tab shows a table.
                        st.dataframe(pd.DataFrame({
                            'entity_group': [np.nan],
                            'score': [np.nan],
                            'word': [np.nan],
                            'start': [np.nan],
                            'end': [np.nan],
                            'category': [category]
                        }), hide_index=True)
        st.divider()

        # --- Visualizations ---
        st.subheader("Tree map", divider="orange")
        fig_treemap = px.treemap(df,
                                 path=[px.Constant("all"), 'category', 'entity_group', 'word'],
                                 values='score', color='category',
                                 color_discrete_map={
                                     'Persons': 'blue',
                                     'Locations': 'green',
                                     'Organizations': 'red',
                                     'Miscellaneous': 'purple',
                                     'Uncategorized': 'gray'
                                 })
        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
        st.plotly_chart(fig_treemap)

        # Per-category entity counts, shared by the pie and bar charts.
        grouped_counts = df.groupby('category').size().reset_index(name='count')

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Pie Chart", divider="orange")
            fig_pie = px.pie(grouped_counts, values='count', names='category',
                             hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig_pie)

        with col2:
            st.subheader("Bar Chart", divider="orange")
            fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True,
                             title='Occurrences of predicted categories')
            st.plotly_chart(fig_bar)

        # --- Downloadable Content ---
        # Glossary shipped alongside the results in the zip download.
        dfa = pd.DataFrame(
            data={
                'Column Name': ['word', 'entity_group', 'score', 'start', 'end', 'category'],
                'Description': [
                    'entity extracted from your text data',
                    'label (tag) assigned to a given extracted entity',
                    'accuracy score; how accurately a tag has been assigned to a given entity',
                    'index of the start of the corresponding entity',
                    'index of the end of the corresponding entity',
                    'the broader category the entity belongs to',
                ]
            }
        )
        # Build the zip entirely in memory; no temp files needed.
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w") as myzip:
            myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
            myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))

        st.download_button(
            label="Download zip file",
            data=buf.getvalue(),
            file_name="nlpblogs_ner_results.zip",
            mime="application/zip",
        )

        end_time_overall = time.time()
        elapsed_time_overall = end_time_overall - start_time_overall
        st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")