Spaces:

nlpblogs
/

Free-Ancient-Greek-Entity-Finder

Sleeping

App Files Files Community

nlpblogs committed on Jul 22

Commit

5e0a566

verified ·

1 Parent(s): 47a5fb3

Create app.py

Browse files

Files changed (1) hide show

app.py +249 -0

app.py ADDED Viewed

	@@ -0,0 +1,249 @@

+import time
+import streamlit as st
+import pandas as pd
+import io
+from transformers import pipeline
+import plotly.express as px
+import zipfile
+import re
+import numpy as np
+import json
+# --- Page Configuration ---
+st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
+# --- Initialize session state ---
+# Removed the 'text_analysis_attempts' and 'max_attempts' as there's no limit.
+# Define the categories and their associated entity labels
+ENTITY_LABELS_CATEGORIZED = {
+    "Persons": ["PER"],
+    "Locations": ["LOC"],
+    "Organizations": ["ORG"],
+    "Miscellaneous": ["MISC"],
+    "Other": ["O"] # Including "O" for "Other" or non-entity if needed, though typically ignored by the pipeline
+}
+# Create a mapping from each specific entity label to its category
+LABEL_TO_CATEGORY_MAP = {
+    label: category for category, labels in ENTITY_LABELS_CATEGORIZED.items() for label in labels
+}
+@st.cache_resource
+def load_ner_model():
+    """
+    Loads the pre-trained NER model ("UGARIT/grc-ner-bert") and caches it.
+    """
+    try:
+        return pipeline(
+            "token-classification",
+            model="UGARIT/grc-ner-bert",
+            aggregation_strategy="max",
+            ignore_labels=["O"],
+            stride=128
+        )
+    except Exception as e:
+        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
+        st.stop()
+# --- UI Elements ---
+st.subheader("Free Ancient Greek Entity Finder", divider="orange")
+st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
+expander = st.expander("**Important notes on the Free Ancient Greek Entity Finder**")
+expander.write('''
+   **Named Entities:** This Free Ancient Greek Entity Finder predicts four
+   (4) labels (“PER: person”, “LOC: location”, “ORG: organization”, “MISC:
+   miscellaneous”). Results are presented in an easy-to-read table, visualized in
+   an interactive tree map, pie chart, and bar chart, and are available for
+   download along with a Glossary of tags.
+   **How to Use:** Type or paste your Ancient Greek text into the input box. Then, click the 'Analyze Text' button
+   to extract and tag entities.
+   **Technical issues:** If your connection times out, please refresh the
+   page or reopen the app's URL.
+   For any errors or inquiries, please contact us at [email protected]
+''')
+with st.sidebar:
+    container = st.container(border=True)
+    container.write("**Named Entity Recognition (NER)** is the task of "
+                    "extracting and tagging entities in text data. Entities can be persons, "
+                    "organizations, locations, countries, products, events etc.")
+    st.subheader("Related NER Web Apps", divider="orange")
+    st.link_button("Multilingual PDF & DOCX Entity Finder",
+                   "https://nlpblogs.com/shop/named-entity-recognition-ner/multilingual-pdf-docx-entity-finder/",
+                   type="primary")
+text_input = st.text_area("Type or paste your Ancient Greek text here:")
+# --- Results Button and Processing Logic ---
+if st.button("Analyze Text"):
+    start_time_overall = time.time()  # Start time for overall processing
+    # Removed the usage limit check
+    # if st.session_state['text_analysis_attempts'] >= max_attempts:
+    #     st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
+    #     st.stop()
+    if not text_input.strip():
+        st.warning("Please enter some text for analysis.")
+        st.stop()
+    # Removed incrementing the attempt counter
+    # st.session_state['text_analysis_attempts'] += 1
+    with st.spinner("Analyzing text...", show_time=True):
+        model = load_ner_model()
+        # Measure NER model processing time
+        start_time_ner = time.time()
+        text_entities = model(text_input)
+        end_time_ner = time.time()
+        ner_processing_time = end_time_ner - start_time_ner
+        df = pd.DataFrame(text_entities)
+        if 'word' in df.columns:
+            # Ensure 'word' column is string type before applying regex
+            if df['word'].dtype == 'object':
+                # Remove non-alphanumeric characters, keeping spaces and periods.
+                # For Greek, we might want to be more specific or simply remove special symbols.
+                # Here, a simple approach: keep letters, numbers, spaces, and periods.
+                pattern = r'[^\p{L}\p{N}\s.]+' # Matches any character that is NOT a Unicode letter, number, space, or period.
+                df['word'] = df['word'].astype(str).replace(pattern, '', regex=True)
+            else:
+                st.warning("The 'word' column is not of string type; skipping character cleaning.")
+        else:
+            st.error("The 'word' column does not exist in the DataFrame. Cannot perform cleaning.")
+            st.stop() # Stop execution if the column is missing
+        # Replace empty strings with 'Unknown' and drop rows with NaN after cleaning
+        df = df.replace('', 'Unknown').dropna()
+        if df.empty:
+            st.warning("No entities were extracted from the provided text.")
+            st.stop()
+        # --- Add 'category' column to the DataFrame based on the grouped labels ---
+        df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
+        # Handle cases where an entity_group might not have a category
+        df['category'] = df['category'].fillna('Uncategorized')
+        # --- Display Results ---
+        st.subheader("Extracted Entities", divider="rainbow")
+        properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
+        df_styled = df.style.set_properties(**properties)
+        st.dataframe(df_styled, use_container_width=True)
+        with st.expander("See Glossary of tags"):
+            st.write('''
+            '**word**': ['entity extracted from your text data']
+            '**score**': ['accuracy score; how accurately a tag has been assigned to
+            a given entity']
+            '**entity_group**': ['label (tag) assigned to a given extracted entity']
+            '**start**': ['index of the start of the corresponding entity']
+            '**end**': ['index of the end of the corresponding entity']
+            '**category**': ['the broader category the entity belongs to']
+            ''')
+        st.subheader("Grouped entities", divider="orange")
+        # Get unique categories and sort them for consistent tab order
+        unique_categories = sorted(df['category'].unique())
+        tabs_per_row = 4  # Adjust as needed for better layout
+        # Loop through categories in chunks to create rows of tabs
+        for i in range(0, len(unique_categories), tabs_per_row):
+            current_row_categories = unique_categories[i : i + tabs_per_row]
+            tabs = st.tabs(current_row_categories)
+            for j, category in enumerate(current_row_categories):
+                with tabs[j]:
+                    df_filtered = df[df["category"] == category]
+                    if not df_filtered.empty:
+                        st.dataframe(df_filtered, use_container_width=True)
+                    else:
+                        st.info(f"No '{category}' entities found in the text.")
+                        # Display an empty DataFrame for consistency if no entities are found
+                        st.dataframe(pd.DataFrame({
+                            'entity_group': [np.nan],
+                            'score': [np.nan],
+                            'word': [np.nan],
+                            'start': [np.nan],
+                            'end': [np.nan],
+                            'category': [category]
+                        }), hide_index=True)
+        st.divider()
+        # --- Visualizations ---
+        st.subheader("Tree map", divider="orange")
+        fig_treemap = px.treemap(df,
+                                 path=[px.Constant("all"), 'category', 'entity_group', 'word'],
+                                 values='score', color='category',
+                                 color_discrete_map={
+                                     'Persons': 'blue',
+                                     'Locations': 'green',
+                                     'Organizations': 'red',
+                                     'Miscellaneous': 'purple',
+                                     'Uncategorized': 'gray'
+                                 })
+        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+        st.plotly_chart(fig_treemap)
+        # Group by category and entity_group to get counts for pie and bar charts
+        grouped_counts = df.groupby('category').size().reset_index(name='count')
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Pie Chart", divider="orange")
+            fig_pie = px.pie(grouped_counts, values='count', names='category',
+                             hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
+            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
+            st.plotly_chart(fig_pie)
+        with col2:
+            st.subheader("Bar Chart", divider="orange")
+            fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True,
+                             title='Occurrences of predicted categories')
+            st.plotly_chart(fig_bar)
+        # --- Downloadable Content ---
+        dfa = pd.DataFrame(
+            data={
+                'Column Name': ['word', 'entity_group', 'score', 'start', 'end', 'category'],
+                'Description': [
+                    'entity extracted from your text data',
+                    'label (tag) assigned to a given extracted entity',
+                    'accuracy score; how accurately a tag has been assigned to a given entity',
+                    'index of the start of the corresponding entity',
+                    'index of the end of the corresponding entity',
+                    'the broader category the entity belongs to',
+                ]
+            }
+        )
+        buf = io.BytesIO()
+        with zipfile.ZipFile(buf, "w") as myzip:
+            myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
+            myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
+        st.download_button(
+            label="Download zip file",
+            data=buf.getvalue(),
+            file_name="nlpblogs_ner_results.zip",
+            mime="application/zip",
+        )
+    end_time_overall = time.time()
+    elapsed_time_overall = end_time_overall - start_time_overall
+    st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
+# Removed the display of attempts as there's no limit.
+# st.write(f"Number of times you requested results: **{st.session_state['text_analysis_attempts']}/{max_attempts}**")