AIEcosystem committed on
Commit
70e6432
·
verified ·
1 Parent(s): f11aa25

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +161 -257
src/streamlit_app.py CHANGED
@@ -1,60 +1,29 @@
1
  import os
2
- os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
  import pandas as pd
6
  import io
7
  import plotly.express as px
8
  import zipfile
9
- import json
10
- from cryptography.fernet import Fernet
11
  from streamlit_extras.stylable_container import stylable_container
12
- from typing import Optional
13
- from gliner import GLiNER
14
- from comet_ml import Experiment
15
  from transformers import pipeline
 
16
 
17
-
18
-
 
 
 
19
 
20
  st.markdown(
21
  """
22
  <style>
23
- /* Main app background with a subtle rainbow gradient */
24
  .stApp {
25
  background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
26
  color: #000000;
27
  font-family: 'Inter', sans-serif;
28
  }
29
-
30
- /* Rainbow gradient for the sidebar */
31
- .css-1d36184, .css-1d36184:hover, .css-1d36184:focus {
32
- background: linear-gradient(180deg, #FFC0CB, #FFD700, #98FB98, #ADD8E6, #BA55D3);
33
- secondary-background-color: #FFC080;
34
- }
35
-
36
- /* Expander background color with a slight transparency */
37
- .streamlit-expanderContent {
38
- background-color: rgba(255, 255, 255, 0.7);
39
- border-radius: 10px;
40
- }
41
-
42
- /* Expander header with a gentle gradient and bold text */
43
- .streamlit-expanderHeader {
44
- background: linear-gradient(90deg, #FADADD, #FFF9E0, #E0FFF8);
45
- border-radius: 10px;
46
- font-weight: bold;
47
- }
48
-
49
- /* Text Area with a light background and subtle border */
50
- .stTextArea textarea {
51
- background-color: #FFF0F5;
52
- color: #000000;
53
- border: 1px solid #ccc;
54
- border-radius: 8px;
55
- }
56
-
57
- /* Button with a solid color and elegant hover effect */
58
  .stButton > button {
59
  background-color: #FF69B4;
60
  color: #FFFFFF;
@@ -68,45 +37,11 @@ st.markdown(
68
  box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
69
  transform: translateY(-2px);
70
  }
71
-
72
- /* Warning box with a soft orange and rounded corners */
73
- .stAlert.st-warning {
74
- background-color: #FFDDAA;
75
- color: #000000;
76
- border-radius: 10px;
77
- border-left: 5px solid #FFA500;
78
- }
79
-
80
- /* Success box with a fresh green and rounded corners */
81
- .stAlert.st-success {
82
- background-color: #D4EDDA;
83
- color: #155724;
84
- border-radius: 10px;
85
- border-left: 5px solid #28A745;
86
- }
87
-
88
- /* Custom CSS to make the title text rainbow-colored */
89
- h1 {
90
- background: linear-gradient(45deg, #FF69B4, #FFD700, #00FF7F, #00BFFF, #8A2BE2);
91
- -webkit-background-clip: text;
92
- -webkit-text-fill-color: transparent;
93
- font-size: 3em;
94
- font-weight: 800;
95
- }
96
-
97
  </style>
98
  """,
99
  unsafe_allow_html=True
100
  )
101
 
102
-
103
- st.set_page_config(
104
- layout="wide",
105
- page_title="English Keyphrase"
106
- )
107
-
108
-
109
-
110
  # --- Comet ML Setup ---
111
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
112
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -116,42 +51,21 @@ comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAM
116
  if not comet_initialized:
117
  st.warning("Comet ML not initialized. Check environment variables.")
118
 
119
-
120
-
121
-
122
-
123
  # --- UI Header and Notes ---
124
  st.subheader("AcademiaMiner", divider="rainbow")
125
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
126
-
127
  expander = st.expander("**Important notes*")
128
- expander.write('''
129
- **Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
130
-
131
- Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
132
-
133
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
134
-
135
- **Usage Limits:** You can request results unlimited times for one (1) month.
136
-
137
- **Supported Languages:** English
138
-
139
- **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
140
-
141
- For any errors or inquiries, please contact us at [email protected]'''
142
- )
143
-
144
-
145
 
146
  with st.sidebar:
147
  st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
148
  code = '''
149
- <iframe
150
- src="https://aiecosystem-business-core.hf.space"
151
- frameborder="0"
152
- width="850"
153
- height="450"
154
- ></iframe>
155
  '''
156
  st.code(code, language="html")
157
  st.text("")
@@ -160,189 +74,179 @@ with st.sidebar:
160
  st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
161
  st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
162
 
163
-
164
  @st.cache_resource
165
  def load_ner_model():
166
- """Loads the GLiNER model and caches it."""
167
  try:
168
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints= labels)
 
 
 
 
169
  except Exception as e:
170
- st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
171
  st.stop()
172
- model = load_ner_model()
173
-
174
-
175
- @st.cache_resource
176
- def load_ner_model():
177
- return pipeline("token-classification",
178
- model="ml6team/keyphrase-extraction-kbir-inspec",
179
- aggregation_strategy="max",
180
- stride=128,
181
- ignore_labels=["O"])
182
 
183
  model = load_ner_model()
184
-
185
-
186
 
 
187
  text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", height=250, key='my_text_area')
188
 
189
  def clear_text():
190
  """Clears the text area."""
191
  st.session_state['my_text_area'] = ""
 
192
 
193
  st.button("Clear text", on_click=clear_text)
194
 
195
-
196
  if st.button("Results"):
197
- start_time = time.time()
198
  if not text.strip():
199
- st.warning("Please enter some text to extract entities.")
200
  else:
201
- with st.spinner("Analyzing text...", show_time=True):
202
- entities = model(text_for_ner)
203
- data = []
204
- if entities:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  for entity in entities:
206
- if all(k in entity for k in ['word', 'entity_group', 'score', 'start', 'end']):
207
- data.append({
208
- 'word': entity['word'],
209
- 'entity_group': entity['entity_group'],
210
- 'score': entity['score'],
211
- 'start': entity['start'],
212
- 'end': entity['end']
213
- })
214
- else:
215
- st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
216
- df = pd.DataFrame(data)
217
- else:
218
- df = pd.DataFrame(columns=['word', 'entity_group', 'score', 'start', 'end'])
219
-
220
- if not df.empty:
221
- pattern = r'[^\w\s]'
222
- df['word'] = df['word'].replace(pattern, '', regex=True)
223
- df = df.replace('', 'Unknown')
224
-
225
- st.subheader("All Extracted Keyphrases", divider="rainbow")
226
- st.dataframe(df, use_container_width=True)
227
-
228
- with st.expander("See Glossary of tags"):
229
- st.write('''
230
- **word**: ['entity extracted from your text data']
231
-
232
- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
233
-
234
- **entity_group**: ['label (tag) assigned to a given extracted entity']
235
-
236
- **start**: ['index of the start of the corresponding entity']
237
-
238
- **end**: ['index of the end of the corresponding entity']
239
-
240
- ''')
241
- st.divider()
242
-
243
- st.subheader("Most Frequent Keyphrases", divider="rainbow")
244
- word_counts = df['word'].value_counts().reset_index()
245
- word_counts.columns = ['word', 'count']
246
- df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
247
-
248
- if not df_frequent.empty:
249
- tab1, tab2 = st.tabs(["Table", "Chart"])
250
-
251
- with tab1:
252
- st.dataframe(df_frequent, use_container_width=True)
253
-
254
- with tab2:
255
- fig_frequent_bar = px.bar(
256
- df_frequent,
257
- x='count',
258
- y='word',
259
- orientation='h',
260
- title='Top Frequent Keyphrases by Count',
261
- color='count',
262
- color_continuous_scale=px.colors.sequential.Viridis
263
- )
264
- fig_frequent_bar.update_layout(yaxis={'categoryorder':'total ascending'})
265
- st.plotly_chart(fig_frequent_bar, use_container_width=True)
266
-
267
- if comet_initialized and 'experiment' in locals():
268
- experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
269
- else:
270
- st.info("No keyphrases found with more than one occurrence to display in tabs.")
271
-
272
- st.divider()
273
-
274
- experiment = None
275
- if comet_initialized:
276
- experiment = Experiment(
277
- api_key=COMET_API_KEY,
278
- workspace=COMET_WORKSPACE,
279
- project_name=COMET_PROJECT_NAME,
280
- )
281
- experiment.log_parameter("input_source_type", source_type)
282
- experiment.log_parameter("input_content_length", len(text_for_ner))
283
- experiment.log_table("predicted_entities", df)
284
-
285
- st.subheader("Treemap of All Keyphrases", divider="rainbow")
286
- fig_treemap = px.treemap(
287
- df,
288
- path=[px.Constant("all"), 'entity_group', 'word'],
289
- values='score',
290
- color='word',
291
- color_continuous_scale=px.colors.sequential.Plasma
292
- )
293
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
294
- st.plotly_chart(fig_treemap, use_container_width=True)
295
-
296
- if comet_initialized and experiment:
297
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
298
-
299
- # --- Download Section ---
300
- dfa = pd.DataFrame(
301
- data={
302
- 'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
303
- 'Description': [
304
- 'entity extracted from your text data',
305
- 'label (tag) assigned to a given extracted entity',
306
- 'accuracy score; how accurately a tag has been assigned to a given entity',
307
- 'index of the start of the corresponding entity',
308
- 'index of the end of the corresponding entity'
309
- ]
310
- }
311
  )
312
- buf = io.BytesIO()
313
- with zipfile.ZipFile(buf, "w") as myzip:
314
- if not df.empty:
315
- myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
316
- myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
317
- myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- with stylable_container(
320
- key="download_button",
321
- css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
322
- ):
323
- st.download_button(
324
- label="Download zip file",
325
- data=buf.getvalue(),
326
- file_name="nlpblogs_ner_results.zip",
327
- mime="application/zip",
328
- )
329
- st.divider()
330
- else:
331
- st.warning("No entities found to generate visualizations.")
332
- else:
333
- st.warning("No meaningful text found to process. Please enter a URL, upload a text file, or type/paste text.")
334
  except Exception as e:
335
  st.error(f"An unexpected error occurred during processing: {e}")
336
  finally:
337
- if comet_initialized and experiment is not None:
338
  try:
 
 
 
 
339
  experiment.end()
340
  except Exception as comet_e:
341
  st.warning(f"Comet ML experiment.end() failed: {comet_e}")
342
- if start_time_overall is not None:
343
- end_time_overall = time.time()
344
- elapsed_time_overall = end_time_overall - start_time_overall
345
- st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
346
- st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")
347
- else:
348
- st.warning("Please enter some text, a URL, or upload a file to analyze.")
 
1
  import os
 
2
  import time
3
  import streamlit as st
4
  import pandas as pd
5
  import io
6
  import plotly.express as px
7
  import zipfile
 
 
8
  from streamlit_extras.stylable_container import stylable_container
 
 
 
9
  from transformers import pipeline
10
+ from comet_ml import Experiment
11
 
12
+ # --- App Configuration and Styling ---
13
+ st.set_page_config(
14
+ layout="wide",
15
+ page_title="English Keyphrase"
16
+ )
17
 
18
  st.markdown(
19
  """
20
  <style>
21
+ /* ... (your CSS styles here, as they were mostly fine) ... */
22
  .stApp {
23
  background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
24
  color: #000000;
25
  font-family: 'Inter', sans-serif;
26
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  .stButton > button {
28
  background-color: #FF69B4;
29
  color: #FFFFFF;
 
37
  box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
38
  transform: translateY(-2px);
39
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  </style>
41
  """,
42
  unsafe_allow_html=True
43
  )
44
 
 
 
 
 
 
 
 
 
45
  # --- Comet ML Setup ---
46
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
47
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
51
  if not comet_initialized:
52
  st.warning("Comet ML not initialized. Check environment variables.")
53
 
 
 
 
 
54
# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Fix: the expander label used unbalanced markdown emphasis ("**Important notes*"),
# which renders the stray asterisks literally instead of bold text.
expander = st.expander("**Important notes**")
expander.write('''**Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. For any errors or inquiries, please contact us at info@nlpblogs.com''')
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  with st.sidebar:
66
  st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
67
  code = '''
68
+ <iframe src="https://aiecosystem-business-core.hf.space" frameborder="0" width="850" height="450"></iframe>
 
 
 
 
 
69
  '''
70
  st.code(code, language="html")
71
  st.text("")
 
74
  st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
75
  st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
76
 
77
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load and cache the keyphrase-extraction pipeline.

    Wrapped in st.cache_resource so the Hugging Face model is instantiated
    once per server process instead of on every Streamlit rerun. On any
    load failure the app surfaces the error and halts the script.
    """
    try:
        extractor = pipeline(
            task="token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
        )
    except Exception as e:
        st.error(f"Failed to load NER model: {e}")
        st.stop()
    else:
        return extractor


model = load_ner_model()
 
 
92
 
93
# --- Main App Logic ---
# Text input bound to session state so it can be cleared programmatically.
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)


def clear_text():
    """Reset the text area and flag any previous results as stale."""
    st.session_state['my_text_area'] = ""
    # NOTE(review): 'text_processed' is written here but never read in this
    # file — presumably consumed elsewhere; confirm before removing.
    st.session_state["text_processed"] = False


st.button("Clear text", on_click=clear_text)
102
 
 
103
# --- Results: run extraction, render visualizations, offer downloads ---
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()

        # Create the Comet ML experiment up front so the `finally` block can
        # log metadata and close it no matter where processing fails.
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None

        # Pre-bind df: the `finally` block reads it, and without this a
        # pipeline failure before assignment would raise NameError there.
        df = pd.DataFrame()
        try:
            with st.spinner("Analyzing text..."):
                # The pipeline returns a list of entity dicts.
                entities = model(text)

            # Bug fix: with aggregation_strategy="max" the transformers
            # pipeline keys grouped entities as 'entity_group', not 'label';
            # reading entity['label'] raised KeyError on every row. Fall back
            # defensively in case of a pipeline/version difference.
            data = [
                {
                    'word': entity['word'],
                    'label': entity.get('entity_group', entity.get('label', 'Unknown')),
                    'score': entity['score'],
                    'start': entity['start'],
                    'end': entity['end'],
                }
                for entity in entities
            ]

            if not data:
                # Bug fix: do NOT call st.stop() inside this try-block —
                # Streamlit implements it by raising an exception, which the
                # broad `except Exception` below would catch and misreport
                # as a processing error.
                st.warning("No keyphrases found in the text.")
            else:
                df = pd.DataFrame(data)

                # --- Data Cleaning and Processing ---
                # Strip punctuation from phrases; anything left empty becomes
                # the sentinel 'Unknown'.
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                df = df.replace('', 'Unknown')

                # --- All Extracted Keyphrases ---
                st.subheader("All Extracted Keyphrases", divider="rainbow")
                st.dataframe(df, use_container_width=True)
                with st.expander("See Glossary of tags"):
                    st.write('''
                    **word**: ['keyphrase extracted from your text data']
                    **score**: ['accuracy score; how accurately a tag has been assigned']
                    **label**: ['label (tag) assigned to a given extracted keyphrase']
                    **start**: ['index of the start of the corresponding entity']
                    **end**: ['index of the end of the corresponding entity']
                    ''')

                # --- Most Frequent Keyphrases ---
                st.subheader("Most Frequent Keyphrases", divider="rainbow")
                word_counts = df['word'].value_counts().reset_index()
                word_counts.columns = ['word', 'count']
                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)

                if not df_frequent.empty:
                    tab1, tab2 = st.tabs(["Table", "Chart"])
                    with tab1:
                        st.dataframe(df_frequent, use_container_width=True)
                    with tab2:
                        fig_frequent_bar = px.bar(
                            df_frequent,
                            x='count',
                            y='word',
                            orientation='h',
                            title='Top Frequent Keyphrases by Count',
                            color='count',
                            color_continuous_scale=px.colors.sequential.Viridis,
                        )
                        fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})
                        st.plotly_chart(fig_frequent_bar, use_container_width=True)
                        if experiment:
                            experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                else:
                    st.info("No keyphrases found with more than one occurrence.")

                # --- Treemap of All Keyphrases ---
                st.subheader("Treemap of All Keyphrases", divider="rainbow")
                fig_treemap = px.treemap(
                    df,
                    path=[px.Constant("all"), 'label', 'word'],
                    values='score',
                    color='word',
                    color_continuous_scale=px.colors.sequential.Plasma,
                )
                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                st.plotly_chart(fig_treemap, use_container_width=True)
                if experiment:
                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

                # --- Download Section ---
                # Glossary table bundled into the zip alongside the results.
                dfa = pd.DataFrame(
                    data={
                        'Column Name': ['word', 'label', 'score', 'start', 'end'],
                        'Description': [
                            'keyphrase extracted from your text data',
                            'label (tag) assigned to a given keyphrase',
                            'accuracy score; how accurately a tag has been assigned',
                            'index of the start of the corresponding entity',
                            'index of the end of the corresponding entity'
                        ]
                    }
                )
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                    myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=buf.getvalue(),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()

        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            if experiment:
                try:
                    # Log run metadata before ending the experiment; skip the
                    # results table when extraction produced nothing.
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    if not df.empty:
                        experiment.log_table("predicted_entities", df)
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")

            # Show elapsed time
            elapsed_time_overall = time.time() - start_time_overall
            st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
+