# Hugging Face Space: AIEcosystem/AcademiaMiner (status: Running; file size: 11,583 bytes)


import os
os.environ['HF_HOME'] = '/tmp'

import time
import streamlit as st
import pandas as pd
import io
import plotly.express as px
import zipfile
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline
from comet_ml import Experiment

# --- Page setup and global CSS theme ---
st.set_page_config(page_title="English Keyphrase", layout="wide")

# Stylesheet for the whole app: page background and font, pink buttons that
# lift slightly on hover, and a pink text area. It is injected as raw HTML,
# which is why unsafe_allow_html is required below.
_APP_CSS = """
    <style>
   
    .stApp {
        background-color: #f0f8ff; /* A single, solid color */
        color: #000000;
        font-family: 'Inter', sans-serif;
    }
    
    .stButton > button {
        background-color: #FF69B4;
        color: #FFFFFF;
        font-weight: bold;
        border-radius: 12px;
        transition: all 0.2s ease-in-out;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    
    .stButton > button:hover {
        background-color: #FFB6C1;
        box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
        transform: translateY(-2px);
    }
    
    /* Text Area background and text color */
    .stTextArea textarea {
        background-color: #FFC0CB; /* A nice pink color */
        color: #000000;
        border: 1px solid #FF69B4; /* A pink border to match the button */
    }
    
    </style>
    """
st.markdown(_APP_CSS, unsafe_allow_html=True)

# --- Comet ML configuration (read from environment variables) ---
COMET_API_KEY = os.getenv("COMET_API_KEY")
COMET_WORKSPACE = os.getenv("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")

# Experiment tracking is only attempted when all three settings are present
# and non-empty; otherwise the user is told once, up front.
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
if comet_initialized is False:
    st.warning("Comet ML not initialized. Check environment variables.")

# --- Title, author link and usage notes ---
# Markdown body shown inside the "Important notes" expander.
_IMPORTANT_NOTES = '''**Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
    
Results are presented in easy-to-read tables, visualized in an interactive tree map and a bar chart, and are available for download along with a Glossary of tags.
    
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.

**Usage Limits:** You can request results unlimited times for one (1) month.

**Supported Languages:** English

**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. 

For any errors or inquiries, please contact us at [email protected]'''

st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Collapsible panel; the notes are rendered inside it.
expander = st.expander("**Important notes**")
with expander:
    st.write(_IMPORTANT_NOTES)

# Sidebar: an embeddable <iframe> snippet for the app, followed by a
# promotional link. Every element is added through st.sidebar directly.
st.sidebar.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
code = '''
    <iframe
	src="https://aiecosystem-academiaminer.hf.space"
	frameborder="0"
	width="850"
	height="450"
    ></iframe>    
    '''
st.sidebar.code(code, language="html")

# Two empty text elements act as vertical spacing before the divider.
st.sidebar.text("")
st.sidebar.text("")
st.sidebar.divider()
st.sidebar.subheader("🚀 Ready to build your own AI Web App?", divider="rainbow")
st.sidebar.link_button("AI Web App Builder", "https://nlpblogs.com/custom-web-app-development/", type="primary")

# --- Keyphrase model (loaded through Streamlit's resource cache) ---
@st.cache_resource
def load_ner_model():
    """Build and return the keyphrase-extraction pipeline.

    Creates a ``token-classification`` pipeline for the
    ``ml6team/keyphrase-extraction-kbir-inspec`` checkpoint. The function is
    wrapped in ``st.cache_resource``.

    If construction raises, the error is shown in the UI and ``st.stop()``
    is called instead of returning a pipeline.
    """
    try:
        extractor = pipeline(
            "token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
            stride=128,
            ignore_labels=["O"],
        )
    except Exception as load_error:
        st.error(f"Failed to load NER model: {load_error}")
        st.stop()
    else:
        return extractor


model = load_ner_model()

# --- Text input and reset control ---
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)


def clear_text():
    """Button callback: blank the text area and reset the processed flag.

    The text area is bound to the ``my_text_area`` session-state key, so
    writing an empty string to that key empties the widget.
    """
    st.session_state["text_processed"] = False
    st.session_state.my_text_area = ""


st.button("Clear text", on_click=clear_text)

# --- Results: run the model, render tables/charts, offer a zip download ---
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()

        # Start a Comet ML experiment when credentials are configured. A
        # failure here is reported but must not block keyphrase extraction.
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None

        # Bound before the try block so the finally clause can tell whether a
        # results table exists. It stays None when the model call raises or
        # when no keyphrases are found.
        df = None

        try:
            with st.spinner("Analyzing text..."):
                # The pipeline returns a list of dictionaries, one per
                # extracted keyphrase.
                entities = model(text)

                # Each result's tag is read from its 'entity_group' key and
                # exposed under the 'label' column used by the rest of the app.
                data = [
                    {
                        'word': entity['word'],
                        'label': entity['entity_group'],
                        'score': entity['score'],
                        'start': entity['start'],
                        'end': entity['end'],
                    }
                    for entity in entities
                ]

                if not data:
                    st.warning("No keyphrases found in the text.")
                    # Nothing to render. The finally clause below still runs
                    # when the try block is left this way.
                    st.stop()

                df = pd.DataFrame(data)

                # --- Data Cleaning and Processing ---
                # Strip everything that is neither a word character nor
                # whitespace, then label cells left empty as 'Unknown'.
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                df = df.replace('', 'Unknown')

                # --- All Extracted Keyphrases ---
                st.subheader("All Extracted Keyphrases", divider="rainbow")
                st.dataframe(df, use_container_width=True)
                with st.expander("See Glossary of tags"):
                    st.write('''
                    **word**: ['keyphrase extracted from your text data']
                    
                    **score**: ['accuracy score; how accurately a tag has been assigned']
                    
                    **label**: ['label (tag) assigned to a given extracted keyphrase']
                    
                    **start**: ['index of the start of the corresponding entity']
                    
                    **end**: ['index of the end of the corresponding entity']
                    ''')

                # --- Most Frequent Keyphrases ---
                st.subheader("Most Frequent Keyphrases", divider="rainbow")
                word_counts = df['word'].value_counts().reset_index()
                word_counts.columns = ['word', 'count']
                # Keep the 15 most frequent keyphrases.
                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)

                if not df_frequent.empty:
                    tab1, tab2 = st.tabs(["Table", "Chart"])
                    with tab1:
                        st.dataframe(df_frequent, use_container_width=True)
                    with tab2:
                        fig_frequent_bar = px.bar(
                            df_frequent,
                            x='count',
                            y='word',
                            orientation='h',
                            title='Top Frequent Keyphrases by Count',
                            color='count',
                            color_continuous_scale=px.colors.sequential.Viridis
                        )
                        fig_frequent_bar.update_layout(
                            yaxis={'categoryorder': 'total ascending'},
                            paper_bgcolor='#f0f8ff',  # background of the entire figure
                            plot_bgcolor='#f0f8ff'  # background of the plotting area
                        )

                        st.plotly_chart(fig_frequent_bar, use_container_width=True)
                        if experiment:
                            experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                else:
                    # df is non-empty at this point, so this branch is only a
                    # defensive fallback.
                    st.info("No keyphrases found with more than one occurrence.")

                # --- Treemap of All Keyphrases ---
                st.subheader("Treemap of All Keyphrases", divider="rainbow")
                # Hierarchy: a constant root, then the 'label' column, then
                # the keyphrase itself; tile size comes from 'score'.
                fig_treemap = px.treemap(
                    df,
                    path=[px.Constant("all"), 'label', 'word'],
                    values='score',
                    color='word',
                    color_continuous_scale=px.colors.sequential.Plasma
                )
                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#f0f8ff', plot_bgcolor='#f0f8ff')
                st.plotly_chart(fig_treemap, use_container_width=True)
                if experiment:
                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

                # --- Download Section ---
                # Glossary table shipped alongside the results in the zip.
                dfa = pd.DataFrame(
                    data={
                        'Column Name': ['word', 'label', 'score', 'start', 'end'],
                        'Description': [
                            'keyphrase extracted from your text data',
                            'label (tag) assigned to a given keyphrase',
                            'accuracy score; how accurately a tag has been assigned',
                            'index of the start of the corresponding entity',
                            'index of the end of the corresponding entity'
                        ]
                    }
                )
                # Build the archive in memory; nothing is written to disk.
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                    myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=buf.getvalue(),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()

        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            if experiment:
                try:
                    # Log run metadata, plus the results table when one was
                    # actually produced.
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    if df is not None:
                        # Comet's log_table documentation describes the
                        # filename as ending in a supported extension, so the
                        # table is logged as a .csv file.
                        experiment.log_table("predicted_entities.csv", df)
                except Exception as comet_e:
                    st.warning(f"Comet ML logging failed: {comet_e}")
                finally:
                    # Always close the experiment, even if logging failed.
                    try:
                        experiment.end()
                    except Exception as comet_e:
                        st.warning(f"Comet ML experiment.end() failed: {comet_e}")

            # Show elapsed time
            end_time_overall = time.time()
            elapsed_time_overall = end_time_overall - start_time_overall
            st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")