# Hugging Face Space: AIEcosystem / AcademiaMiner
# Space status when this copy was captured: Sleeping
# File size: 14,216 Bytes

import os
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import pandas as pd
import io
import plotly.express as px
import zipfile
import json
from cryptography.fernet import Fernet
from streamlit_extras.stylable_container import stylable_container
from typing import Optional
from gliner import GLiNER
from comet_ml import Experiment
from transformers import pipeline




# Page configuration must be the first Streamlit command executed in the
# script, so it is issued before the CSS injection below; on Streamlit
# versions that enforce this, calling it later raises an exception.
st.set_page_config(
    layout="wide",
    page_title="English Keyphrase"
)


# Global look-and-feel: inject a custom stylesheet into the page.
# unsafe_allow_html is required for the raw <style> tag to be rendered.
st.markdown(
    """
    <style>
    /* Main app background with a subtle rainbow gradient */
    .stApp {
        background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
        color: #000000;
        font-family: 'Inter', sans-serif;
    }

    /* Rainbow gradient for the sidebar */
    .css-1d36184, .css-1d36184:hover, .css-1d36184:focus {
        background: linear-gradient(180deg, #FFC0CB, #FFD700, #98FB98, #ADD8E6, #BA55D3);
        secondary-background-color: #FFC080;
    }

    /* Expander background color with a slight transparency */
    .streamlit-expanderContent {
        background-color: rgba(255, 255, 255, 0.7);
        border-radius: 10px;
    }

    /* Expander header with a gentle gradient and bold text */
    .streamlit-expanderHeader {
        background: linear-gradient(90deg, #FADADD, #FFF9E0, #E0FFF8);
        border-radius: 10px;
        font-weight: bold;
    }

    /* Text Area with a light background and subtle border */
    .stTextArea textarea {
        background-color: #FFF0F5;
        color: #000000;
        border: 1px solid #ccc;
        border-radius: 8px;
    }

    /* Button with a solid color and elegant hover effect */
    .stButton > button {
        background-color: #FF69B4;
        color: #FFFFFF;
        font-weight: bold;
        border-radius: 12px;
        transition: all 0.2s ease-in-out;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .stButton > button:hover {
        background-color: #FFB6C1;
        box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
        transform: translateY(-2px);
    }

    /* Warning box with a soft orange and rounded corners */
    .stAlert.st-warning {
        background-color: #FFDDAA;
        color: #000000;
        border-radius: 10px;
        border-left: 5px solid #FFA500;
    }

    /* Success box with a fresh green and rounded corners */
    .stAlert.st-success {
        background-color: #D4EDDA;
        color: #155724;
        border-radius: 10px;
        border-left: 5px solid #28A745;
    }

    /* Custom CSS to make the title text rainbow-colored */
    h1 {
        background: linear-gradient(45deg, #FF69B4, #FFD700, #00FF7F, #00BFFF, #8A2BE2);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 3em;
        font-weight: 800;
    }

    </style>
    """,
    unsafe_allow_html=True
)



# --- Comet ML Setup ---
# Experiment tracking is optional: it is switched on only when all three
# settings are present (and non-empty) in the environment.
COMET_API_KEY = os.getenv("COMET_API_KEY")
COMET_WORKSPACE = os.getenv("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))

# Tell the user up front that tracking is disabled.
if comet_initialized is False:
    st.warning("Comet ML not initialized. Check environment variables.")





# --- UI Header and Notes ---
# App title, attribution link, and a collapsible panel with usage notes.
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# The label is Markdown: the bold marker must be closed with "**",
# otherwise the asterisks are rendered literally.
expander = st.expander("**Important notes**")
# NOTE(review): the contact address at the end of the notes looks like a
# redacted placeholder in this copy of the file - restore the real address.
expander.write('''
**Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
    
Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.      

**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.      

**Usage Limits:** You can request results unlimited times for one (1) month.      

**Supported Languages:** English     

**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. 

For any errors or inquiries, please contact us at [email protected]'''
)



# Sidebar: embed instructions plus a promotional link.
# Uses st.sidebar object notation; each call renders into the sidebar.
st.sidebar.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")

# HTML snippet shown verbatim to the user (whitespace is part of the text).
embed_snippet = '''
    <iframe
	src="https://aiecosystem-business-core.hf.space"
	frameborder="0"
	width="850"
	height="450"
    ></iframe>
    '''
st.sidebar.code(embed_snippet, language="html")

# Two empty text elements act as vertical spacing before the divider.
for _ in range(2):
    st.sidebar.text("")
st.sidebar.divider()
st.sidebar.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
st.sidebar.link_button("NER Builder", "https://nlpblogs.com", type="primary")


@st.cache_resource
def load_ner_model():
    """Load the keyphrase-extraction pipeline once and cache it.

    The model is a Hugging Face token-classification pipeline; calling the
    returned object on a string yields a list of dicts, which the results
    section reads through the keys ``word``, ``entity_group``, ``score``,
    ``start`` and ``end``.

    Returns:
        The callable pipeline object.

    On any loading failure an error is shown in the page and the script run
    is halted with ``st.stop()``, so this function never returns ``None``.
    """
    try:
        return pipeline(
            "token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
            stride=128,
            ignore_labels=["O"],
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()


# Single model instance shared by the rest of the script. (A previous,
# shadowed GLiNER loader referenced an undefined name and stopped the app
# before this loader could run; it has been removed.)
model = load_ner_model()
            


def clear_text():
    """Blank the text area by resetting its session-state entry."""
    st.session_state.my_text_area = ""


# Input widget; its value lives in session state under 'my_text_area',
# which is what the callback above resets.
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)

st.button("Clear text", on_click=clear_text)


# --- Results: extraction, visualisation and download ---

# Keys every entity returned by the model must carry to be tabulated.
ENTITY_COLUMNS = ['word', 'entity_group', 'score', 'start', 'end']


def _entities_to_dataframe(entities):
    """Convert raw model output into a DataFrame with ``ENTITY_COLUMNS``.

    Entities missing any expected key are skipped with a visible warning.
    An empty or ``None`` result yields an empty frame that still has the
    expected columns.
    """
    rows = []
    for entity in entities or []:
        if all(key in entity for key in ENTITY_COLUMNS):
            rows.append({key: entity[key] for key in ENTITY_COLUMNS})
        else:
            st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
    return pd.DataFrame(rows, columns=ENTITY_COLUMNS)


def _build_results_zip(df, df_frequent):
    """Return the bytes of a zip holding the results, frequency and glossary CSVs."""
    glossary = pd.DataFrame(
        data={
            'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
            'Description': [
                'entity extracted from your text data',
                'label (tag) assigned to a given extracted entity',
                'accuracy score; how accurately a tag has been assigned to a given entity',
                'index of the start of the corresponding entity',
                'index of the end of the corresponding entity'
            ]
        }
    )
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
        myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
        myzip.writestr("Glossary_of_tags.csv", glossary.to_csv(index=False))
    return buf.getvalue()


if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
    else:
        start_time = time.time()
        # Created lazily below; kept here so the finally-clause can always
        # tell whether there is an experiment to close.
        experiment = None
        # The text area is the only input this app offers.
        source_type = "text_area"
        # Per-session request counter (usage is unlimited, so no maximum).
        attempts = st.session_state.get('source_type_attempts', 0) + 1
        st.session_state['source_type_attempts'] = attempts

        try:
            with st.spinner("Analyzing text...", show_time=True):
                entities = model(text)
                df = _entities_to_dataframe(entities)

            if df.empty:
                st.warning("No entities found to generate visualizations.")
            else:
                # Strip punctuation from the extracted phrases; anything
                # left empty is shown as 'Unknown'.
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                df = df.replace('', 'Unknown')

                # Start the Comet experiment before any chart is drawn so
                # that every figure below can be logged to it.
                if comet_initialized:
                    experiment = Experiment(
                        api_key=COMET_API_KEY,
                        workspace=COMET_WORKSPACE,
                        project_name=COMET_PROJECT_NAME,
                    )
                    experiment.log_parameter("input_source_type", source_type)
                    experiment.log_parameter("input_content_length", len(text))
                    # NOTE(review): Comet's log_table may expect a filename
                    # with an extension such as ".csv" - confirm.
                    experiment.log_table("predicted_entities", df)

                st.subheader("All Extracted Keyphrases", divider="rainbow")
                st.dataframe(df, use_container_width=True)

                with st.expander("See Glossary of tags"):
                    st.write('''
                    **word**: ['entity extracted from your text data']
                    
                    **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
                    
                    **entity_group**: ['label (tag) assigned to a given extracted entity']
                    
                    **start**: ['index of the start of the corresponding entity']
                    
                    **end**: ['index of the end of the corresponding entity']
                    
                    ''')
                st.divider()

                st.subheader("Most Frequent Keyphrases", divider="rainbow")
                word_counts = df['word'].value_counts().reset_index()
                word_counts.columns = ['word', 'count']
                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)

                if not df_frequent.empty:
                    tab1, tab2 = st.tabs(["Table", "Chart"])

                    with tab1:
                        st.dataframe(df_frequent, use_container_width=True)

                    with tab2:
                        fig_frequent_bar = px.bar(
                            df_frequent,
                            x='count',
                            y='word',
                            orientation='h',
                            title='Top Frequent Keyphrases by Count',
                            color='count',
                            color_continuous_scale=px.colors.sequential.Viridis
                        )
                        fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})
                        st.plotly_chart(fig_frequent_bar, use_container_width=True)

                        if experiment is not None:
                            experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
                else:
                    st.info("No keyphrases found with more than one occurrence to display in tabs.")

                st.divider()

                st.subheader("Treemap of All Keyphrases", divider="rainbow")
                fig_treemap = px.treemap(
                    df,
                    path=[px.Constant("all"), 'entity_group', 'word'],
                    values='score',
                    color='word',
                    color_continuous_scale=px.colors.sequential.Plasma
                )
                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                st.plotly_chart(fig_treemap, use_container_width=True)

                if experiment is not None:
                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

                # --- Download Section ---
                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=_build_results_zip(df, df_frequent),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()
        except Exception as e:
            # Top-level boundary of the request: report the failure in the
            # page instead of letting the script crash.
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            # Always close the Comet experiment, even after a failure.
            if experiment is not None:
                try:
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")
            elapsed_time = time.time() - start_time
            st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")
            st.write(f"Number of times you requested results: **{attempts}**")