import os

# Set before `transformers` is imported so the Hugging Face cache location is
# already present in the environment when that library loads.
os.environ['HF_HOME'] = '/tmp'

import time
import streamlit as st
import pandas as pd
import io
import plotly.express as px
import zipfile
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline
from comet_ml import Experiment

# --- App Configuration and Styling ---
st.set_page_config(
    layout="wide",
    page_title="English Keyphrase"
)

# NOTE(review): the markup body is blank here — presumably custom CSS was
# intended; confirm against the deployed app.
st.markdown(
    """ """,
    unsafe_allow_html=True
)

# --- Comet ML Setup ---
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")

# Experiment tracking is optional: it is enabled only when all three
# environment variables are set to non-empty values.
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
if not comet_initialized:
    st.warning("Comet ML not initialized. Check environment variables.")

# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

expander = st.expander("**Important notes**")
# Adjacent literals are concatenated into a single string; the text is kept
# character-for-character.
expander.write(
    "**Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers. "
    "Results are presented in easy-to-read tables, visualized in an interactive tree map and a bar chart, "
    "and are available for download along with a Glossary of tags. "
    "**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. "
    "Click the 'Results' button to extract and tag entities in your text data. "
    "**Usage Limits:** You can request results unlimited times for one (1) month. "
    "**Supported Languages:** English "
    "**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. "
    "For any errors or inquiries, please contact us at info@nlpblogs.com"
)

with st.sidebar:
    # A plain "..." literal cannot contain a raw line break, so the message is
    # built from two adjacent literals instead.
    st.write(
        "Use the following code to embed the AcademiaMiner web app on your website. "
        "Feel free to adjust the width and height values to fit your page."
    )
    # NOTE(review): the embed snippet is blank here — presumably an HTML
    # snippet belongs in this string; confirm.
    code = ''' '''
    st.code(code, language="html")
    st.text("")
    st.text("")
    st.divider()
    st.subheader("🚀 Ready to build your own AI Web App?", divider="rainbow")
    st.link_button(
        "AI Web App Builder",
        "https://nlpblogs.com/custom-web-app-development/",
        type="primary",
    )


# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load the keyphrase-extraction pipeline, cached via ``st.cache_resource``.

    Returns:
        A ``transformers`` token-classification pipeline for
        ``ml6team/keyphrase-extraction-kbir-inspec``.

    If loading fails, the error is shown in the UI and the script run is
    halted with ``st.stop()``.
    """
    try:
        return pipeline(
            "token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
            stride=128,
            ignore_labels=["O"]
        )
    except Exception as e:
        st.error(f"Failed to load NER model: {e}")
        st.stop()


model = load_ner_model()

# --- Main App Logic ---
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)


def clear_text():
    """Reset the text area and the ``text_processed`` flag in session state."""
    st.session_state['my_text_area'] = ""
    st.session_state.text_processed = False


st.button("Clear text", on_click=clear_text)

if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()

        # Initialize Comet ML experiment at the start
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None

        # Bound before the `try` so the `finally` clause can tell whether a
        # results table was produced (inference may fail, or `st.stop()` may
        # fire before the DataFrame exists).
        df = None

        try:
            with st.spinner("Analyzing text...", ):
                # The pipeline returns a list of dictionaries. Because an
                # aggregation strategy is configured, each one carries the
                # tag under 'entity_group'; it is exposed as 'label' here.
                entities = model(text)
                data = [
                    {
                        'word': entity['word'],
                        'label': entity['entity_group'],
                        'score': entity['score'],
                        'start': entity['start'],
                        'end': entity['end'],
                    }
                    for entity in entities
                ]

            if not data:
                st.warning("No keyphrases found in the text.")
                st.stop()

            df = pd.DataFrame(data)

            # --- Data Cleaning and Processing ---
            # Strip punctuation from keyphrases; cells left empty become 'Unknown'.
            pattern = r'[^\w\s]'
            df['word'] = df['word'].replace(pattern, '', regex=True)
            df = df.replace('', 'Unknown')

            # --- All Extracted Keyphrases ---
            st.subheader("All Extracted Keyphrases", divider="rainbow")
            st.dataframe(df, use_container_width=True)

            with st.expander("See Glossary of tags"):
                st.write(
                    " **word**: ['keyphrase extracted from your text data']"
                    " **score**: ['accuracy score; how accurately a tag has been assigned']"
                    " **label**: ['label (tag) assigned to a given extracted keyphrase']"
                    " **start**: ['index of the start of the corresponding entity']"
                    " **end**: ['index of the end of the corresponding entity'] "
                )

            # --- Most Frequent Keyphrases ---
            st.subheader("Most Frequent Keyphrases", divider="rainbow")
            word_counts = df['word'].value_counts().reset_index()
            word_counts.columns = ['word', 'count']
            df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)

            if not df_frequent.empty:
                tab1, tab2 = st.tabs(["Table", "Chart"])
                with tab1:
                    st.dataframe(df_frequent, use_container_width=True)
                with tab2:
                    fig_frequent_bar = px.bar(
                        df_frequent,
                        x='count',
                        y='word',
                        orientation='h',
                        title='Top Frequent Keyphrases by Count',
                        color='count',
                        color_continuous_scale=px.colors.sequential.Viridis
                    )
                    fig_frequent_bar.update_layout(
                        yaxis={'categoryorder': 'total ascending'},
                        paper_bgcolor='#f0f8ff',  # background of the entire figure
                        plot_bgcolor='#f0f8ff'  # background of the plotting area
                    )
                    st.plotly_chart(fig_frequent_bar, use_container_width=True)
                    if experiment:
                        experiment.log_figure(
                            figure=fig_frequent_bar,
                            figure_name="frequent_keyphrases_bar_chart",
                        )
            else:
                st.info("No keyphrases found with more than one occurrence.")

            # --- Treemap of All Keyphrases ---
            st.subheader("Treemap of All Keyphrases", divider="rainbow")
            # The hierarchy uses the DataFrame's 'label' column (filled from
            # the pipeline's 'entity_group' above).
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("all"), 'label', 'word'],
                values='score',
                color='word',
                color_continuous_scale=px.colors.sequential.Plasma
            )
            fig_treemap.update_layout(
                margin=dict(t=50, l=25, r=25, b=25),
                paper_bgcolor='#f0f8ff',
                plot_bgcolor='#f0f8ff',
            )
            st.plotly_chart(fig_treemap, use_container_width=True)
            if experiment:
                experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

            # --- Download Section ---
            dfa = pd.DataFrame(
                data={
                    'Column Name': ['word', 'label', 'score', 'start', 'end'],
                    'Description': [
                        'keyphrase extracted from your text data',
                        'label (tag) assigned to a given keyphrase',
                        'accuracy score; how accurately a tag has been assigned',
                        'index of the start of the corresponding entity',
                        'index of the end of the corresponding entity'
                    ]
                }
            )

            # Bundle the three CSV exports into one in-memory zip archive.
            buf = io.BytesIO()
            with zipfile.ZipFile(buf, "w") as myzip:
                myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

            with stylable_container(
                key="download_button",
                css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
            ):
                st.download_button(
                    label="Download zip file",
                    data=buf.getvalue(),
                    file_name="nlpblogs_ner_results.zip",
                    mime="application/zip",
                )

            st.divider()

        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")

        finally:
            if experiment:
                # Log parameters and the results table (when one exists)
                # before ending the experiment.
                try:
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    if df is not None:
                        experiment.log_table("predicted_entities.csv", df)
                except Exception as comet_e:
                    st.warning(f"Comet ML logging failed: {comet_e}")

                # Ended in its own `try` so a logging failure above can never
                # leave the experiment open.
                try:
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")

        # Show elapsed time
        end_time_overall = time.time()
        elapsed_time_overall = end_time_overall - start_time_overall
        st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")