import os
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import pandas as pd
import io
import plotly.express as px
import zipfile
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline
from comet_ml import Experiment
# --- App Configuration and Styling ---
# Page-level settings: wide layout and the browser-tab title.
st.set_page_config(
    page_title="English Keyphrase",
    layout="wide",
)
# Slot for custom HTML/CSS; the markup passed here is currently just a newline.
st.markdown(
    """
""",
    unsafe_allow_html=True,
)

# --- Comet ML Setup ---
# Credentials are read from the environment. Experiment logging is enabled
# only when all three values are present and non-empty.
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
if not comet_initialized:
    st.warning("Comet ML not initialized. Check environment variables.")
# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Collapsible usage notes shown directly under the header.
expander = st.expander("**Important notes**")
with expander:
    st.write('''**Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map and a bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
For any errors or inquiries, please contact us at info@nlpblogs.com''')

# Sidebar: embed snippet followed by a promotional link.
with st.sidebar:
    st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
    # The embed snippet is currently just a newline; it is rendered as an HTML code box.
    code = '''
'''
    st.code(code, language="html")
    # Two empty text elements act as vertical spacing before the divider.
    for _ in range(2):
        st.text("")
    st.divider()
    st.subheader("🚀 Ready to build your own AI Web App?", divider="rainbow")
    st.link_button("AI Web App Builder", "https://nlpblogs.com/custom-web-app-development/", type="primary")
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Build the keyphrase-extraction pipeline; the result is cached by Streamlit.

    Returns:
        The ``token-classification`` pipeline for
        ``ml6team/keyphrase-extraction-kbir-inspec``.

    If building the pipeline raises, the error is shown in the app and the
    script run is stopped via ``st.stop()``.
    """
    pipeline_options = {
        "model": "ml6team/keyphrase-extraction-kbir-inspec",
        "aggregation_strategy": "max",
        "stride": 128,
        "ignore_labels": ["O"],
    }
    try:
        extractor = pipeline("token-classification", **pipeline_options)
    except Exception as e:
        st.error(f"Failed to load NER model: {e}")
        st.stop()
    else:
        return extractor


# Module-level handle used by the "Results" handler below.
model = load_ner_model()
# --- Main App Logic ---
def clear_text():
    """Button callback: empty the text area and reset the processed flag."""
    st.session_state.text_processed = False
    # 'my_text_area' is the widget key of the text area below, so writing to
    # it here replaces the widget's content.
    st.session_state['my_text_area'] = ""


text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    key='my_text_area',
    height=250,
)
st.button("Clear text", on_click=clear_text)
# Run the extraction only when the user presses "Results".
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()
        # Bound before the try block so the finally clause can tell whether a
        # results table was ever produced (it is not when the model call
        # fails or when no keyphrases are found).
        df = None
        # Start a Comet ML experiment up front; a failure here is reported
        # but does not prevent the extraction from running.
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None
        try:
            with st.spinner("Analyzing text..."):
                # The pipeline returns a list of dictionaries, one per keyphrase.
                entities = model(text)
                # The pipeline is built with an aggregation strategy, so each
                # result carries its tag under 'entity_group'. It is stored
                # in the table under the column name 'label'.
                data = [
                    {
                        'word': entity['word'],
                        'label': entity['entity_group'],
                        'score': entity['score'],
                        'start': entity['start'],
                        'end': entity['end'],
                    }
                    for entity in entities
                ]
            if not data:
                st.warning("No keyphrases found in the text.")
                # Halts this script run; the finally clause below still
                # executes and closes the experiment.
                st.stop()
            df = pd.DataFrame(data)

            # --- Data Cleaning and Processing ---
            # Strip punctuation from the keyphrases and label any value that
            # ends up empty as 'Unknown'.
            pattern = r'[^\w\s]'
            df['word'] = df['word'].replace(pattern, '', regex=True)
            df = df.replace('', 'Unknown')

            # --- All Extracted Keyphrases ---
            st.subheader("All Extracted Keyphrases", divider="rainbow")
            st.dataframe(df, use_container_width=True)
            with st.expander("See Glossary of tags"):
                st.write('''
**word**: ['keyphrase extracted from your text data']
**score**: ['accuracy score; how accurately a tag has been assigned']
**label**: ['label (tag) assigned to a given extracted keyphrase']
**start**: ['index of the start of the corresponding entity']
**end**: ['index of the end of the corresponding entity']
''')

            # --- Most Frequent Keyphrases ---
            st.subheader("Most Frequent Keyphrases", divider="rainbow")
            word_counts = df['word'].value_counts().reset_index()
            word_counts.columns = ['word', 'count']
            # Keep the 15 most frequent keyphrases.
            df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
            if not df_frequent.empty:
                tab1, tab2 = st.tabs(["Table", "Chart"])
                with tab1:
                    st.dataframe(df_frequent, use_container_width=True)
                with tab2:
                    fig_frequent_bar = px.bar(
                        df_frequent,
                        x='count',
                        y='word',
                        orientation='h',
                        title='Top Frequent Keyphrases by Count',
                        color='count',
                        color_continuous_scale=px.colors.sequential.Viridis,
                    )
                    fig_frequent_bar.update_layout(
                        yaxis={'categoryorder': 'total ascending'},
                        paper_bgcolor='#f0f8ff',  # background of the entire figure
                        plot_bgcolor='#f0f8ff',  # background of the plotting area
                    )
                    st.plotly_chart(fig_frequent_bar, use_container_width=True)
                    if experiment:
                        experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
            else:
                st.info("No keyphrases found with more than one occurrence.")

            # --- Treemap of All Keyphrases ---
            st.subheader("Treemap of All Keyphrases", divider="rainbow")
            # Hierarchy: constant root -> 'label' column -> keyphrase, sized by score.
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("all"), 'label', 'word'],
                values='score',
                color='word',
                color_continuous_scale=px.colors.sequential.Plasma,
            )
            fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#f0f8ff', plot_bgcolor='#f0f8ff')
            st.plotly_chart(fig_treemap, use_container_width=True)
            if experiment:
                experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

            # --- Download Section ---
            # Glossary table bundled into the zip next to the results.
            dfa = pd.DataFrame(
                data={
                    'Column Name': ['word', 'label', 'score', 'start', 'end'],
                    'Description': [
                        'keyphrase extracted from your text data',
                        'label (tag) assigned to a given keyphrase',
                        'accuracy score; how accurately a tag has been assigned',
                        'index of the start of the corresponding entity',
                        'index of the end of the corresponding entity',
                    ],
                }
            )
            # Build the archive in memory; nothing is written to disk.
            buf = io.BytesIO()
            with zipfile.ZipFile(buf, "w") as myzip:
                myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
                myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
            with stylable_container(
                key="download_button",
                css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
            ):
                st.download_button(
                    label="Download zip file",
                    data=buf.getvalue(),
                    file_name="nlpblogs_ner_results.zip",
                    mime="application/zip",
                )
            st.divider()
        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            if experiment:
                # Logging and ending are guarded separately so that a logging
                # failure can never leave the experiment open.
                try:
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    # Only log the table when one was actually built.
                    if df is not None:
                        # NOTE(review): Comet's log_table documentation asks
                        # for a filename with a table extension - confirm
                        # against the installed comet_ml version.
                        experiment.log_table("predicted_entities.csv", df)
                except Exception as comet_e:
                    st.warning(f"Comet ML logging failed: {comet_e}")
                finally:
                    try:
                        experiment.end()
                    except Exception as comet_e:
                        st.warning(f"Comet ML experiment.end() failed: {comet_e}")
        # Show elapsed time
        end_time_overall = time.time()
        elapsed_time_overall = end_time_overall - start_time_overall
        st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")