Spaces:
Sleeping
Sleeping
import os | |
os.environ['HF_HOME'] = '/tmp' | |
import time | |
import streamlit as st | |
import pandas as pd | |
import io | |
import plotly.express as px | |
import zipfile | |
from streamlit_extras.stylable_container import stylable_container | |
from transformers import pipeline | |
from comet_ml import Experiment | |
# --- App Configuration and Styling ---
# Page-level settings: wide layout and the browser tab title.
_PAGE_SETTINGS = {
    "layout": "wide",
    "page_title": "English Keyphrase",
}
st.set_page_config(**_PAGE_SETTINGS)

# Global CSS overrides: app background/font, pink buttons with a hover lift,
# and a pink text area. Injected as raw HTML, hence unsafe_allow_html below.
_APP_CSS = """
<style>
.stApp {
background-color: #f0f8ff; /* A single, solid color */
color: #000000;
font-family: 'Inter', sans-serif;
}
.stButton > button {
background-color: #FF69B4;
color: #FFFFFF;
font-weight: bold;
border-radius: 12px;
transition: all 0.2s ease-in-out;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.stButton > button:hover {
background-color: #FFB6C1;
box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
transform: translateY(-2px);
}
/* Text Area background and text color */
.stTextArea textarea {
background-color: #FFC0CB; /* A nice pink color */
color: #000000;
border: 1px solid #FF69B4; /* A pink border to match the button */
}
</style>
"""
st.markdown(_APP_CSS, unsafe_allow_html=True)
# --- Comet ML Setup ---
# Credentials come from the environment; a missing variable yields None.
COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME = (
    os.environ.get(var_name)
    for var_name in ("COMET_API_KEY", "COMET_WORKSPACE", "COMET_PROJECT_NAME")
)

# Experiment tracking is enabled only when all three values are non-empty.
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
if not comet_initialized:
    st.warning("Comet ML not initialized. Check environment variables.")
# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Usage notes shown inside a collapsible section.
_IMPORTANT_NOTES = '''**Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map and a bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
For any errors or inquiries, please contact us at [email protected]'''
expander = st.expander("**Important notes**")
expander.write(_IMPORTANT_NOTES)

# Sidebar: embed snippet followed by a promotional link.
# NOTE(review): the sidebar extent is reconstructed (source indentation was
# lost); everything up to the "NER Builder" link is assumed to live in it.
_sidebar = st.sidebar
_sidebar.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
code = '''
<iframe
src="https://aiecosystem-academiaminer.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
_sidebar.code(code, language="html")
# Two empty text elements act as vertical spacing before the divider.
for _ in range(2):
    _sidebar.text("")
_sidebar.divider()
_sidebar.subheader("π Ready to build your own NER Web App?", divider="rainbow")
_sidebar.link_button("NER Builder", "https://nlpblogs.com", type="primary")
# --- Model Loading ---
@st.cache_resource
def _build_keyphrase_pipeline():
    """Construct the keyphrase-extraction pipeline once per server process.

    Wrapped in ``st.cache_resource`` so Streamlit reruns (triggered by every
    widget interaction) reuse the same pipeline object instead of reloading
    the model weights each time.
    """
    return pipeline(
        "token-classification",
        model="ml6team/keyphrase-extraction-kbir-inspec",
        aggregation_strategy="max",
        stride=128,
        ignore_labels=["O"],
    )


def load_ner_model():
    """Return the cached keyphrase-extraction pipeline.

    Returns:
        The token-classification pipeline built by
        ``_build_keyphrase_pipeline``.

    On any loading failure an error is shown in the app and the script run
    is halted with ``st.stop()``, so this function does not return in that
    case. Error handling is kept outside the cached function so a failed
    load is reported on each run rather than being hidden inside the cache.
    """
    try:
        return _build_keyphrase_pipeline()
    except Exception as e:
        st.error(f"Failed to load NER model: {e}")
        st.stop()


model = load_ner_model()
# --- Main App Logic ---
_TEXT_AREA_KEY = 'my_text_area'

text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key=_TEXT_AREA_KEY,
)


def clear_text():
    """Reset the input widget's stored value and the processed flag."""
    # Writing to the widget's session key empties the text area on the
    # rerun that follows the button callback.
    st.session_state[_TEXT_AREA_KEY] = ""
    st.session_state['text_processed'] = False


st.button("Clear text", on_click=clear_text)
# --- Results: extraction, display, download and experiment logging ---

# Glossary text shown under the results table.
_GLOSSARY_MARKDOWN = '''
**word**: ['keyphrase extracted from your text data']
**score**: ['accuracy score; how accurately a tag has been assigned']
**label**: ['label (tag) assigned to a given extracted keyphrase']
**start**: ['index of the start of the corresponding entity']
**end**: ['index of the end of the corresponding entity']
'''


def _entities_to_rows(entities):
    """Flatten pipeline predictions into plain dict rows.

    The pipeline is created with an ``aggregation_strategy``, so each
    prediction exposes its tag under ``entity_group``; the tag is stored
    here under the user-facing column name ``label``.
    """
    return [
        {
            'word': entity['word'],
            'label': entity['entity_group'],
            'score': entity['score'],
            'start': entity['start'],
            'end': entity['end'],
        }
        for entity in entities
    ]


def _clean_keyphrase_frame(rows):
    """Build the results DataFrame and normalise the keyphrase text."""
    frame = pd.DataFrame(rows)
    # Strip punctuation from keyphrases; anything left empty becomes 'Unknown'.
    frame['word'] = frame['word'].replace(r'[^\w\s]', '', regex=True)
    return frame.replace('', 'Unknown')


def _top_keyphrases(frame, limit=15):
    """Return the ``limit`` most frequent keyphrases with their counts."""
    counts = frame['word'].value_counts().reset_index()
    counts.columns = ['word', 'count']
    return counts.sort_values(by='count', ascending=False).head(limit)


def _build_results_zip(frame, frequent, glossary):
    """Pack the three result tables as CSV files into an in-memory zip."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as archive:
        archive.writestr("Summary_of_results.csv", frame.to_csv(index=False))
        archive.writestr("Most_frequent_keyphrases.csv", frequent.to_csv(index=False))
        archive.writestr("Glossary_of_tags.csv", glossary.to_csv(index=False))
    return buf.getvalue()


def _finalize_experiment(experiment, input_text, frame):
    """Log run metadata to Comet and always close the experiment.

    ``frame`` is ``None`` when processing stopped before a results table
    existed; in that case only the input parameters are logged. Logging
    failures are reported as warnings and never prevent ``end()``.
    """
    try:
        experiment.log_parameter("input_source_type", "text_area")
        experiment.log_parameter("input_content_length", len(input_text))
        if frame is not None:
            # NOTE(review): extension added so Comet can pick a table format
            # for the DataFrame - confirm against the log_table documentation.
            experiment.log_table("predicted_entities.csv", frame)
    except Exception as comet_e:
        st.warning(f"Comet ML logging failed: {comet_e}")
    finally:
        try:
            experiment.end()
        except Exception as comet_e:
            st.warning(f"Comet ML experiment.end() failed: {comet_e}")


if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract keyphrases.")
    else:
        start_time_overall = time.time()

        # Initialize Comet ML experiment at the start (best effort).
        experiment = None
        if comet_initialized:
            try:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            except Exception as e:
                st.warning(f"Could not initialize Comet ML experiment: {e}")
                experiment = None

        # Bound before the try so the finally clause can tell whether a
        # results table was ever produced.
        df = None
        try:
            with st.spinner("Analyzing text..."):
                # The pipeline model returns a list of dictionaries.
                entities = model(text)

            data = _entities_to_rows(entities)
            if not data:
                st.warning("No keyphrases found in the text.")
                st.stop()

            # --- Data Cleaning and Processing ---
            df = _clean_keyphrase_frame(data)

            # --- All Extracted Keyphrases ---
            st.subheader("All Extracted Keyphrases", divider="rainbow")
            st.dataframe(df, use_container_width=True)
            with st.expander("See Glossary of tags"):
                st.write(_GLOSSARY_MARKDOWN)

            # --- Most Frequent Keyphrases ---
            st.subheader("Most Frequent Keyphrases", divider="rainbow")
            df_frequent = _top_keyphrases(df)
            if not df_frequent.empty:
                tab1, tab2 = st.tabs(["Table", "Chart"])
                with tab1:
                    st.dataframe(df_frequent, use_container_width=True)
                with tab2:
                    fig_frequent_bar = px.bar(
                        df_frequent,
                        x='count',
                        y='word',
                        orientation='h',
                        title='Top Frequent Keyphrases by Count',
                        color='count',
                        color_continuous_scale=px.colors.sequential.Viridis,
                    )
                    fig_frequent_bar.update_layout(
                        yaxis={'categoryorder': 'total ascending'},
                        paper_bgcolor='#f0f8ff',  # background of the entire figure
                        plot_bgcolor='#f0f8ff',  # background of the plotting area
                    )
                    st.plotly_chart(fig_frequent_bar, use_container_width=True)
                    if experiment is not None:
                        experiment.log_figure(
                            figure=fig_frequent_bar,
                            figure_name="frequent_keyphrases_bar_chart",
                        )
            else:
                st.info("No keyphrases found with more than one occurrence.")

            # --- Treemap of All Keyphrases ---
            st.subheader("Treemap of All Keyphrases", divider="rainbow")
            # The DataFrame column holding the tag is 'label'.
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("all"), 'label', 'word'],
                values='score',
                color='word',
                color_continuous_scale=px.colors.sequential.Plasma,
            )
            fig_treemap.update_layout(
                margin=dict(t=50, l=25, r=25, b=25),
                paper_bgcolor='#f0f8ff',
                plot_bgcolor='#f0f8ff',
            )
            st.plotly_chart(fig_treemap, use_container_width=True)
            if experiment is not None:
                experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

            # --- Download Section ---
            dfa = pd.DataFrame(
                data={
                    'Column Name': ['word', 'label', 'score', 'start', 'end'],
                    'Description': [
                        'keyphrase extracted from your text data',
                        'label (tag) assigned to a given keyphrase',
                        'accuracy score; how accurately a tag has been assigned',
                        'index of the start of the corresponding entity',
                        'index of the end of the corresponding entity',
                    ],
                }
            )
            zip_bytes = _build_results_zip(df, df_frequent, dfa)
            with stylable_container(
                key="download_button",
                css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
            ):
                st.download_button(
                    label="Download zip file",
                    data=zip_bytes,
                    file_name="nlpblogs_ner_results.zip",
                    mime="application/zip",
                )
            st.divider()
        except Exception as e:
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            # Runs on success, on error, and when st.stop() aborts the run.
            if experiment is not None:
                _finalize_experiment(experiment, text, df)

        # Show elapsed time
        end_time_overall = time.time()
        elapsed_time_overall = end_time_overall - start_time_overall
        st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")