AcademiaMiner / src /streamlit_app.py
AIEcosystem's picture
Update src/streamlit_app.py
19caa3e verified
raw
history blame
14.2 kB
import os
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import pandas as pd
import io
import plotly.express as px
import zipfile
import json
from cryptography.fernet import Fernet
from streamlit_extras.stylable_container import stylable_container
from typing import Optional
from gliner import GLiNER
from comet_ml import Experiment
from transformers import pipeline
# Page configuration comes first: Streamlit documents st.set_page_config as
# needing to be the first Streamlit command executed on a page, so it must
# precede the st.markdown call that injects the custom CSS below.
st.set_page_config(
    layout="wide",
    page_title="English Keyphrase",
)

# Inject the app-wide custom CSS (backgrounds, expander, text area, buttons,
# alert boxes and the gradient title). unsafe_allow_html is required so the
# <style> tag is emitted as raw HTML rather than escaped text.
st.markdown(
    """
<style>
/* Main app background with a subtle rainbow gradient */
.stApp {
background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
color: #000000;
font-family: 'Inter', sans-serif;
}
/* Rainbow gradient for the sidebar */
.css-1d36184, .css-1d36184:hover, .css-1d36184:focus {
background: linear-gradient(180deg, #FFC0CB, #FFD700, #98FB98, #ADD8E6, #BA55D3);
secondary-background-color: #FFC080;
}
/* Expander background color with a slight transparency */
.streamlit-expanderContent {
background-color: rgba(255, 255, 255, 0.7);
border-radius: 10px;
}
/* Expander header with a gentle gradient and bold text */
.streamlit-expanderHeader {
background: linear-gradient(90deg, #FADADD, #FFF9E0, #E0FFF8);
border-radius: 10px;
font-weight: bold;
}
/* Text Area with a light background and subtle border */
.stTextArea textarea {
background-color: #FFF0F5;
color: #000000;
border: 1px solid #ccc;
border-radius: 8px;
}
/* Button with a solid color and elegant hover effect */
.stButton > button {
background-color: #FF69B4;
color: #FFFFFF;
font-weight: bold;
border-radius: 12px;
transition: all 0.2s ease-in-out;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.stButton > button:hover {
background-color: #FFB6C1;
box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
transform: translateY(-2px);
}
/* Warning box with a soft orange and rounded corners */
.stAlert.st-warning {
background-color: #FFDDAA;
color: #000000;
border-radius: 10px;
border-left: 5px solid #FFA500;
}
/* Success box with a fresh green and rounded corners */
.stAlert.st-success {
background-color: #D4EDDA;
color: #155724;
border-radius: 10px;
border-left: 5px solid #28A745;
}
/* Custom CSS to make the title text rainbow-colored */
h1 {
background: linear-gradient(45deg, #FF69B4, #FFD700, #00FF7F, #00BFFF, #8A2BE2);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 3em;
font-weight: 800;
}
</style>
""",
    unsafe_allow_html=True,
)
# --- Comet ML Setup ---
# Credentials are read from the environment. Experiment tracking is switched
# on only when the API key, workspace and project name are all non-empty.
COMET_API_KEY = os.getenv("COMET_API_KEY")
COMET_WORKSPACE = os.getenv("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
if not comet_initialized:
    st.warning("Comet ML not initialized. Check environment variables.")
# --- UI Header and Notes ---
# App title, author link, and a collapsible panel with usage notes.
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
# The label needs a closing "**"; with a single trailing "*" the Markdown bold
# marker is unbalanced and the asterisks are rendered literally.
expander = st.expander("**Important notes**")
expander.write('''
**Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
For any errors or inquiries, please contact us at [email protected]'''
)
# Sidebar: an embeddable <iframe> snippet for the app, followed by a
# call-to-action link.
with st.sidebar:
    st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
    code = '''
<iframe
src="https://aiecosystem-business-core.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
    st.code(code, language="html")
    # Two empty text elements act as vertical spacing before the divider.
    for _ in range(2):
        st.text("")
    st.divider()
    st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
    st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
@st.cache_resource
def load_ner_model():
    """Load the keyphrase-extraction pipeline and cache it across reruns.

    Returns:
        A Hugging Face ``token-classification`` pipeline for the
        ``ml6team/keyphrase-extraction-kbir-inspec`` model. Tokens labelled
        ``"O"`` are ignored and sub-tokens are merged with the ``"max"``
        aggregation strategy, so each result carries ``word``,
        ``entity_group``, ``score``, ``start`` and ``end``.

    If loading fails, an error is shown in the UI and the script run is
    halted via ``st.stop()``.
    """
    # NOTE: an earlier GLiNER-based loader with the same name used to run
    # first. It referenced an undefined ``labels`` name, so its NameError was
    # caught and st.stop() halted the app before this pipeline was ever
    # created; its result was also immediately overwritten. Only the loader
    # whose output format the results code consumes is kept.
    try:
        return pipeline(
            "token-classification",
            model="ml6team/keyphrase-extraction-kbir-inspec",
            aggregation_strategy="max",
            stride=128,
            ignore_labels=["O"],
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()


model = load_ner_model()
def clear_text():
    """Blank the text area by resetting its session-state entry."""
    st.session_state.my_text_area = ""


# Input widgets: the text area is keyed so the "Clear text" callback can
# reset it through session state before the next rerun.
text = st.text_area(
    "Type or paste your text below, and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)
st.button("Clear text", on_click=clear_text)
def _entities_to_frame(entities):
    """Turn the pipeline output into a cleaned DataFrame of keyphrases.

    Args:
        entities: Iterable of dicts produced by the token-classification
            pipeline (may be empty or None).

    Returns:
        A DataFrame with columns ``word``, ``entity_group``, ``score``,
        ``start`` and ``end``. Punctuation is stripped from ``word`` and
        empty strings are replaced with ``'Unknown'``. Entities missing any
        expected key are skipped with a UI warning.
    """
    columns = ['word', 'entity_group', 'score', 'start', 'end']
    rows = []
    for entity in entities or []:
        if all(key in entity for key in columns):
            rows.append({key: entity[key] for key in columns})
        else:
            st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
    frame = pd.DataFrame(rows, columns=columns)
    if not frame.empty:
        # Drop punctuation from the extracted phrases; a phrase that was
        # punctuation only becomes '' and is then labelled 'Unknown'.
        frame['word'] = frame['word'].replace(r'[^\w\s]', '', regex=True)
        frame = frame.replace('', 'Unknown')
    return frame


def _build_results_zip(df, df_frequent):
    """Return the bytes of a zip holding the results, frequency and glossary CSVs."""
    dfa = pd.DataFrame(
        data={
            'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
            'Description': [
                'entity extracted from your text data',
                'label (tag) assigned to a given extracted entity',
                'accuracy score; how accurately a tag has been assigned to a given entity',
                'index of the start of the corresponding entity',
                'index of the end of the corresponding entity',
            ],
        }
    )
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
        myzip.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
        myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
    return buf.getvalue()


# --- Results ---
# Runs the keyphrase model on the text-area content, renders the tables and
# charts, optionally logs the run to Comet ML, and offers a zip download.
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
    else:
        start_time = time.time()
        experiment = None
        try:
            with st.spinner("Analyzing text...", show_time=True):
                entities = model(text)
            df = _entities_to_frame(entities)

            if df.empty:
                st.warning("No entities found to generate visualizations.")
            else:
                st.subheader("All Extracted Keyphrases", divider="rainbow")
                st.dataframe(df, use_container_width=True)
                with st.expander("See Glossary of tags"):
                    st.write('''
**word**: ['entity extracted from your text data']
**score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
**entity_group**: ['label (tag) assigned to a given extracted entity']
**start**: ['index of the start of the corresponding entity']
**end**: ['index of the end of the corresponding entity']
''')
                st.divider()

                st.subheader("Most Frequent Keyphrases", divider="rainbow")
                word_counts = df['word'].value_counts().reset_index()
                word_counts.columns = ['word', 'count']
                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
                fig_frequent_bar = None
                if not df_frequent.empty:
                    tab1, tab2 = st.tabs(["Table", "Chart"])
                    with tab1:
                        st.dataframe(df_frequent, use_container_width=True)
                    with tab2:
                        fig_frequent_bar = px.bar(
                            df_frequent,
                            x='count',
                            y='word',
                            orientation='h',
                            title='Top Frequent Keyphrases by Count',
                            color='count',
                            color_continuous_scale=px.colors.sequential.Viridis,
                        )
                        fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})
                        st.plotly_chart(fig_frequent_bar, use_container_width=True)
                else:
                    st.info("No keyphrases found with more than one occurrence to display in tabs.")
                st.divider()

                if comet_initialized:
                    experiment = Experiment(
                        api_key=COMET_API_KEY,
                        workspace=COMET_WORKSPACE,
                        project_name=COMET_PROJECT_NAME,
                    )
                    # The text area is the only input source in this app.
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    experiment.log_table("predicted_entities", df)
                    # The bar chart is rendered before the experiment exists,
                    # so it is logged here, once the experiment is available.
                    if fig_frequent_bar is not None:
                        experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")

                st.subheader("Treemap of All Keyphrases", divider="rainbow")
                fig_treemap = px.treemap(
                    df,
                    path=[px.Constant("all"), 'entity_group', 'word'],
                    values='score',
                    color='word',
                    color_continuous_scale=px.colors.sequential.Plasma,
                )
                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                st.plotly_chart(fig_treemap, use_container_width=True)
                if experiment is not None:
                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

                # --- Download Section ---
                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=_build_results_zip(df, df_frequent),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )
                st.divider()
        except Exception as e:
            # Top-level boundary for the request: surface the failure in the
            # UI instead of crashing the script run.
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            # Always close the Comet experiment, even if processing failed.
            if experiment is not None:
                try:
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")
            elapsed_time = time.time() - start_time
            st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")