# Hugging Face Spaces page banner captured with this file (Space status: Sleeping); not part of the program.
import os | |
os.environ['HF_HOME'] = '/tmp' | |
import time | |
import streamlit as st | |
import pandas as pd | |
import io | |
import plotly.express as px | |
import zipfile | |
import json | |
from cryptography.fernet import Fernet | |
from streamlit_extras.stylable_container import stylable_container | |
from typing import Optional | |
from gliner import GLiNER | |
from comet_ml import Experiment | |
from transformers import pipeline | |
# Page configuration comes first: Streamlit documents st.set_page_config as
# the first Streamlit command of a page, so it must precede the CSS injection.
st.set_page_config(
    layout="wide",
    page_title="English Keyphrase"
)

# Inject the app-wide theme (background, sidebar, expanders, text area,
# buttons, alert boxes and the rainbow title) as raw CSS.
st.markdown(
    """
<style>
/* Main app background with a subtle rainbow gradient */
.stApp {
background: linear-gradient(135deg, #f0f8ff, #f5f0ff, #fff0f5);
color: #000000;
font-family: 'Inter', sans-serif;
}
/* Rainbow gradient for the sidebar */
.css-1d36184, .css-1d36184:hover, .css-1d36184:focus {
background: linear-gradient(180deg, #FFC0CB, #FFD700, #98FB98, #ADD8E6, #BA55D3);
secondary-background-color: #FFC080;
}
/* Expander background color with a slight transparency */
.streamlit-expanderContent {
background-color: rgba(255, 255, 255, 0.7);
border-radius: 10px;
}
/* Expander header with a gentle gradient and bold text */
.streamlit-expanderHeader {
background: linear-gradient(90deg, #FADADD, #FFF9E0, #E0FFF8);
border-radius: 10px;
font-weight: bold;
}
/* Text Area with a light background and subtle border */
.stTextArea textarea {
background-color: #FFF0F5;
color: #000000;
border: 1px solid #ccc;
border-radius: 8px;
}
/* Button with a solid color and elegant hover effect */
.stButton > button {
background-color: #FF69B4;
color: #FFFFFF;
font-weight: bold;
border-radius: 12px;
transition: all 0.2s ease-in-out;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.stButton > button:hover {
background-color: #FFB6C1;
box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
transform: translateY(-2px);
}
/* Warning box with a soft orange and rounded corners */
.stAlert.st-warning {
background-color: #FFDDAA;
color: #000000;
border-radius: 10px;
border-left: 5px solid #FFA500;
}
/* Success box with a fresh green and rounded corners */
.stAlert.st-success {
background-color: #D4EDDA;
color: #155724;
border-radius: 10px;
border-left: 5px solid #28A745;
}
/* Custom CSS to make the title text rainbow-colored */
h1 {
background: linear-gradient(45deg, #FF69B4, #FFD700, #00FF7F, #00BFFF, #8A2BE2);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 3em;
font-weight: 800;
}
</style>
""",
    unsafe_allow_html=True
)
# --- Comet ML Setup ---
# Comet credentials are read from the environment; experiment logging is
# enabled only when all three values are present and non-empty.
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
if not comet_initialized:
    # The app keeps working without Comet; the user is only informed.
    st.warning("Comet ML not initialized. Check environment variables.")
# --- UI Header and Notes ---
st.subheader("AcademiaMiner", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
# The label needs a closing "**"; with a single "*" the bold markup is
# unbalanced and stray asterisks are rendered.
expander = st.expander("**Important notes**")
expander.write('''
**Named Entities:** This AcademiaMiner extracts keyphrases from English academic and scientific papers.
Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
**Usage Limits:** You can request results unlimited times for one (1) month.
**Supported Languages:** English
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
For any errors or inquiries, please contact us at [email protected]'''
)
# Sidebar: an embeddable <iframe> snippet for the app, followed by a link to
# the NER builder.
# NOTE(review): the original indentation was lost; all statements down to the
# link button are assumed to belong inside the sidebar - confirm.
with st.sidebar:
    st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
    code = '''
<iframe
src="https://aiecosystem-business-core.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
    st.code(code, language="html")
    # Two empty text elements act as vertical spacing before the divider.
    st.text("")
    st.text("")
    st.divider()
    st.subheader("🚀 Ready to build your own NER Web App?", divider="rainbow")
    st.link_button("NER Builder", "https://nlpblogs.com", type="primary")
@st.cache_resource
def load_ner_model():
    """Load the keyphrase-extraction pipeline once and reuse it across reruns.

    Returns:
        A transformers token-classification pipeline for
        "ml6team/keyphrase-extraction-kbir-inspec". Calling it on a string
        yields the entity dicts the results handler reads ('word',
        'entity_group', 'score', 'start', 'end').

    The earlier GLiNER variant of this loader was removed: it referenced an
    undefined name (`labels`), so it always failed and stopped the app before
    the pipeline loader below it could run.
    """
    return pipeline(
        "token-classification",
        model="ml6team/keyphrase-extraction-kbir-inspec",
        aggregation_strategy="max",
        stride=128,
        ignore_labels=["O"],
    )


# Error handling sits outside the cached function so a failed load is reported
# on every run instead of being replayed from the cache.
try:
    model = load_ner_model()
except Exception as e:
    st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
    st.stop()
# Free-text input; the widget value is also stored in session state under
# 'my_text_area' so the clear callback can reset it.
text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", height=250, key='my_text_area')


def clear_text():
    """Reset the text area by blanking its session-state entry."""
    st.session_state['my_text_area'] = ""


# The reset runs as an on_click callback of the button.
st.button("Clear text", on_click=clear_text)
# Keys every well-formed pipeline entity must provide; also the table layout.
_ENTITY_COLUMNS = ['word', 'entity_group', 'score', 'start', 'end']

# Glossary shown in the expander below the results table.
_GLOSSARY_MARKDOWN = '''
**word**: ['entity extracted from your text data']
**score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
**entity_group**: ['label (tag) assigned to a given extracted entity']
**start**: ['index of the start of the corresponding entity']
**end**: ['index of the end of the corresponding entity']
'''


def _entities_to_frame(entities):
    """Build the results table from the pipeline output.

    Entries missing any expected key are skipped with a warning instead of
    aborting the run. An empty or missing result yields an empty frame that
    still carries the expected columns.
    """
    rows = []
    for entity in entities or []:
        if all(key in entity for key in _ENTITY_COLUMNS):
            rows.append({key: entity[key] for key in _ENTITY_COLUMNS})
        else:
            st.warning(f"Skipping malformed entity encountered: {entity}. Missing expected keys.")
    return pd.DataFrame(rows, columns=_ENTITY_COLUMNS)


def _clean_words(df):
    """Strip punctuation from the keyphrases and label emptied cells 'Unknown'."""
    cleaned = df.copy()
    cleaned['word'] = cleaned['word'].replace(r'[^\w\s]', '', regex=True)
    return cleaned.replace('', 'Unknown')


def _top_keyphrases(df, limit=15):
    """Return the `limit` most frequent keyphrases with their counts."""
    counts = df['word'].value_counts().reset_index()
    counts.columns = ['word', 'count']
    return counts.sort_values(by='count', ascending=False).head(limit)


def _glossary_frame():
    """Return the column glossary that is shipped inside the download archive."""
    return pd.DataFrame(
        data={
            'Column Name': ['word', 'entity_group', 'score', 'start', 'end'],
            'Description': [
                'entity extracted from your text data',
                'label (tag) assigned to a given extracted entity',
                'accuracy score; how accurately a tag has been assigned to a given entity',
                'index of the start of the corresponding entity',
                'index of the end of the corresponding entity'
            ]
        }
    )


def _build_results_zip(df, df_frequent, glossary):
    """Pack the three result tables as CSV files into an in-memory zip archive."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as archive:
        archive.writestr("Summary_of_results.csv", df.to_csv(index=False))
        archive.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
        archive.writestr("Glossary_of_tags.csv", glossary.to_csv(index=False))
    return buf.getvalue()


def _show_results(df, experiment):
    """Render tables, charts and the download button for a non-empty result.

    Args:
        df: cleaned results table with the columns in `_ENTITY_COLUMNS`.
        experiment: an active Comet experiment, or None when Comet logging
            is disabled; figures are logged only when it is not None.
    """
    st.subheader("All Extracted Keyphrases", divider="rainbow")
    st.dataframe(df, use_container_width=True)
    with st.expander("See Glossary of tags"):
        st.write(_GLOSSARY_MARKDOWN)
    st.divider()

    st.subheader("Most Frequent Keyphrases", divider="rainbow")
    df_frequent = _top_keyphrases(df)
    if not df_frequent.empty:
        tab1, tab2 = st.tabs(["Table", "Chart"])
        with tab1:
            st.dataframe(df_frequent, use_container_width=True)
        with tab2:
            fig_frequent_bar = px.bar(
                df_frequent,
                x='count',
                y='word',
                orientation='h',
                title='Top Frequent Keyphrases by Count',
                color='count',
                color_continuous_scale=px.colors.sequential.Viridis
            )
            fig_frequent_bar.update_layout(yaxis={'categoryorder': 'total ascending'})
            st.plotly_chart(fig_frequent_bar, use_container_width=True)
            if experiment is not None:
                experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
    else:
        st.info("No keyphrases found with more than one occurrence to display in tabs.")
    st.divider()

    st.subheader("Treemap of All Keyphrases", divider="rainbow")
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("all"), 'entity_group', 'word'],
        values='score',
        color='word',
        color_continuous_scale=px.colors.sequential.Plasma
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    st.plotly_chart(fig_treemap, use_container_width=True)
    if experiment is not None:
        experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")

    # --- Download Section ---
    archive_bytes = _build_results_zip(df, df_frequent, _glossary_frame())
    with stylable_container(
        key="download_button",
        css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
    ):
        st.download_button(
            label="Download zip file",
            data=archive_bytes,
            file_name="nlpblogs_ner_results.zip",
            mime="application/zip",
        )
    st.divider()


# Main action: run keyphrase extraction on the text-area content and render
# the results. The text area is the only input source of this app.
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
    else:
        start_time = time.time()
        # Per-session request counter, created on first use so the summary
        # line below never reads a missing session-state key.
        st.session_state['source_type_attempts'] = st.session_state.get('source_type_attempts', 0) + 1
        experiment = None
        try:
            # The experiment is created before any chart is drawn so that
            # every figure can be logged to it.
            if comet_initialized:
                experiment = Experiment(
                    api_key=COMET_API_KEY,
                    workspace=COMET_WORKSPACE,
                    project_name=COMET_PROJECT_NAME,
                )
            with st.spinner("Analyzing text...", show_time=True):
                entities = model(text)
            df = _entities_to_frame(entities)
            if df.empty:
                st.warning("No entities found to generate visualizations.")
            else:
                df = _clean_words(df)
                if experiment is not None:
                    experiment.log_parameter("input_source_type", "text_area")
                    experiment.log_parameter("input_content_length", len(text))
                    # Comet's log_table expects a file name with an extension.
                    experiment.log_table("predicted_entities.csv", df)
                _show_results(df, experiment)
        except Exception as e:
            # Top-level boundary of the request: report instead of crashing.
            st.error(f"An unexpected error occurred during processing: {e}")
        finally:
            if experiment is not None:
                try:
                    experiment.end()
                except Exception as comet_e:
                    st.warning(f"Comet ML experiment.end() failed: {comet_e}")
            elapsed_time_overall = time.time() - start_time
            st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
            # No maximum is shown: no limit is defined anywhere in this file.
            st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}**")