Spaces:
Build error
Build error
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| import spacy | |
| import transformers | |
| import os | |
| from spacy.lang.en import English | |
| from transformers import AutoModel, AutoTokenizer | |
| from utils.utils import * | |
# Turn off the transformers progress bars.
transformers.utils.logging.disable_progress_bar()
# Download the spaCy package by its full name: it is the one loaded further
# down via spacy.load('en_core_web_sm'), and the short `en` alias is
# deprecated since spaCy v3.
os.system("python3 -m spacy download en_core_web_sm")
# Page header: heading plus a short description (user-facing text is Russian).
st.markdown(
    """### TL;DR: give me the keywords!
Здесь вы можете получить отранжированный список ключевых слов по названию и аннотации статьи.
Единственным поддерживаемым языком является английский."""
)
# Centered GIF embedded as raw HTML, hence unsafe_allow_html.
st.markdown(
    "<p style=\"text-align:center\"><img width=100% src='https://c.tenor.com/IKt-6tAk9CUAAAAd/thats-a-lot-of-words-lots-of-words.gif'></p>",
    unsafe_allow_html=True,
)
#from transformers import pipeline
#pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
#st.markdown("#### Title:")

# Article title input, pre-filled with a sample value.
title = st.text_area(
    label="Заголовок:",
    value="How to cook a neural network",
    height=16,
    help="Заголовок статьи",
)
# Article abstract input, pre-filled with a sample value.
abstract = st.text_area(
    label="Аннотация:",
    value="""My dad fits hellish models in general.
Well, this is about an average recipe, because there are a lot of variations.
The model is taken, it is not finetuned, finetuning is not about my dad.
He takes this model, dumps it into the tensorboard and starts frying it.
Adds a huge amount of noize, convolutions, batch and spectral normalization DROPOUT! for regularization, maxpooling on top.
All this is fitted to smoke.
Then the computer is removed from the fire and cools on the balcony.
Then dad brings it in and generously sprinkles it with crossvalidation and starts predicting.
At the same time, he gets data from the web, scraping it with a fork.
Predicts and sentences in a half-whisper oh god.
At the same time, he has sweat on his forehead.
Kindly offers me sometimes, but I refuse.
Do I need to talk about what the wildest overfitting then?
The overfitting is such that the val loss peels off the walls.
""",
    height=512,
    help="Аннотация статьи",
)
# spaCy
def get_nlp(nlp_name):
    """Load the spaCy pipeline named ``nlp_name`` and return it."""
    pipeline = spacy.load(nlp_name)
    return pipeline
# Ideally we would use a pipeline tailored to scientific text,
# but that is left for later, if time permits.
nlp_name = "en_core_web_sm"
main_nlp = get_nlp(nlp_name)
# Model loading.
#@st.cache(hash_funcs={transformers.tokenizers.Tokenizer: lambda _: None})
def get_model_and_tokenizer(model_name):
    """Return a ``(model, tokenizer)`` pair loaded from ``model_name``.

    The model is loaded first, then the tokenizer, both through the
    transformers ``Auto*`` classes.
    """
    return (
        AutoModel.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )
# Model and tokenizer that are handed to get_keywords() below.
model_name = "distilroberta-base"
main_model, main_tokenizer = get_model_and_tokenizer(model_name)

# Text processing: the title and abstract are joined into a single document.
# preprocess() is called with a one-element list and its first result is used.
text = preprocess([title + ". " + abstract])[0]
if text is not None and len(text) > 0:
    # Each returned item is indexed as (phrase, score) below.
    keywords = get_keywords(text, main_nlp, main_model, main_tokenizer)
    # Put every word of a phrase on its own line to keep tick labels narrow.
    labels = [kw[0].replace(' ', '\n') for kw in keywords]
    scores = [kw[1] for kw in keywords]

    # Top 5 keywords, or fewer when fewer were produced.
    # (The original called len(labels, top), which raises TypeError.)
    top = min(len(labels), 5)
    st.markdown("Топ %d ключевых слов: **%s**" % (top, ', '.join(labels[:top])))

    # Keyword importance chart.
    fig, ax = plt.subplots(figsize=(8, 16))
    ax.set_title("95% самых важных ключевых слов")
    ax.grid(color='#000000', alpha=0.15, linestyle='-', linewidth=1, which='major')
    ax.grid(color='#000000', alpha=0.1, linestyle='-', linewidth=0.5, which='minor')
    bar_width = 0.75
    # Negated positions place the first keyword at the top of the chart.
    # A plain list avoids depending on `np`, which this file never imports
    # explicitly.
    indexes = [-i for i in range(len(labels))]
    ax.barh(indexes, scores, bar_width)
    # Set ticks on the explicit axes instead of pyplot's "current axes" state.
    ax.set_yticks(indexes)
    ax.set_yticklabels(labels)
    st.pyplot(fig)
    # Release the figure: the script is re-run on every interaction, and
    # unclosed figures would pile up in pyplot's registry.
    plt.close(fig)
else:
    st.markdown("Please, try to enter something.")