Spaces:

vonewman
/

ner_app

Runtime error

App Files Files Community

ner_app / app.py

vonewman

Update app.py

fab7e8b almost 2 years ago

raw

history blame

5.6 kB

	import streamlit as st
	import pandas as pd
	import re
	import json
	import transformers
	import torch
	from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

	# Browser-tab title and icon for the app page.
	st.set_page_config(page_title="Named Entity Recognition Wolof", page_icon="📘")

	def convert_df(df: pd.DataFrame):
	    """Serialize *df* to CSV text (header row, no index column) as UTF-8 bytes."""
	    csv_text = df.to_csv(index=False)
	    return csv_text.encode('utf-8')

	def convert_json(df: pd.DataFrame):
	    """Return *df* as a JSON string keyed by row index label.

	    The frame is serialized with ``orient="index"`` and then round-tripped
	    through the ``json`` module, so the output uses ``json.dumps`` default
	    formatting.
	    """
	    rows_by_index = json.loads(df.to_json(orient="index"))
	    return json.dumps(rows_by_index)

	def load_model():
	    """Load the ``vonewman/wolof-finetuned-ner`` checkpoint.

	    Returns a ``(trainer, model, tokenizer)`` tuple: a ``Trainer`` wrapping the
	    token-classification model, the model itself, and its tokenizer.
	    """
	    # NOTE(review): nothing here caches the result, so every call reloads the
	    # checkpoint; consider Streamlit's resource caching if the installed
	    # version supports it.
	    checkpoint = "vonewman/wolof-finetuned-ner"
	    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
	    trainer = Trainer(model=model)
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    return trainer, model, tokenizer

	# Maps the classifier's output indices to entity tags.
	id2tag = {0: 'O', 1: 'B-LOC', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-DATE', 6: 'B-DATE', 7: 'I-ORG', 8: 'I-LOC'}

	# Maximum number of tokens fed to the model; longer inputs are truncated.
	MAX_SEQUENCE_LENGTH = 218


	def _group_token_positions_by_word(word_ids):
	    """Group token positions by the word they belong to.

	    ``word_ids`` is the per-token list produced by ``BatchEncoding.word_ids``:
	    ``None`` for special/padding tokens, otherwise the index of the source
	    word. Returns one list of token positions per word, in sentence order.
	    """
	    groups = []
	    previous_word = None
	    for position, word_index in enumerate(word_ids):
	        if word_index is None:
	            # Special or padding token: belongs to no word.
	            previous_word = None
	            continue
	        if word_index == previous_word:
	            groups[-1].append(position)
	        else:
	            groups.append([position])
	        previous_word = word_index
	    return groups


	def align_word_ids(texts, tokenizer=None):
	    """Build a per-token label mask for *texts*.

	    The text is tokenized (padded/truncated to ``MAX_SEQUENCE_LENGTH``) and
	    each token position gets ``1`` when it is the first sub-token of a word
	    and ``-100`` otherwise (continuation sub-tokens, special tokens, padding).

	    Args:
	        texts: The sentence to tokenize.
	        tokenizer: A fast Hugging Face tokenizer (``word_ids`` is only
	            available on fast tokenizers). When omitted, the tokenizer
	            returned by ``load_model()`` is used.

	    Returns:
	        A list of ``1`` / ``-100`` values, one per token position.
	    """
	    if tokenizer is None:
	        # No module-level tokenizer exists, so load the app's own when the
	        # caller does not supply one.
	        _, _, tokenizer = load_model()

	    encoding = tokenizer(
	        texts,
	        padding='max_length',
	        max_length=MAX_SEQUENCE_LENGTH,
	        truncation=True,
	        return_tensors="pt",
	    )
	    word_ids = encoding.word_ids(batch_index=0)
	    word_starts = {positions[0] for positions in _group_token_positions_by_word(word_ids)}
	    return [1 if position in word_starts else -100 for position in range(len(word_ids))]


	def _tag_words(model, tokenizer, sentence):
	    """Run the model once and return ``(word, tag, probability)`` per word.

	    Each word is labelled from the prediction at its first sub-token;
	    ``probability`` is the softmax score of the chosen tag as a percentage
	    rounded to two decimals.
	    """
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)
	    model.eval()

	    encoding = tokenizer(
	        sentence,
	        max_length=MAX_SEQUENCE_LENGTH,
	        truncation=True,
	        return_tensors="pt",
	    )
	    # Inference only: skip gradient tracking, and keep the inputs on the same
	    # device as the model.
	    with torch.no_grad():
	        outputs = model(
	            input_ids=encoding['input_ids'].to(device),
	            attention_mask=encoding['attention_mask'].to(device),
	        )
	    probs = torch.nn.functional.softmax(outputs.logits[0], dim=-1).cpu()

	    token_ids = encoding['input_ids'][0].tolist()
	    rows = []
	    for positions in _group_token_positions_by_word(encoding.word_ids(batch_index=0)):
	        first = positions[0]
	        tag_index = int(probs[first].argmax())
	        # Decode all sub-tokens of the word together so the full word is shown.
	        word = tokenizer.decode([token_ids[p] for p in positions]).strip()
	        probability = round(probs[first, tag_index].item() * 100, 2)
	        rows.append((word, id2tag[tag_index], probability))
	    return rows


	def predict_ner_labels(model, tokenizer, sentence):
	    """Return the predicted entity tag for every word of *sentence*, in order."""
	    return [tag for _, tag, _ in _tag_words(model, tokenizer, sentence)]


	def tag_sentence(text):
	    """Tag *text* and return the result as a DataFrame.

	    The frame has one row per word with the columns ``word``, ``tag`` and
	    ``probability`` (percentage score of the predicted tag).
	    """
	    _, model, tokenizer = load_model()
	    rows = _tag_words(model, tokenizer, text)
	    return pd.DataFrame(rows, columns=['word', 'tag', 'probability'])


	st.title("📘 Named Entity Recognition Wolof")

	# Input form: the sentence is only processed once the submit button is pressed.
	with st.form(key='my_form'):
	    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
	    submit_button = st.form_submit_button(label='🏷️ Create tags')

	if submit_button:
	    if not x1.strip():
	        st.error('Please enter a non-empty sentence.')
	    elif re.search(r'\w', x1) is None:
	        # Nothing but punctuation/symbols: there is no word to tag.
	        st.error("Please enter a sentence with at least one word")
	    else:
	        st.markdown("### Tagged Sentence")
	        st.header("")

	        results = tag_sentence(x1)
	        # The .csv and .txt downloads share the same CSV payload.
	        csv_bytes = convert_df(results)

	        # Outer columns are spacers that centre the three download buttons.
	        _, c1, c2, c3, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

	        with c1:
	            st.download_button(label="📥 Download .csv", data=csv_bytes,
	                               file_name="results.csv", mime='text/csv', key='csv')
	        with c2:
	            st.download_button(label="📥 Download .txt", data=csv_bytes,
	                               file_name="results.txt", mime='text/plain', key='text')
	        with c3:
	            st.download_button(label="📥 Download .json", data=convert_json(results),
	                               file_name="results.json", mime='application/json', key='json')

	        st.header("")

	        # Centre the results table between two spacer columns.
	        _, table_column, _ = st.columns([1, 3, 1])

	        with table_column:
	            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))

	# Empty headers act as vertical spacing above the "about" panel.
	for _ in range(3):
	    st.header("")

	with st.expander("ℹ️ - About this app", expanded=True):
	    st.write(
	"""
	- The Named Entity Recognition Wolof app is a tool that performs named entity recognition in Wolof.
	- The available entities are: corporation, location, person, and date.
	- The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
	- The model uses a SentencePiece subword tokenizer. Each sentence is first tokenized.
	"""
	    )