# cogumelo-visualizer / visualize.py
import gradio as gr
import datasets
import difflib
import transformers
import torch
import logging
tokenizer = transformers.AutoTokenizer.from_pretrained("google/flan-t5-base")
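# Note: this tokenizer is only used to diff the ground-truth answers token by
# token below; the detector models load their own tokenizers further down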
dataset = (
datasets.load_dataset(
"shroom-semeval25/hallucinated_answer_generated_dataset",
split="test",
)
.take(10000)
.to_pandas()
.sort_values("question")
)
# Show columns in this order: question, correct_answer_generated, hallucinated_answer_generated, everything else
dataset = dataset[
["question", "correct_answer_generated", "hallucinated_answer_generated"]
+ [
col
for col in dataset.columns
if col
not in ["question", "correct_answer_generated", "hallucinated_answer_generated"]
]
]
def show_hallucinations(element):
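    """Diff the correct answer against the hallucinated answer token by token
    and return the raw diff, the original text, and the ground-truth
    hallucination spans in Gradio HighlightedText format."""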
original_text = element["correct_answer_generated"]
hallucinated_text = element["hallucinated_answer_generated"]
# tokenize both texts
original_tokens = tokenizer(
original_text, return_offsets_mapping=True, add_special_tokens=False
)
hallucinated_tokens = tokenizer(
hallucinated_text, return_offsets_mapping=True, add_special_tokens=False
)
    # Diff the two token-id sequences; each opcode is (tag, i1, i2, j1, j2),
    # where i-indices point into the original tokens and j-indices into the
    # hallucinated ones
diff = difflib.SequenceMatcher(
None,
original_tokens["input_ids"],
hallucinated_tokens["input_ids"],
).get_opcodes()
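    # Illustrative example (hypothetical ids): comparing [1, 2, 3] to [1, 4, 3]
    # yields [("equal", 0, 1, 0, 1), ("replace", 1, 2, 1, 2), ("equal", 2, 3, 2, 3)]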
entities = []
    # Each entity follows Gradio's HighlightedText format:
    # {
    #     "entity": "hal",
    #     "start": <char offset>,
    #     "end": <char offset>,
    # }
    for tag, i1, i2, j1, j2 in diff:
        try:
            if tag == "equal":
                continue
            if j1 == j2:
                # Pure deletions leave no span in the hallucinated text to highlight
                continue
            # Any remaining opcode ("replace" or "insert") is a hallucination
            start_char = hallucinated_tokens["offset_mapping"][j1][0]
            # Offsets are end-exclusive; the +1 stretches the span by one extra
            # character (usually the following space) so adjacent spans merge
            end_char = hallucinated_tokens["offset_mapping"][j2 - 1][1] + 1
entity = {
"entity": "hal",
"start": start_char,
"end": end_char,
}
entities.append(entity)
        except IndexError as e:
            # Show a toast without aborting the whole handler (note that
            # gr.Error only surfaces when raised, so gr.Warning is used here)
            gr.Warning(f"There was an error in the tokenization process: {e}")
return [
{
"calculated_diffs": diff,
"tokenized_original": original_tokens,
"tokenized_hallucinated": hallucinated_tokens,
**element.to_dict(),
},
element["correct_answer_generated"],
{
"text": hallucinated_text,
"entities": entities,
},
]
roberta_base_predictor = transformers.AutoModelForTokenClassification.from_pretrained(
"shroom-semeval25/cogumelo-hallucinations-detector-roberta-base"
)
roberta_base_tokenizer = transformers.AutoTokenizer.from_pretrained(
"shroom-semeval25/cogumelo-hallucinations-detector-roberta-base"
)
roberta_large_qa_predictor = (
transformers.AutoModelForTokenClassification.from_pretrained(
"shroom-semeval25/cogumelo-hallucinations-detector-roberta-large-qa-15000"
)
)
flan_t5_qa_predictor = transformers.AutoModelForTokenClassification.from_pretrained(
"shroom-semeval25/cogumelo-hallucinations-detector-flan-t5-xl-qa-v3"
)
flan_t5_qa_tokenizer = transformers.AutoTokenizer.from_pretrained(
"shroom-semeval25/cogumelo-hallucinations-detector-flan-t5-xl-qa-v3"
)
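# Three detectors: RoBERTa base sees the answer alone, while the two QA
# variants classify tokens of (question, answer) pairs. No separate tokenizer
# is loaded for RoBERTa large: base and large share the same vocabulary, so
# roberta_base_tokenizer is reused below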
def trim_to_answer_only(
logits: torch.Tensor, special_tokens_mask: torch.Tensor, offsets: torch.Tensor
):
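    """Keep only the logits and offsets of the answer half of a
    (question, answer) encoding, dropping question and special tokens."""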
token_type_ids = torch.zeros_like(special_tokens_mask)
    # Cumulative count of special tokens seen so far, at each position
    cumsum_special_tokens_mask = special_tokens_mask.cumsum(dim=-1)
    # Restrict it to the non-special positions (this flattens the tensor, but
    # we index consistently with the same mask below, so shapes stay aligned)
    cumsum_special_tokens_mask = cumsum_special_tokens_mask[
        special_tokens_mask == 0
    ]
    # Normalize so the question tokens map to 0: depending on whether the
    # sequence starts with a special token, the non-special cumsums start at 0
    # or 1, so subtract the minimum
    cumsum_special_tokens_mask -= cumsum_special_tokens_mask.min()
    token_type_ids[special_tokens_mask == 0] = cumsum_special_tokens_mask
return logits[token_type_ids != 0], offsets[token_type_ids != 0]
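# Illustrative walk-through of trim_to_answer_only with a RoBERTa-style pair
# encoding <s> q q </s></s> a a </s> (hypothetical values):
#   special_tokens_mask = [1, 0, 0, 1, 1, 0, 0, 1]
#   cumsum              = [1, 1, 1, 2, 3, 3, 3, 4]
#   non-special cumsums = [1, 1, 3, 3]  ->  minus min  ->  [0, 0, 2, 2]
#   token_type_ids      = [0, 0, 0, 0, 0, 2, 2, 0]
# so token_type_ids != 0 selects exactly the answer tokens.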
def mark_hallucinations(logits, hallucinated_text, offsets):
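    """Decode per-token predictions into HighlightedText entities, assuming a
    BIO-style label scheme: 0 = outside, 1 = begins a span, 2 = continues it."""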
    # Pick the highest-scoring label index for each token
    predictions = logits.argmax(dim=-1).squeeze(0).tolist()
entities = []
current_entity = None
for i, prediction in enumerate(predictions):
if prediction == 0:
if current_entity is not None:
entities.append(current_entity)
current_entity = None
continue
if prediction == 1:
if current_entity is not None:
entities.append(current_entity)
            current_entity = {
                "entity": "hal",
                # Cast to plain ints in case offsets arrive as 0-d tensors
                "start": int(offsets[i][0]),
                "end": int(offsets[i][1]) + 1,
            }
if prediction == 2:
if current_entity is None:
                current_entity = {
                    "entity": "hal",
                    "start": int(offsets[i][0]),
                    "end": int(offsets[i][1]) + 1,
                }
            else:
                current_entity["end"] = int(offsets[i][1]) + 1
if current_entity is not None:
entities.append(current_entity)
return {
"text": hallucinated_text,
"entities": entities,
}
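# Illustrative decoding (hypothetical values):
#   predictions = [0, 1, 2, 0, 1]
#   offsets     = [(0, 3), (4, 8), (9, 12), (13, 15), (16, 20)]
# yields two "hal" entities covering characters [4, 13) and [16, 21).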
def update_selection(evt: gr.SelectData):
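    """On dataframe row selection, show the ground-truth hallucination spans
    and run all three detectors on the selected hallucinated answer."""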
selected_row = evt.index[0]
element = dataset.iloc[selected_row]
question = element["question"]
hallucinated_answer = element["hallucinated_answer_generated"]
json_example, original_text, highlighted_text = show_hallucinations(element)
return (
json_example,
original_text,
highlighted_text,
*get_hallucinations(hallucinated_answer, question),
)
def get_hallucinations(hallucinated_answer: str, question: str):
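    """Run the detectors on an answer (plus optional question) and return one
    HighlightedText payload per model; the QA models are skipped when no
    question is given."""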
try:
hallucinated_tokens = roberta_base_tokenizer(
text=hallucinated_answer,
return_offsets_mapping=True,
add_special_tokens=True,
return_tensors="pt",
return_special_tokens_mask=True,
)
with torch.no_grad():
outputs_roberta_base = roberta_base_predictor(
input_ids=hallucinated_tokens.input_ids,
attention_mask=hallucinated_tokens.attention_mask,
)
# Take only the outputs that are NOT special tokens and where the attention mask is 1
logits_roberta_base = outputs_roberta_base.logits[
...,
(hallucinated_tokens.special_tokens_mask == 0)
& (hallucinated_tokens.attention_mask == 1),
:,
]
highlighted_text_predicted_roberta_base = mark_hallucinations(
hallucinated_text=hallucinated_answer,
logits=logits_roberta_base,
            # Drop the leading BOS so offsets align with the logits (which had
            # all special tokens filtered out; the trailing EOS offset is
            # simply never reached during decoding)
            offsets=hallucinated_tokens["offset_mapping"][0][1:],
)
if question:
q_a_tokens = roberta_base_tokenizer(
                # Wrap the pair in a list to make a batch of one; otherwise
                # the tokenizer would treat the second element of the pair as
                # a second batch example rather than as the pair's answer half
text=[(question, hallucinated_answer)],
return_offsets_mapping=True,
add_special_tokens=True,
return_tensors="pt",
return_special_tokens_mask=True,
)
flan_t5_q_a_tokens = flan_t5_qa_tokenizer(
text=[(question, hallucinated_answer)],
return_offsets_mapping=True,
add_special_tokens=True,
return_tensors="pt",
return_special_tokens_mask=True,
)
with torch.no_grad():
logits_roberta_large_qa = roberta_large_qa_predictor(
input_ids=q_a_tokens.input_ids,
attention_mask=q_a_tokens.attention_mask,
).logits
                # Keep only the answer-segment tokens (drop the question and
                # all special tokens)
logits_roberta_large_qa, offsets_roberta_large_qa = trim_to_answer_only(
logits=logits_roberta_large_qa,
special_tokens_mask=q_a_tokens.special_tokens_mask,
offsets=q_a_tokens["offset_mapping"],
)
logits_flan_t5_qa = flan_t5_qa_predictor(
input_ids=flan_t5_q_a_tokens.input_ids,
attention_mask=flan_t5_q_a_tokens.attention_mask,
).logits
logits_flan_t5_qa, offsets_flan_t5_qa = trim_to_answer_only(
logits=logits_flan_t5_qa,
special_tokens_mask=flan_t5_q_a_tokens.special_tokens_mask,
offsets=flan_t5_q_a_tokens["offset_mapping"],
)
highlighted_text_predicted_roberta_large_qa = mark_hallucinations(
hallucinated_text=hallucinated_answer,
logits=logits_roberta_large_qa,
offsets=offsets_roberta_large_qa,
)
highlighted_text_predicted_flan_t5_qa = mark_hallucinations(
hallucinated_text=hallucinated_answer,
logits=logits_flan_t5_qa,
offsets=offsets_flan_t5_qa,
)
else:
highlighted_text_predicted_roberta_large_qa = {"text": "", "entities": []}
highlighted_text_predicted_flan_t5_qa = {"text": "", "entities": []}
    except Exception as e:
        logging.exception(f"An error occurred: {e}")
        # gr.Warning surfaces the message as a toast; a bare gr.Error(...) call
        # is a no-op unless raised, which would skip the empty fallbacks below
        gr.Warning(f"An error occurred: {e}")
        highlighted_text_predicted_roberta_base = {"text": "", "entities": []}
        highlighted_text_predicted_roberta_large_qa = {"text": "", "entities": []}
        highlighted_text_predicted_flan_t5_qa = {"text": "", "entities": []}
return (
highlighted_text_predicted_roberta_base,
highlighted_text_predicted_roberta_large_qa,
highlighted_text_predicted_flan_t5_qa,
)
def predict_hallucinations_manual_input(text: str, question: str = ""):
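    """Thin wrapper so the manual-input event handler can reuse get_hallucinations."""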
return get_hallucinations(text, question)
with gr.Blocks(title="Hallucinations Explorer") as demo:
gr.Markdown(
"""# COGUMELO
_SHROOM '25: Detection of Hallucinated Content_"""
)
with gr.Accordion(label="Manual Input", open=True) as manual_input:
model_question_input = gr.Textbox(
value="",
label="Question (only for RoBERTa Large QA and Flan T5 QA)",
placeholder="Type the question here",
type="text",
)
# A textbox where the user can input any text to try the model
model_manual_input = gr.Textbox(
value="",
label="Try your own text",
placeholder="Type your own text here",
type="text",
)
manual_input_highlighted_text_roberta_base = gr.HighlightedText(
label="Predicted Hallucinations (RoBERTa Base)",
            color_map={"hal": "red"},  # "hal" is the only entity label emitted
combine_adjacent=True,
)
manual_input_highlighted_text_roberta_large_qa = gr.HighlightedText(
label="Predicted Hallucinations (RoBERTa Large QA)",
            color_map={"hal": "red"},
combine_adjacent=True,
)
manual_input_highlighted_text_flan_t5_qa = gr.HighlightedText(
label="Predicted Hallucinations (Flan T5 QA)",
            color_map={"hal": "red"},
combine_adjacent=True,
)
model_manual_input.change(
predict_hallucinations_manual_input,
inputs=[model_manual_input, model_question_input],
outputs=[
manual_input_highlighted_text_roberta_base,
manual_input_highlighted_text_roberta_large_qa,
manual_input_highlighted_text_flan_t5_qa,
],
)
    # Re-running on question edits is disabled to avoid duplicate inference
    # while typing; to enable it, wire the same handler to the question box:
    # model_question_input.change(
    #     predict_hallucinations_manual_input,
    #     inputs=[model_manual_input, model_question_input],
    #     outputs=[
    #         manual_input_highlighted_text_roberta_base,
    #         manual_input_highlighted_text_roberta_large_qa,
    #         manual_input_highlighted_text_flan_t5_qa,
    #     ],
    # )
gr.Markdown(
"""# Dataset
    ⚠️ These rows come from the **test split** of the dataset, not the entire dataset (the models have therefore not seen them during training)
"""
)
    # A selectable dataframe with the dataset rows
    df = gr.Dataframe(dataset)
original_text = gr.Textbox(label="Original Text", interactive=False)
highlighted_text = gr.HighlightedText(
label="Real Hallucinations (ground truth)",
        color_map={"hal": "red"},
combine_adjacent=True,
)
highlighted_text_predicted_roberta_base = gr.HighlightedText(
label="Predicted Hallucinations (RoBERTa Base)",
        color_map={"hal": "red"},
combine_adjacent=True,
)
highlighted_text_predicted_roberta_large_qa = gr.HighlightedText(
label="Predicted Hallucinations (RoBERTa Large QA)",
        color_map={"hal": "red"},
combine_adjacent=True,
)
highlighted_text_predicted_flan_t5_qa = gr.HighlightedText(
label="Predicted Hallucinations (Flan T5 QA)",
        color_map={"hal": "red"},
combine_adjacent=True,
)
    json_example = gr.JSON(label="Selected example (raw diff and tokenizations)")
df.select(
update_selection,
inputs=[],
outputs=[
json_example,
original_text,
highlighted_text,
highlighted_text_predicted_roberta_base,
highlighted_text_predicted_roberta_large_qa,
highlighted_text_predicted_flan_t5_qa,
],
)
demo.launch(show_error=True)