H2H-eval-comparator

Sleeping

App Files Files Community

H2H-eval-comparator / app.py

rohansampath

Update app.py with a basic demonstration of loading Llama-3.1-instruct and running a simple eval on some Math

3195f7f verified 9 months ago

raw

history blame

4.78 kB

	import gradio as gr
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import evaluate
	import re
	import matplotlib
	matplotlib.use('Agg') # for non-interactive envs
	import matplotlib.pyplot as plt
	import io
	import base64

	# ---------------------------------------------------------------------------
	# 1. Define model name and load model/tokenizer
	# ---------------------------------------------------------------------------
	# Hub identifier used for both the tokenizer and the model so the two
	# always come from the same checkpoint.
	model_name = "meta-llama/Llama-3.2-1B-Instruct"

	# Both objects are created once at import time and reused by every request.
	tokenizer = AutoTokenizer.from_pretrained(
	    pretrained_model_name_or_path=model_name,
	)
	model = AutoModelForCausalLM.from_pretrained(
	    pretrained_model_name_or_path=model_name,
	)

	# ---------------------------------------------------------------------------
	# 2. Define a tiny "dataset" for demonstration
	# In reality, you'll load a real dataset from HF or custom code.
	# ---------------------------------------------------------------------------
	# Each entry pairs a question with the exact answer string expected back.
	test_data = [
	    {"question": prompt_text, "answer": expected}
	    for prompt_text, expected in (
	        ("What is 2+2?", "4"),
	        ("What is 3*3?", "9"),
	        ("What is 10/2?", "5"),
	    )
	]

	# ---------------------------------------------------------------------------
	# 3. Load a metric (accuracy) from Hugging Face evaluate library
	#    The metric object is created once at import time; run_evaluation()
	#    calls accuracy_metric.compute(...) and reads the "accuracy" key of the
	#    returned dict.
	# ---------------------------------------------------------------------------
	accuracy_metric = evaluate.load("accuracy")

	# ---------------------------------------------------------------------------
	# 4. Inference helper functions
	# ---------------------------------------------------------------------------
	def generate_answer(question):
	    """
	    Generate the model's answer to ``question``.

	    The question is wrapped in a minimal "Question: ... / Answer:" prompt and
	    decoded greedily for at most 30 new tokens.

	    Args:
	        question: The question text to put in the prompt.

	    Returns:
	        The decoded completion only. The prompt is stripped so that
	        downstream parsing cannot pick up numbers from the question itself.
	    """
	    # Simple prompt
	    prompt = f"Question: {question}\nAnswer:"

	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	    # generate() returns prompt tokens followed by the new tokens for
	    # decoder-only models, so remember where the prompt ends.
	    prompt_length = inputs["input_ids"].shape[1]

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=30,
	            # Greedy decoding is requested with do_sample=False. A
	            # temperature of 0.0 is rejected when sampling is enabled and
	            # ignored otherwise, so it is not a reliable way to do this.
	            do_sample=False,
	        )

	    completion_ids = outputs[0][prompt_length:]
	    return tokenizer.decode(completion_ids, skip_special_tokens=True)

	def parse_answer(model_output):
	    """
	    Pull a short answer string out of raw model text.

	    Returns the first run of decimal digits found in ``model_output``. When
	    the text contains no digits at all, the whitespace-stripped text is
	    returned unchanged so the caller still gets something to compare.
	    """
	    found = re.search(r"(\d+)", model_output)
	    if found is None:
	        # No number anywhere: hand back the cleaned-up text as-is.
	        return model_output.strip()
	    return found.group(1)

	# ---------------------------------------------------------------------------
	# 5. Evaluation routine
	# ---------------------------------------------------------------------------
	def run_evaluation():
	    """
	    Run the model over ``test_data`` and summarise how many answers match.

	    Each question is answered with ``generate_answer``, reduced to a short
	    answer with ``parse_answer``, normalised, and compared with the
	    normalised reference answer.

	    Returns:
	        A ``(summary, plot_html)`` tuple. ``summary`` is a string of the form
	        "Accuracy: 0.67". ``plot_html`` is an ``<img>`` tag embedding a
	        correct-vs-incorrect bar chart as a base64 PNG, suitable for an HTML
	        output component.
	    """
	    def normalize_answer(ans):
	        # Case-insensitive comparison that ignores surrounding whitespace.
	        return ans.lower().strip()

	    norm_preds = []
	    norm_refs = []
	    for sample in test_data:
	        model_output = generate_answer(sample["question"])
	        norm_preds.append(normalize_answer(parse_answer(model_output)))
	        norm_refs.append(normalize_answer(sample["answer"]))

	    # The accuracy metric works on integer labels, while predictions may be
	    # arbitrary text (parse_answer falls back to the raw output). Score each
	    # sample as 1/0 first and compare against an all-ones reference.
	    hits = [int(pred == ref) for pred, ref in zip(norm_preds, norm_refs)]
	    total = len(hits)
	    correct_count = sum(hits)
	    incorrect_count = total - correct_count

	    if total:
	        results = accuracy_metric.compute(
	            predictions=hits,
	            references=[1] * total,
	        )
	        accuracy = results["accuracy"]
	    else:
	        # Nothing to evaluate: report zero rather than calling the metric
	        # with empty inputs.
	        accuracy = 0.0

	    # Create a simple bar chart: correct vs. incorrect
	    fig, ax = plt.subplots()
	    try:
	        ax.bar(
	            ["Correct", "Incorrect"],
	            [correct_count, incorrect_count],
	            color=["green", "red"],
	        )
	        ax.set_title("Evaluation Results")
	        ax.set_ylabel("Count")
	        # Keep a non-degenerate axis even when there are no samples.
	        ax.set_ylim([0, max(total, 1)])

	        # Save this specific figure (not whichever figure is "current").
	        buf = io.BytesIO()
	        fig.savefig(buf, format="png")
	    finally:
	        # Always release the figure so repeated runs do not accumulate them.
	        plt.close(fig)

	    data = base64.b64encode(buf.getvalue()).decode("utf-8")
	    image_url = f"data:image/png;base64,{data}"
	    # An HTML component renders markup, so the data URL must be wrapped in
	    # an <img> tag; a bare URL would be shown as text.
	    plot_html = f'<img src="{image_url}" alt="Evaluation Results">'

	    # Return text and the plot
	    return f"Accuracy: {accuracy:.2f}", plot_html

	# ---------------------------------------------------------------------------
	# 6. Gradio App
	# ---------------------------------------------------------------------------
	with gr.Blocks() as demo:
	    # Page heading.
	    gr.Markdown(value="# Simple Math Evaluation with 'Llama 3.2'")

	    # One trigger button and the two places its results are shown.
	    eval_button = gr.Button(value="Run Evaluation")
	    output_text = gr.Textbox(label="Results")
	    output_plot = gr.HTML(label="Plot")

	    # run_evaluation takes no inputs and returns (text, plot), in that order.
	    result_components = [output_text, output_plot]
	    eval_button.click(fn=run_evaluation, inputs=None, outputs=result_components)

	demo.launch()