Spaces:

davanstrien
/

doab-title-extraction-eval

Sleeping

App Files Files Community

doab-title-extraction-eval / app.py

davanstrien HF Staff

Upload app.py with huggingface_hub

8f9e935 verified 6 months ago

raw

history blame contribute delete

9.62 kB

	import marimo

	# NOTE(review): presumably the marimo version that generated this notebook file - confirm.
	__generated_with = "0.10.9"
	# Notebook application object; each cell below registers itself on it via @app.cell.
	app = marimo.App(width="medium")


	@app.cell
	def _():
	    # Bind marimo to its short alias and hand it to the cells that take `mo`.
	    import marimo as mo

	    return (mo,)


	@app.cell
	def _(mo):
	    # Intro cell: states the task, the question and the headline answer, then
	    # introduces the two approaches that are compared in the results tabs.
	    # Table rows use plain `|` delimiters; a backslash-escaped pipe is an
	    # invalid Python escape sequence and renders as a literal pipe, not a table.
	    mo.md(
	        """
	        # VLM vs Text: Extracting Metadata from Book Covers

	        The Task: Libraries and archives have millions of digitized book covers where metadata is incomplete or missing. Can we use AI to automatically extract titles and other metadata?

	        The Question: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract text first and send it to a standard LLM?

	        The Answer: VLMs win decisively for this task.

	        ---

	        This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:

	        | Approach | How it works |
	        |----------|-------------|
	        | VLM | Send the cover image directly to a Vision-Language Model |
	        | Text | Extract text from image first (OCR), then send to an LLM |

	        ---

	        ## Evaluation Results

	        Select a task below to see how different models performed:
	        """
	    )
	    return


	@app.cell
	def _():
	    # Third-party dependencies, imported once here and passed to the cells
	    # that declare them as parameters.
	    import altair as alt
	    import pandas as pd
	    from inspect_ai.analysis import evals_df

	    return alt, evals_df, pd


	@app.cell
	def _(evals_df, mo):
	    # Load the evaluation results inside a persistent cache block named
	    # "doab_evals".
	    with mo.persistent_cache(name="doab_evals"):
	        df_raw = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True)

	    # Derived columns: the approach is inferred from the task name, and the
	    # short model name is the last path segment of the full model id.
	    df_raw["approach"] = df_raw["task_name"].apply(lambda x: "VLM" if "vlm" in x else "Text")
	    df_raw["model_short"] = df_raw["model"].apply(lambda x: x.split("/")[-1])

	    def get_task_category(task_name):
	        """Return the results-tab label for an evaluation task name.

	        Task names containing "llm_judge" are reported under "Full Metadata";
	        every other task is reported under "Title Extraction".
	        """
	        return "Full Metadata" if "llm_judge" in task_name else "Title Extraction"

	    df_raw["task_category"] = df_raw["task_name"].apply(get_task_category)

	    # Convert score to percentage
	    df_raw["accuracy"] = df_raw["score_headline_value"] * 100

	    # Parameter sizes (billions) and Hub URLs, keyed by the full model id.
	    # Each URL points at the repository named in its key, so leaderboard
	    # links open the model that was actually evaluated.
	    model_info = {
	        "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
	            "params": 8,
	            "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
	        },
	        "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
	            "params": 30,
	            "url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking",
	        },
	        "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
	            "params": 9,
	            "url": "https://huggingface.co/zai-org/GLM-4.6V-Flash",
	        },
	        "hf-inference-providers/openai/gpt-oss-20b": {
	            "params": 20,
	            "url": "https://huggingface.co/openai/gpt-oss-20b",
	        },
	        "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
	            "params": 4,
	            "url": "https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507",
	        },
	        "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
	            "params": 7,
	            "url": "https://huggingface.co/allenai/Olmo-3-7B-Instruct",
	        },
	    }
	    # Models missing from model_info get no parameter size and an empty URL.
	    df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
	    df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))

	    df_raw
	    return df_raw, get_task_category, model_info


	@app.cell
	def _(alt, df_raw, mo):
	    def make_task_content(task_name):
	        """Generate the complete results view for a task.

	        Args:
	            task_name: Value of the ``task_category`` column to report on
	                ("Title Extraction" or "Full Metadata").

	        Returns:
	            A ``mo.vstack`` containing the summary table, the model-size vs
	            accuracy scatter plot, and the model leaderboard.
	        """
	        df = df_raw[df_raw["task_category"] == task_name].copy()

	        # Calculate summary stats
	        vlm_avg = df.loc[df["approach"] == "VLM", "accuracy"].mean()
	        text_avg = df.loc[df["approach"] == "Text", "accuracy"].mean()
	        diff = vlm_avg - text_avg

	        task_desc = "book titles" if task_name == "Title Extraction" else "full metadata (title, subtitle, publisher, year, ISBN)"

	        # Word the verdict from the sign of the gap instead of always claiming
	        # a VLM win. A NaN gap (one approach has no rows) fails both
	        # comparisons and falls through to the neutral sentence.
	        if diff > 0:
	            verdict = f"VLMs {'significantly ' if diff > 15 else ''}outperform text extraction for extracting {task_desc}."
	        elif diff < 0:
	            verdict = f"Text extraction outperforms VLMs for extracting {task_desc}."
	        else:
	            verdict = f"No clear winner between VLMs and text extraction for extracting {task_desc}."

	        # Results summary. The `+` format flag prints the sign itself, so a
	        # negative gap reads "-5" rather than "+-5".
	        results_md = mo.md(
	            f"""
	            ### Summary

	            | Approach | Average Accuracy |
	            |----------|-----------------|
	            | VLM (Vision) | {vlm_avg:.0f}% |
	            | Text Extraction | {text_avg:.0f}% |

	            VLM advantage: {diff:+.0f} percentage points

	            {verdict}
	            """
	        )

	        # Scatter plot
	        # NOTE(review): the fixed 50-105 y-domain assumes every model scores at
	        # least 50% - confirm for the Full Metadata task.
	        chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
	            x=alt.X("param_size_b:Q", title="Parameters (Billions)", scale=alt.Scale(zero=False)),
	            y=alt.Y("accuracy:Q", title="Accuracy (%)", scale=alt.Scale(domain=[50, 105])),
	            color=alt.Color("approach:N", title="Approach", scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"])),
	            tooltip=[
	                alt.Tooltip("model_short:N", title="Model"),
	                alt.Tooltip("approach:N", title="Approach"),
	                alt.Tooltip("param_size_b:Q", title="Params (B)"),
	                alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
	            ],
	        ).properties(
	            width=500,
	            height=300,
	            title="Model Size vs Accuracy",
	        ).configure_axis(
	            labelFontSize=12,
	            titleFontSize=14,
	        )

	        # Leaderboard: one Markdown table row per model, best accuracy first.
	        def leaderboard_row(row):
	            # Link the model name only when a URL is known for it.
	            model_link = f"[{row['model_short']}]({row['model_url']})" if row['model_url'] else row['model_short']
	            return f"| {model_link} | {row['approach']} | {row['param_size_b']} | {row['accuracy']:.1f} |\n"

	        leaderboard_header = (
	            "### Model Leaderboard\n\n"
	            "| Model | Approach | Params (B) | Accuracy (%) |\n"
	            "|-------|----------|------------|-------------|\n"
	        )
	        ranked = df.sort_values("accuracy", ascending=False)
	        leaderboard_md = leaderboard_header + "".join(
	            leaderboard_row(row) for _, row in ranked.iterrows()
	        )

	        return mo.vstack([
	            results_md,
	            mo.md("### Model Size vs Accuracy"),
	            mo.as_html(chart),
	            mo.md("Hover over points to see model details"),
	            mo.md(leaderboard_md),
	        ])

	    # Create tabs
	    tabs = mo.ui.tabs({
	        "📄 Title Extraction": make_task_content("Title Extraction"),
	        "📚 Full Metadata": make_task_content("Full Metadata"),
	    })

	    tabs
	    return make_task_content, tabs


	@app.cell
	def _(mo):
	    # Explanatory cell: why image input beats OCR-then-LLM on book covers.
	    mo.md(
	        """
	        ---

	        ## Why VLMs Win

	        Book covers are visually structured documents:

	        - Spatial layout: Titles appear in specific locations (usually top/center)
	        - Typography: Larger text = more important (likely the title)
	        - Visual hierarchy: Authors, publishers, and other info have distinct styling

	        When you extract text first (OCR), you flatten this structure into a linear sequence. The model loses the visual cues that make it obvious what's a title vs. a subtitle vs. author name.

	        Interesting finding: Qwen3-VL-8B achieves 94% even when used as a text-only model, suggesting it has strong general text understanding - but it still does better (98%) when given the actual images.
	        """
	    )
	    return


	@app.cell
	def _(mo):
	    # Dataset description cell: names the source dataset and lists what each
	    # sample contains.
	    mo.md(
	        """
	        ## The Dataset

	        We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset - academic book covers from the Directory of Open Access Books.

	        Each sample has:
	        - Cover image (rendered from PDF)
	        - Pre-extracted page text
	        - Ground truth metadata (title, subtitle, publisher, year, ISBN)
	        """
	    )
	    return


	@app.cell
	def _(mo):
	    # Embed the Hugging Face dataset viewer for the train split in an iframe.
	    mo.Html(
	        """
	        <iframe
	        src="https://huggingface.co/datasets/biglam/doab-metadata-extraction/embed/viewer/default/train"
	        frameborder="0"
	        width="100%"
	        height="400px"
	        ></iframe>
	        """
	    )
	    return


	@app.cell
	def _(mo):
	    # Methodology cell: evaluation framework, sample size, scoring rules and
	    # how to reload the logs. The details of each scorer are nested under the
	    # scorer's own bullet, and the footer uses plain `|` separators because a
	    # backslash-escaped pipe is an invalid Python escape sequence.
	    mo.md(
	        """
	        ## Methodology

	        Evaluation Framework: [Inspect AI](https://inspect.aisi.org.uk/) - an open-source framework for evaluating language models

	        Sample Size: 50 books (randomly sampled with fixed seed for reproducibility)

	        Scoring Methods:

	        - Title Extraction: Custom flexible matching scorer
	            - Case-insensitive comparison
	            - Accepts if ground truth is substring of prediction (handles subtitles)
	            - More robust than exact match for this task
	        - Full Metadata: LLM-as-judge with partial credit
	            - Correct (1.0): Title + year + at least one other field
	            - Partial (0.5): Some fields correct
	            - Incorrect (0.0): Mostly wrong

	        Models via: [HuggingFace Inference Providers](https://huggingface.co/docs/inference-providers)

	        ---

	        ## Replicate This

	        The evaluation logs are stored on HuggingFace and can be loaded directly:

	        ```python
	        from inspect_ai.analysis import evals_df

	        df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
	        ```

	        ---

	        Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)
	        """
	    )
	    return


	# Run the notebook app only when this file is executed as a script.
	if __name__ == "__main__":
	    app.run()