In [None]:
import os
import sys
import subprocess
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch
import optuna

# Prefer the GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Menggunakan perangkat: {device}")

# Load the "nergrit" configuration of the IndoNLU dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally -- confirm this is acceptable.
try:
    dataset = load_dataset(
        path="indonlp/indonlu",
        name="nergrit",
        trust_remote_code=True,
    )
except Exception as load_error:
    print(f"Gagal memuat dataset: {load_error}")
    sys.exit(1)

# Abort early when an expected split or column is missing.
required_splits = ("train", "validation", "test")
if any(split not in dataset for split in required_splits):
    print("Dataset tidak memiliki split train/validation/test yang diharapkan.")
    sys.exit(1)

train_columns = dataset["train"].column_names
if any(column not in train_columns for column in ("tokens", "ner_tags")):
    print("Dataset tidak memiliki kolom 'tokens' atau 'ner_tags'.")
    sys.exit(1)

# Build the tag-name list and the two lookup tables (index <-> tag name).
try:
    label_list = dataset["train"].features["ner_tags"].feature.names
    id2label = dict(enumerate(label_list))
    label2id = {tag_name: tag_index for tag_index, tag_name in id2label.items()}
except Exception as label_error:
    print(f"Gagal mendapatkan label: {label_error}")
    sys.exit(1)

# Tokenizer that matches the checkpoint fine-tuned further down.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        "indobenchmark/indobert-base-p1"
    )
except Exception as tokenizer_error:
    print(f"Gagal memuat tokenizer: {tokenizer_error}")
    sys.exit(1)

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and attach token-level label ids.

    Each sentence in ``examples["tokens"]`` is tokenized with the module-level
    ``tokenizer``. For every produced token, the label is the word's tag when
    the token is the first piece of that word, and -100 otherwise (tokens with
    no word id, and continuation pieces of a word). -100 is the value that
    ``compute_metrics`` in this file filters out.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word tag-id lists).

    Returns:
        The tokenizer output with an added "labels" entry, one list of label
        ids per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    def align(word_ids, word_tags):
        # Pair every token's word id with the word id of the token before it
        # (None for the first token) to detect the first piece of each word.
        word_ids = list(word_ids)
        shifted = [None] + word_ids[:-1]
        return [
            -100 if current is None or current == before else word_tags[current]
            for before, current in zip(shifted, word_ids)
        ]

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run tokenization and label alignment over the dataset in batched mode.
try:
    tokenized_dataset = dataset.map(
        function=tokenize_and_align_labels,
        batched=True,
    )
except Exception as map_error:
    print(f"Gagal menokenisasi dataset: {map_error}")
    sys.exit(1)

# Collator built from the tokenizer; it is passed to the Trainer instances
# created later in this script.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# seqeval metric object used by compute_metrics.
metric = evaluate.load("seqeval")

# Compute metrics
def compute_metrics(p):
    """Compute overall and per-entity seqeval scores for one evaluation pass.

    Positions whose reference label is -100 are dropped from both the
    references and the predictions before scoring.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds
            the reference label ids, with -100 marking ignored positions.

    Returns:
        Dict with "precision", "recall", "f1", "accuracy" (the seqeval
        ``overall_*`` values) and "per_entity", a dict mapping each entity
        type reported by seqeval to its own precision/recall/f1.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_labels = []
    pred_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_labels.append([id2label[gold] for _, gold in kept])
        pred_labels.append([id2label[predicted] for predicted, _ in kept])

    results = metric.compute(predictions=pred_labels, references=true_labels)

    # seqeval reports one nested dict per entity type, keyed by the type name
    # exactly as it appears in the tags (e.g. "PERSON"); the overall_* entries
    # are plain numbers. The previous lookup lower-cased a hard-coded list of
    # names, which never matched, so "per_entity" was always empty.
    per_entity = {
        entity: {
            "precision": float(scores["precision"]),
            "recall": float(scores["recall"]),
            "f1": float(scores["f1"]),
        }
        for entity, scores in results.items()
        if isinstance(scores, dict)
    }

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "per_entity": per_entity,
    }

# Define objective function for Optuna
def objective(trial):
    """Optuna objective: fine-tune the checkpoint once and return validation F1.

    Samples a learning rate, batch size and epoch count from ``trial``, loads
    a fresh "indobenchmark/indobert-base-p1" token-classification model,
    trains it on the "train" split and evaluates it on the "validation" split.

    Args:
        trial: Optuna trial used to sample hyperparameters; its number also
            names the per-trial output and logging directories.

    Returns:
        The "eval_f1" value reported by ``trainer.evaluate()`` after training.
    """
    # Define hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 3, 5)

    # Load model for each trial so no weights carry over between trials
    model = AutoModelForTokenClassification.from_pretrained(
        "indobenchmark/indobert-base-p1",
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    model.to(device)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_trial_{trial.number}",
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir=f"./logs_trial_{trial.number}",
        logging_steps=10,
        save_strategy="epoch",
        # Without a limit every epoch of every trial leaves a full checkpoint
        # on disk. The best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # Initialize Trainer
    # NOTE(review): `tokenizer=` appears to be deprecated in recent
    # transformers releases in favour of `processing_class=` -- confirm
    # against the installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train and evaluate on the validation split
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_f1"]

# Hyperparameter search: Optuna maximizes the value returned by objective.
print("Memulai hyperparameter tuning dengan Optuna...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)  # Adjust n_trials as needed

# Report the parameters and score of the best trial.
best_trial = study.best_trial
print("\nHyperparameter terbaik:")
print(best_trial.params)
print(f"F1-Score terbaik: {best_trial.value:.4f}")

# Train final model with best hyperparameters
# NOTE(review): this repeats the best trial's configuration from scratch; the
# captured run shows the same per-epoch metrics as that trial, so reusing the
# trial's checkpoint may be an option -- confirm before changing.
best_params = study.best_params
model = AutoModelForTokenClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["num_epochs"],
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    # Cap retained checkpoints so ./results does not keep one per epoch; the
    # best checkpoint is still retained for load_best_model_at_end.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# NOTE(review): `tokenizer=` appears to be deprecated in recent transformers
# releases in favour of `processing_class=` -- confirm against the installed
# version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model; any failure is reported and ends the script.
print("\nMemulai pelatihan dengan hyperparameter terbaik...")
try:
    trainer.train()
except Exception as e:
    print(f"Gagal melatih model: {e}")
    sys.exit(1)

# Score the trained model on the test split.
print("\nMengevaluasi model pada data test...")
try:
    results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
except Exception as eval_error:
    print(f"Gagal mengevaluasi model: {eval_error}")
    sys.exit(1)

# Overall scores, printed in a fixed order with four decimals.
print("\nHasil Evaluasi:")
for title, result_key in (
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
    ("F1-Score", "eval_f1"),
    ("Accuracy", "eval_accuracy"),
):
    print(f"{title}: {results[result_key]:.4f}")

# Per-entity scores; nothing is printed when the entry is missing or empty.
print("\nMetrik per Entitas:")
per_entity_scores = results.get("eval_per_entity", {})
for entity_name in per_entity_scores:
    entity_scores = per_entity_scores[entity_name]
    print(f"{entity_name}:")
    print(f"  Precision: {entity_scores['precision']:.4f}")
    print(f"  Recall: {entity_scores['recall']:.4f}")
    print(f"  F1-Score: {entity_scores['f1']:.4f}")

# Persist the model first, then the tokenizer, into the same directory.
try:
    for artifact in (model, tokenizer):
        artifact.save_pretrained("./ner_model")
    print("\nModel dan tokenizer telah disimpan ke './ner_model'")
except Exception as save_error:
    print(f"Gagal menyimpan model: {save_error}")
    sys.exit(1)

# Show predictions next to the reference tags for the first test samples
# (at most five).
print("\nContoh Prediksi pada Data Test (5 Sampel):")
try:
    test_split = tokenized_dataset["test"]
    for position in range(min(5, len(test_split))):
        sample = test_split[position]

        # Wrap the single sample in a batch of size one on the target device.
        input_ids = torch.tensor([sample["input_ids"]], device=device)
        attention_mask = torch.tensor([sample["attention_mask"]], device=device)

        model.eval()
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits
        predictions = logits.argmax(dim=2)[0].cpu().numpy()

        tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])

        # Only positions with a real reference label (not -100) are reported.
        scored = [
            (gold, predicted)
            for predicted, gold in zip(predictions, sample["labels"])
            if gold != -100
        ]
        labels = [id2label[predicted] for _, predicted in scored]
        true_labels = [id2label[gold] for gold, _ in scored]

        print(f"\nSampel {position + 1}:")
        print(f"Tokens: {' '.join(tokens)}")
        print(f"True Labels: {true_labels}")
        print(f"Predicted Labels: {labels}")
except Exception as inference_error:
    print(f"Gagal melakukan inferensi: {inference_error}")
    sys.exit(1)

# Analyze error patterns (DATE predicted as a location), scanning at most the
# first 100 test samples and printing the first sample that shows the pattern.
print("\nAnalisis Pola Error (Tanggal diprediksi sebagai Lokasi):")

# Locations are tagged "PLACE" in this label set (the model predicts
# 'B-PLACE'), so a check for "B-LOC" alone can never match. Both prefixes are
# accepted to stay compatible with LOC-style tag sets.
# NOTE(review): confirm the label set actually contains a DATE tag; if it does
# not, this scan cannot find anything.
location_prefixes = ("B-LOC", "B-PLACE")

found_error = False
model.eval()  # do not rely on an earlier block having set eval mode
for i in range(min(100, len(tokenized_dataset["test"]))):
    sample = tokenized_dataset["test"][i]
    input_ids = torch.tensor([sample["input_ids"]], device=device)
    attention_mask = torch.tensor([sample["attention_mask"]], device=device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()

    # Compare only positions that carry a real reference label (not -100).
    true_labels = [id2label[label] for label in sample["labels"] if label != -100]
    pred_labels = [id2label[pred] for pred, label in zip(predictions, sample["labels"]) if label != -100]

    has_date_as_location = any(
        gold_tag.startswith("B-DATE") and predicted_tag.startswith(location_prefixes)
        for gold_tag, predicted_tag in zip(true_labels, pred_labels)
    )
    if has_date_as_location:
        tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])
        print(f"\nSampel dengan Error (DATE diprediksi sebagai LOC):")
        print(f"Tokens: {' '.join(tokens)}")
        print(f"True Labels: {true_labels}")
        print(f"Predicted Labels: {pred_labels}")
        found_error = True
        break
if not found_error:
    print("Tidak ditemukan contoh tanggal yang diprediksi sebagai lokasi dalam 100 sampel.")

# Data security, privacy, and ethics notes: a header followed by one printed
# line per consideration.
print("\nPertimbangan Keamanan Data, Privasi, dan Etika:")
for consideration in (
    "- Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.",
    "- Nama orang dalam dataset berasal dari media publik, aman untuk digunakan.",
    "- Dataset mencakup berbagai topik berita, mengurangi risiko bias terhadap entitas tertentu.",
):
    print(consideration)


Menggunakan perangkat: cuda


[I 2025-07-18 06:26:20,055] A new study created in memory with name: no-name-50af0249-7af4-476f-988c-7342adeab58c


Memulai hyperparameter tuning dengan Optuna...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1247,0.166868,0.748068,0.731118,0.739496,0.945582,{}
2,0.1038,0.157893,0.750355,0.799094,0.773958,0.952456,{}
3,0.0961,0.171932,0.800613,0.78852,0.794521,0.955606,{}
4,0.0328,0.178615,0.750704,0.805136,0.776968,0.954031,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt

Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2025-07-18 06:29:29,091] Trial 0 finished with value: 0.7945205479452055 and parameters: {'learning_rate': 2.3555847899573657e-05, 'batch_size': 8, 'num_epochs': 4}. Best is trial 0 with value: 0.7945205479452055.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1235,0.163488,0.728788,0.726586,0.727685,0.945009,{}
2,0.1088,0.155614,0.737346,0.814199,0.773869,0.953745,{}
3,0.1103,0.17047,0.763314,0.779456,0.7713,0.953172,{}
4,0.0458,0.182373,0.765557,0.799094,0.781966,0.954031,{}
5,0.0224,0.191159,0.758571,0.802115,0.779736,0.953315,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt

Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2025-07-18 06:33:40,086] Trial 1 finished with value: 0.7819660014781965 and parameters: {'learning_rate': 1.7904807706862636e-05, 'batch_size': 8, 'num_epochs': 5}. Best is trial 0 with value: 0.7945205479452055.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1386,0.18555,0.738769,0.670695,0.703088,0.942432,{}
2,0.1098,0.154619,0.781899,0.796073,0.788922,0.955463,{}
3,0.0698,0.155078,0.80775,0.818731,0.813203,0.960332,{}
4,0.0272,0.174292,0.765292,0.812689,0.788278,0.954747,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt

Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2025-07-18 06:39:32,835] Trial 2 finished with value: 0.8132033008252062 and parameters: {'learning_rate': 3.672145523121866e-05, 'batch_size': 16, 'num_epochs': 4}. Best is trial 2 with value: 0.8132033008252062.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1432,0.17097,0.745514,0.690332,0.716863,0.945869,{}
2,0.1073,0.154406,0.766141,0.806647,0.785872,0.953029,{}
3,0.0751,0.158503,0.79542,0.787009,0.791192,0.956895,{}
4,0.0258,0.179348,0.764791,0.800604,0.782288,0.954461,{}
5,0.0134,0.185257,0.766049,0.811178,0.787968,0.953888,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt

Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2025-07-18 06:47:22,280] Trial 3 finished with value: 0.7911921032649962 and parameters: {'learning_rate': 3.713773945286763e-05, 'batch_size': 16, 'num_epochs': 5}. Best is trial 2 with value: 0.8132033008252062.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1327,0.169205,0.715361,0.717523,0.71644,0.944007,{}
2,0.12,0.15539,0.7507,0.809668,0.77907,0.953458,{}
3,0.1366,0.163555,0.761974,0.793051,0.777202,0.954174,{}
4,0.0679,0.172124,0.766476,0.808157,0.786765,0.953888,{}
5,0.0352,0.180249,0.759943,0.808157,0.783309,0.953745,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt

Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2025-07-18 06:51:59,633] Trial 4 finished with value: 0.7867647058823529 and parameters: {'learning_rate': 1.1923156920458335e-05, 'batch_size': 8, 'num_epochs': 5}. Best is trial 2 with value: 0.8132033008252062.



Hyperparameter terbaik:
{'learning_rate': 3.672145523121866e-05, 'batch_size': 16, 'num_epochs': 4}
F1-Score terbaik: 0.8132


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Memulai pelatihan dengan hyperparameter terbaik...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Per Entity
1,0.1386,0.18555,0.738769,0.670695,0.703088,0.942432,{}
2,0.1098,0.154619,0.781899,0.796073,0.788922,0.955463,{}
3,0.0698,0.155078,0.80775,0.818731,0.813203,0.960332,{}
4,0.0272,0.174292,0.765292,0.812689,0.788278,0.954747,{}


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempt


Mengevaluasi model pada data test...


Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval_per_entity" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "{}" of type <class 'dict'> for key "eval/per_entity" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.



Hasil Evaluasi:
Precision: 0.7528
Recall: 0.7878
F1-Score: 0.7699
Accuracy: 0.9497

Metrik per Entitas:

Model dan tokenizer telah disimpan ke './ner_model'

Contoh Prediksi pada Data Test (5 Sampel):

Sampel 1:
Tokens: [CLS] joe ##tat ##a hadi ##hard ##aja dan dihadiri oleh rektor undip prof . [SEP]
True Labels: ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'O', 'O']
Predicted Labels: ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-PLACE', 'O', 'O']

Sampel 2:
Tokens: [CLS] sejak masih duduk di bangku sekolah tk kevin sudah belajar alat musik piano secara formal dan ketika ia menginjak sekolah smp pemilik nama asli kevin april ##io sum ##aat ##maj ##a ini , mulai belajar menulis lagu sendiri . [SEP]
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', '