import sys

import gradio as gr
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Select the GPU if available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the IndoNLU NERGrit dataset (used for the label names and the
# preprocessing step below).
try:
    dataset = load_dataset("indonlp/indonlu", "nergrit", trust_remote_code=True)
except Exception as e:
    print(f"Failed to load the dataset: {e}")
    sys.exit(1)

# Basic structural checks before relying on the dataset.
if "train" not in dataset or "test" not in dataset:
    print("The dataset does not have the expected train/test splits.")
    sys.exit(1)

if "tokens" not in dataset["train"].column_names or "ner_tags" not in dataset["train"].column_names:
    print("The dataset is missing the 'tokens' or 'ner_tags' column.")
    sys.exit(1)

# Build the label <-> id mappings from the dataset's ClassLabel feature.
try:
    label_list = dataset["train"].features["ner_tags"].feature.names
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}
except Exception as e:
    print(f"Failed to read the labels: {e}")
    sys.exit(1)

# Load the fine-tuned tokenizer and model from the local './ner_model' directory.
try:
    tokenizer = AutoTokenizer.from_pretrained("./ner_model")
    model = AutoModelForTokenClassification.from_pretrained(
        "./ner_model",
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )
    model.to(device)
except Exception as e:
    print(f"Failed to load the model or tokenizer from './ner_model': {e}")
    print("Make sure the './ner_model' folder exists and contains a trained model.")
    sys.exit(1)
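
# './ner_model' is assumed to hold a checkpoint saved by an earlier fine-tuning
# run that is not part of this script, e.g. (a sketch):
#
#     trainer.save_model("./ner_model")
#     tokenizer.save_pretrained("./ner_model")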

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the NER labels to the subwords.

    Only the first subword of each word keeps the word's label; special tokens
    and continuation subwords get -100 so the loss function ignores them.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens ([CLS], [SEP], padding) have no word id.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a new word keeps the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation subwords are masked out.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
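
# Worked example of the alignment above (a sketch; the actual subword splits
# depend on the IndoBERT vocabulary):
#
#   words:  ["Joko", "Widodo"]            ner_tags: [B-PERSON, I-PERSON]
#   tokens: [CLS]  Joko      Wido      ##do  [SEP]
#   labels: -100   B-PERSON  I-PERSON  -100  -100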

# Preprocess the full dataset. This step is kept from the training pipeline;
# the Gradio app below does not use the result directly.
try:
    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
except Exception as e:
    print(f"Failed to tokenize the dataset: {e}")
    sys.exit(1)
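
# Optional: peek at one preprocessed example to verify the alignment
# (uncomment to inspect):
# print(tokenized_dataset["train"][0]["labels"][:10])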

def predict_entities(input_text):
    """Run the NER model on raw text and return a token/label table."""
    if not input_text.strip():
        # Return an empty table rather than a string, since the output
        # component is a gr.Dataframe.
        return pd.DataFrame(columns=["Token", "Entity"])

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    labels = [id2label[int(pred)] for pred in predictions]

    # Skip the special tokens when building the output table.
    result = []
    for token, label in zip(tokens, labels):
        if token not in ["[CLS]", "[SEP]"]:
            result.append({"Token": token, "Entity": label})

    return pd.DataFrame(result)
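
# Quick smoke test (hypothetical input; uncomment to try the function directly
# before launching the UI):
# print(predict_entities("Joko Widodo berkunjung ke Bali."))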

# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# Named Entity Recognition (NER) with IndoBERT")
    gr.Markdown("Enter Indonesian text to detect entities such as PERSON, ORGANISATION, PLACE, etc.")

    gr.Markdown("## Entity Label Legend")
    gr.Markdown("""
- **O**: Not an entity (e.g. "dan", "mengunjungi").
- **B-PERSON**: Beginning of a person's name (e.g. "Joko" in "Joko Widodo").
- **I-PERSON**: Continuation of a person's name (e.g. "Widodo" or "##do" in "Joko Widodo").
- **B-PLACE**: Beginning of a place name (e.g. "Bali").
- **I-PLACE**: Continuation of a place name (e.g. "Indonesia" in "Bali, Indonesia").
""")

    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Example: Joko Widodo menghadiri acara di Universitas Indonesia pada tanggal 14 Juni 2025",
            lines=3,
        )
        submit_button = gr.Button("Predict")
        clear_button = gr.Button("Clear")

    output_table = gr.Dataframe(label="Prediction Results")

    gr.Markdown("## Example Texts")
    gr.Markdown("- SBY berkunjung ke Bali bersama Jokowi.\n- Universitas Gadjah Mada menyelenggarakan seminar pada 10 Maret 2025.")

    gr.Markdown("## Data Security, Privacy, and Ethical Considerations")
    gr.Markdown("""
- **Data security**: The dataset is drawn from public news articles and contains no sensitive information such as addresses or ID numbers.
- **Privacy**: User input is not stored.
- **AI ethics**: The dataset covers a range of news topics (politics, sports, culture), reducing the risk of bias toward particular entities.
""")

    submit_button.click(fn=predict_entities, inputs=text_input, outputs=output_table)
    clear_button.click(fn=lambda: "", inputs=None, outputs=text_input)

demo.launch()
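
# launch() starts a local server by default; Gradio also accepts options such
# as share=True (temporary public link) or server_name="0.0.0.0" (bind all
# interfaces) if remote access is needed.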