import sys

import pandas as pd
import torch
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset to recover the label list used during training
try:
    dataset = load_dataset("indonlp/indonlu", "nergrit", trust_remote_code=True)
except Exception as e:
    print(f"Failed to load dataset: {e}")
    sys.exit(1)

# Verify dataset structure
if "train" not in dataset or "test" not in dataset:
    print("Dataset does not have the expected train/test splits.")
    sys.exit(1)
if "tokens" not in dataset["train"].column_names or "ner_tags" not in dataset["train"].column_names:
    print("Dataset is missing the 'tokens' or 'ner_tags' column.")
    sys.exit(1)

# Build the label list and the id/label mappings from the dataset features
try:
    label_list = dataset["train"].features["ner_tags"].feature.names
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}
except Exception as e:
    print(f"Failed to read the label list: {e}")
    sys.exit(1)
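
# For the nergrit config, label_list should contain BIO tags over PERSON,
# ORGANISATION, and PLACE plus "O" (as the UI legend below assumes); printing
# it is an optional sanity check that it matches the model's config:
print(f"Labels: {label_list}")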

# Load tokenizer and model from saved directory
try:
    tokenizer = AutoTokenizer.from_pretrained("./ner_model")
    model = AutoModelForTokenClassification.from_pretrained(
        "./ner_model",
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    model.to(device)
except Exception as e:
    print(f"Failed to load the model or tokenizer from './ner_model': {e}")
    print("Make sure the './ner_model' folder exists and contains the trained model.")
    sys.exit(1)
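
# The './ner_model' directory is expected to hold the output of a prior
# fine-tuning run, e.g. trainer.save_model("./ner_model") together with
# tokenizer.save_pretrained("./ner_model") (assumed training setup; this
# script only performs inference).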

# Tokenize pre-split words and align the word-level NER tags to subword tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens ([CLS], [SEP]) get -100 so the loss ignores them
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only the first subword of each word keeps the word's tag
                label_ids.append(label[word_idx])
            else:
                # Remaining subwords of the same word are masked out with -100
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
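
# A minimal sketch of the alignment on one hypothetical sentence (the exact
# subword split depends on the tokenizer's vocabulary):
#   words     = ["Joko", "Widodo", "ke", "Bali"]
#   word_ids  = [None, 0, 1, 1, 2, 3, None]          # [CLS] ... [SEP]
#   ner_tags  = [B-PERSON, I-PERSON, O, B-PLACE]     # one tag per word
#   label_ids = [-100, B-PERSON, I-PERSON, -100, O, B-PLACE, -100]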

# Tokenize the full dataset (not used by the Gradio demo below; kept so the
# same script can be reused for evaluation)
try:
    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
except Exception as e:
    print(f"Failed to tokenize the dataset: {e}")
    sys.exit(1)

# Predict entity tags for free-form input text
def predict_entities(input_text):
    if not input_text.strip():
        # gr.Dataframe expects tabular output, so return an empty frame
        # rather than a plain string
        return pd.DataFrame(columns=["Token", "Entity"])

    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()

    # Map token ids back to strings and prediction ids to label names
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    labels = [id2label[int(pred)] for pred in predictions]

    # Drop special tokens ([CLS], [SEP]) before display
    result = []
    for token, label in zip(tokens, labels):
        if token not in ["[CLS]", "[SEP]"]:
            result.append({"Token": token, "Entity": label})

    # Convert to DataFrame for display
    return pd.DataFrame(result)
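
# Optional sketch (not wired into the UI): merge WordPiece subwords back into
# whole words so each row is one word rather than one subword piece. This
# assumes the BERT-style "##" continuation prefix; the demo above deliberately
# shows raw subwords, matching the "##do" example in the legend below.
def merge_subwords(rows):
    merged = []
    for row in rows:
        if row["Token"].startswith("##") and merged:
            # Continuation piece: glue onto the previous word, keep its label
            merged[-1]["Token"] += row["Token"][2:]
        else:
            merged.append(dict(row))
    return merged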

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Named Entity Recognition (NER) dengan IndoBERT")
    gr.Markdown("Masukkan teks dalam bahasa Indonesia untuk mendeteksi entitas seperti PERSON, ORGANISATION, PLACE, dll.")
    
    gr.Markdown("## Keterangan Label Entitas")
    gr.Markdown("""

    - **O**: Token bukan entitas (contoh: "dan", "mengunjungi").

    - **B-PERSON**: Awal nama orang (contoh: "Joko" dalam "Joko Widodo").

    - **I-PERSON**: Lanjutan nama orang (contoh: "Widodo" atau "##do" dalam "Joko Widodo").

    - **B-PLACE**: Awal nama tempat (contoh: "Bali").

    - **I-PLACE**: Lanjutan nama tempat (contoh: "Indonesia" dalam "Bali, Indonesia").

    """)
    
    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Example: Joko Widodo menghadiri acara di Universitas Indonesia pada tanggal 14 Juni 2025",
            lines=3
        )
        submit_button = gr.Button("Predict")
        clear_button = gr.Button("Clear")
    
    output_table = gr.Dataframe(label="Prediction Results")
    
    gr.Markdown("## Contoh Teks")
    gr.Markdown("- SBY berkunjung ke Bali bersama Jokowi.\n- Universitas Gadjah Mada menyelenggarakan seminar pada 10 Maret 2025.")
    
    gr.Markdown("## Pertimbangan Keamanan Data, Privasi, dan Etika")
    gr.Markdown("""

    - **Keamanan Data**: Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.

    - **Privasi**: Input pengguna tidak disimpan, menjaga privasi.

    - **Etika AI**: Dataset mencakup berbagai topik berita (politik, olahraga, budaya), mengurangi risiko bias terhadap entitas tertentu.

    """)
    
    submit_button.click(fn=predict_entities, inputs=text_input, outputs=output_table)
    # Clear both the input box and the results table
    clear_button.click(
        fn=lambda: ("", pd.DataFrame(columns=["Token", "Entity"])),
        inputs=None,
        outputs=[text_input, output_table],
    )

# Launch Gradio interface
demo.launch()