import tensorflow as tf
import numpy as np
import pickle
import re
import os

# Prefer the shared text_processing helpers; fall back to local implementations
# when that module is not available.
try:
    from text_processing import clean_text, simple_sentence_tokenize, tokenize_words
except ImportError:

    def clean_text(text):
        """More robust text cleaning."""
        if not isinstance(text, str):
            return ""

        # Collapse runs of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)

        # Keep only word characters, whitespace, and basic punctuation.
        text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)

        # Collapse repeated punctuation (e.g. "!!" or "??") into a single period.
        text = re.sub(r'[.,!?;:]{2,}', '.', text)

        return text.strip()

    def simple_sentence_tokenize(text):
        """Simple sentence tokenization without NLTK."""
        text = text.replace('\n', ' ').strip()

        # Split on whitespace that follows sentence-ending punctuation.
        sentences = []
        for part in re.split(r'(?<=[.!?])\s+', text):
            if part.strip():
                sentences.append(part.strip())

        # If no boundary was found, treat the whole text as one sentence.
        if not sentences:
            return [text]

        return sentences

    def tokenize_words(text):
        """Simple word tokenization without NLTK."""
        text = text.lower()
        # Replace punctuation with spaces, then split on whitespace.
        text = re.sub(r'[^\w\s]', ' ', text)
        return [word for word in text.split() if word.strip()]
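
# Quick sanity check (hedged, illustrative only) of the fallback helpers above:
# clean_text collapses whitespace and repeated punctuation, and
# simple_sentence_tokenize splits on whitespace that follows ., ! or ?, e.g.:
#
#   clean_text("Halo   dunia!!")                    # -> "Halo dunia."
#   simple_sentence_tokenize("One. Two! Three?")    # -> ["One.", "Two!", "Three?"]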

class TextSummarizer:
    def __init__(self, model_path=None, input_tokenizer_path=None, output_tokenizer_path=None):
        """Initialize the text summarizer with an optional model and tokenizers."""
        self.model = None
        self.input_tokenizer = None
        self.output_tokenizer = None
        self.max_input_len = 200

        # Load the model and tokenizers right away if the files are available.
        if model_path and os.path.exists(model_path) and input_tokenizer_path and os.path.exists(input_tokenizer_path):
            self.load(model_path, input_tokenizer_path, output_tokenizer_path)
    def load(self, model_path, input_tokenizer_path, output_tokenizer_path=None):
        """Load the model and tokenizers from disk."""
        try:
            # Load the trained Keras model.
            self.model = tf.keras.models.load_model(model_path)
            print(f"Model loaded successfully from {model_path}")

            # Load the tokenizer used for the input text.
            with open(input_tokenizer_path, 'rb') as handle:
                self.input_tokenizer = pickle.load(handle)
            print(f"Input tokenizer loaded successfully from {input_tokenizer_path}")

            # The output tokenizer is optional.
            if output_tokenizer_path and os.path.exists(output_tokenizer_path):
                with open(output_tokenizer_path, 'rb') as handle:
                    self.output_tokenizer = pickle.load(handle)
                print(f"Output tokenizer loaded successfully from {output_tokenizer_path}")

            return True
        except Exception as e:
            print(f"Error while loading model and tokenizers: {e}")
            return False
    def predict_sentence_importance(self, sentences):
        """Predict an importance score for each sentence using the model."""
        if self.model is None or self.input_tokenizer is None:
            raise ValueError("Model or tokenizer has not been loaded")

        # Convert each sentence to a padded sequence of token ids.
        sequences = []
        for sentence in sentences:
            seq = self.input_tokenizer.texts_to_sequences([sentence])
            if seq[0]:
                padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
                    seq, maxlen=self.max_input_len, padding='post'
                )
                sequences.append(padded_seq)
            else:
                # No known tokens: use an all-zero sequence as a placeholder.
                sequences.append(np.zeros((1, self.max_input_len)))

        # Score each sentence individually with the model.
        importance_scores = []
        for seq in sequences:
            score = self.model.predict(seq, verbose=0)[0][0]
            importance_scores.append(score)

        return importance_scores
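    # Note: the loop above calls predict() once per sentence. A minimal sketch of
    # batching all sentences into a single call, assuming the model accepts a
    # (batch, max_input_len) array and outputs one score per row, might look like:
    #
    #   seqs = self.input_tokenizer.texts_to_sequences(sentences)
    #   padded = tf.keras.preprocessing.sequence.pad_sequences(
    #       seqs, maxlen=self.max_input_len, padding='post'
    #   )
    #   scores = self.model.predict(padded, verbose=0)[:, 0]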
    def summarize(self, text, max_sentences=3):
        """Summarize the text using the model or a positional extractive fallback."""
        # Clean the input first.
        cleaned_text = clean_text(text)
        if not cleaned_text:
            return "The text is invalid or empty."

        # Prefer NLTK sentence tokenization; fall back to the simple splitter.
        try:
            import nltk
            from nltk.tokenize import sent_tokenize
            nltk.download('punkt', quiet=True)
            sentences = sent_tokenize(cleaned_text)
        except Exception:
            sentences = simple_sentence_tokenize(cleaned_text)

        # Nothing to trim if the text is already short enough.
        if len(sentences) <= max_sentences:
            return cleaned_text

        # Model-based extractive summarization, if a model is available.
        if self.model is not None and self.input_tokenizer is not None:
            try:
                importance_scores = self.predict_sentence_importance(sentences)

                # Pick the highest-scoring sentences and keep their original order.
                top_indices = np.argsort(importance_scores)[-max_sentences:]
                top_indices = sorted(top_indices)

                summary_sentences = [sentences[i] for i in top_indices]

                return " ".join(summary_sentences)

            except Exception as e:
                print(f"Error during model prediction: {e}")

        # Positional fallback: keep the first, last, and (optionally) a middle sentence.
        summary_sentences = [sentences[0]]

        if max_sentences >= 2:
            summary_sentences.append(sentences[-1])

        if max_sentences >= 3 and len(sentences) > 2:
            middle_idx = len(sentences) // 2
            if sentences[middle_idx] not in summary_sentences:
                summary_sentences.insert(1, sentences[middle_idx])

        # Re-order the selected sentences by their position in the original text.
        positions = []
        for sent in summary_sentences:
            for i, s in enumerate(sentences):
                if sent == s:
                    positions.append(i)
                    break

        sorted_pairs = sorted(zip(positions, summary_sentences))
        ordered_summary = [pair[1] for pair in sorted_pairs]

        return " ".join(ordered_summary)
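
# Hedged sketch (illustrative only): load() above expects a saved Keras model and a
# pickled Keras Tokenizer. Assuming those conventions, compatible test artifacts
# could be produced roughly like this; the layer sizes are placeholders, not a
# description of the actual trained model:
#
#   tok = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
#   tok.fit_on_texts(["contoh kalimat pertama", "contoh kalimat kedua"])
#   with open("input_tokenizer.pickle", "wb") as f:
#       pickle.dump(tok, f)
#
#   model = tf.keras.Sequential([
#       tf.keras.layers.Embedding(5000, 32),
#       tf.keras.layers.GlobalAveragePooling1D(),
#       tf.keras.layers.Dense(1, activation="sigmoid"),
#   ])
#   model.save("text_summarizer_model.keras")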

def summarize_text(text, max_sentences=3):
    """Convenience function to summarize text without requiring a model."""
    # Clean the input first.
    cleaned_text = clean_text(text)
    if not cleaned_text:
        return "The text is invalid or empty."

    # Split into sentences with the simple splitter.
    sentences = simple_sentence_tokenize(cleaned_text)

    # Nothing to trim if the text is already short enough.
    if len(sentences) <= max_sentences:
        return cleaned_text

    # Positional heuristic: keep the first, last, and (optionally) a middle sentence.
    summary_sentences = [sentences[0]]

    if max_sentences >= 2:
        summary_sentences.append(sentences[-1])

    if max_sentences >= 3 and len(sentences) > 2:
        middle_idx = len(sentences) // 2
        if sentences[middle_idx] not in summary_sentences:
            summary_sentences.insert(1, sentences[middle_idx])

    # Re-order the selected sentences by their position in the original text.
    positions = []
    for sent in summary_sentences:
        for i, s in enumerate(sentences):
            if sent == s:
                positions.append(i)
                break

    sorted_pairs = sorted(zip(positions, summary_sentences))
    ordered_summary = [pair[1] for pair in sorted_pairs]

    return " ".join(ordered_summary)
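
# Quick illustration (hypothetical input): with the positional heuristic above,
# summarize_text keeps the first, a middle, and the last sentence, in their
# original order, e.g.:
#
#   summarize_text("A. B. C. D. E.", max_sentences=3)  # -> "A. C. E."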

if __name__ == "__main__":
    # Sample text (Indonesian) for a quick demonstration.
    sample_text = '''
    Pemerintah Indonesia telah mengumumkan rencana pembangunan ibu kota baru di Kalimantan Timur.
    Keputusan ini diambil setelah melalui studi yang panjang terkait berbagai aspek, termasuk
    ketahanan terhadap bencana, ketersediaan lahan, dan potensi ekonomi. Ibu kota baru ini diharapkan
    dapat mengurangi kepadatan di Jakarta dan mendistribusikan pembangunan ekonomi secara lebih merata.
    Proyek ambisius ini membutuhkan investasi besar dan akan dilaksanakan secara bertahap dalam
    jangka waktu beberapa tahun. Para ahli menyatakan bahwa perpindahan ibu kota ini juga akan
    membawa tantangan tersendiri, terutama dalam hal infrastruktur dan adaptasi masyarakat.
    '''

    print("\nOriginal text:\n", sample_text)
    print("\nSimple summary:\n", summarize_text(sample_text))

    # Try the model-based summarizer if model and tokenizer files are present.
    try:
        files = os.listdir('.')
        model_file = next((f for f in files if f.startswith('text_summarizer_model') and (f.endswith('.keras') or f.endswith('.h5'))), None)
        input_tokenizer_file = 'input_tokenizer.pickle' if 'input_tokenizer.pickle' in files else None

        if model_file and input_tokenizer_file:
            summarizer = TextSummarizer(
                model_path=model_file,
                input_tokenizer_path=input_tokenizer_file
            )

            print("\nModel-based summary:\n", summarizer.summarize(sample_text))
        else:
            print("\nModel or tokenizer file not found.")
    except Exception as e:
        print(f"\nCould not use the model: {e}")