text-summarize-fix / summarizer.py

Upload folder using huggingface_hub

63aa5b2 verified about 1 month ago

9.83 kB


	import tensorflow as tf
	import numpy as np
	import pickle
	import re
	import os

	# Import fungsi pemrosesan teks jika tersedia
	try:
	from text_processing import clean_text, simple_sentence_tokenize, tokenize_words
	except ImportError:
	# Definisi fungsi inline jika modul tidak tersedia
	def clean_text(text):
	    """Normalize raw text before sentence splitting.

	    Steps, in order:
	      1. Drop every character that is not a word character, whitespace,
	         or one of ``. , ! ? ; : - ( )``.
	      2. Replace any run of two or more of ``. , ! ? ; :`` with a single
	         period.
	      3. Collapse every whitespace run into one space and trim the ends.

	    Whitespace is collapsed last on purpose: removing a special character
	    that sits between two spaces would otherwise leave a double space in
	    the result.

	    Args:
	        text: The text to clean. Anything that is not a ``str`` is treated
	            as invalid input.

	    Returns:
	        The cleaned string, or ``""`` when ``text`` is not a ``str``.
	    """
	    if not isinstance(text, str):
	        return ""

	    # Remove special characters but keep basic punctuation.
	    text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)

	    # Squash repeated punctuation ("!!!", "?!", "...") into one period.
	    text = re.sub(r'[.,!?;:]{2,}', '.', text)

	    # Collapse whitespace only after the removals above so that gaps left
	    # behind by stripped characters are normalized as well.
	    text = re.sub(r'\s+', ' ', text)

	    return text.strip()

	def simple_sentence_tokenize(text):
	    """Split text into sentences without relying on NLTK.

	    Newlines are turned into spaces and the text is trimmed; the result is
	    then cut at whitespace that follows ``.``, ``!`` or ``?``.

	    Args:
	        text: The string to split.

	    Returns:
	        A list of stripped, non-empty sentences. When no sentence is found
	        the flattened text itself is returned as the only element.
	    """
	    flattened = text.replace('\n', ' ').strip()

	    # The lookbehind keeps the terminating punctuation on each sentence.
	    fragments = re.split(r'(?<=[.!?])\s+', flattened)
	    found = [fragment.strip() for fragment in fragments if fragment.strip()]

	    return found if found else [flattened]

	def tokenize_words(text):
	    """Break text into lowercase word tokens without relying on NLTK.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        A list of lowercase tokens; every character that is neither a word
	        character nor whitespace acts as a separator.
	    """
	    lowered = text.lower()
	    # Turn punctuation into spaces so that it separates words.
	    spaced = re.sub(r'[^\w\s]', ' ', lowered)
	    # str.split() with no argument never yields empty tokens.
	    return spaced.split()

	class TextSummarizer:
	    """Extractive summarizer with an optional model-based sentence scorer.

	    When a model and an input tokenizer are loaded, sentences are ranked by
	    the model's score and the top ones are kept. Otherwise (or when
	    prediction fails) a first/middle/last heuristic is used.

	    Attributes:
	        model: Object returned by ``tf.keras.models.load_model``, or None.
	        input_tokenizer: Unpickled tokenizer used to encode sentences, or None.
	        output_tokenizer: Optional unpickled tokenizer; it is stored but not
	            used by any method of this class.
	        max_input_len: Length every encoded sentence is padded/truncated to.
	    """

	    def __init__(self, model_path=None, input_tokenizer_path=None,
	                 output_tokenizer_path=None, max_input_len=200):
	        """Create a summarizer, loading model and tokenizers when available.

	        Args:
	            model_path: Optional path to a saved Keras model.
	            input_tokenizer_path: Optional path to the pickled input tokenizer.
	            output_tokenizer_path: Optional path to the pickled output tokenizer.
	            max_input_len: Padded sequence length fed to the model
	                (defaults to 200, the value previously hard-coded).
	        """
	        self.model = None
	        self.input_tokenizer = None
	        self.output_tokenizer = None
	        self.max_input_len = max_input_len

	        # Only attempt a load when both mandatory files actually exist.
	        if (model_path and os.path.exists(model_path)
	                and input_tokenizer_path and os.path.exists(input_tokenizer_path)):
	            self.load(model_path, input_tokenizer_path, output_tokenizer_path)

	    def load(self, model_path, input_tokenizer_path, output_tokenizer_path=None):
	        """Load the model and tokenizer(s) from disk.

	        Everything is read into local variables first and only committed to
	        the instance once all reads succeeded, so a failure never leaves the
	        instance with a model but no tokenizer.

	        Args:
	            model_path: Path to the saved Keras model.
	            input_tokenizer_path: Path to the pickled input tokenizer.
	            output_tokenizer_path: Optional path to the pickled output
	                tokenizer; skipped when missing or not found on disk.

	        Returns:
	            True on success, False when any step raised (the error is printed).
	        """
	        try:
	            model = tf.keras.models.load_model(model_path)
	            print(f"Model berhasil dimuat dari {model_path}")

	            # NOTE(security): pickle.load can execute arbitrary code; only
	            # point this at tokenizer files from a trusted source.
	            with open(input_tokenizer_path, 'rb') as handle:
	                input_tokenizer = pickle.load(handle)
	            print(f"Input tokenizer berhasil dimuat dari {input_tokenizer_path}")

	            has_output_tokenizer = False
	            output_tokenizer = None
	            if output_tokenizer_path and os.path.exists(output_tokenizer_path):
	                with open(output_tokenizer_path, 'rb') as handle:
	                    output_tokenizer = pickle.load(handle)
	                has_output_tokenizer = True
	                print(f"Output tokenizer berhasil dimuat dari {output_tokenizer_path}")
	        except Exception as e:
	            print(f"Error saat memuat model dan tokenizer: {e}")
	            return False

	        self.model = model
	        self.input_tokenizer = input_tokenizer
	        if has_output_tokenizer:
	            self.output_tokenizer = output_tokenizer
	        return True

	    def predict_sentence_importance(self, sentences):
	        """Score each sentence with the loaded model.

	        All sentences are encoded and scored in a single batched
	        ``model.predict`` call instead of one call per sentence.

	        Args:
	            sentences: Iterable of sentence strings.

	        Returns:
	            A list with one score per sentence, in input order (element 0 of
	            each prediction row). Empty input yields an empty list.

	        Raises:
	            ValueError: If the model or the input tokenizer is not loaded.
	        """
	        if self.model is None or self.input_tokenizer is None:
	            raise ValueError("Model atau tokenizer belum dimuat")

	        sentences = list(sentences)
	        if not sentences:
	            return []

	        # Assumes a Keras-style tokenizer whose texts_to_sequences encodes
	        # each text independently -- TODO confirm for custom tokenizers.
	        sequences = self.input_tokenizer.texts_to_sequences(sentences)

	        # Sentences that encode to an empty sequence become all-zero rows,
	        # matching the explicit zero vector used for them previously.
	        batch = tf.keras.preprocessing.sequence.pad_sequences(
	            sequences, maxlen=self.max_input_len, padding='post'
	        )

	        predictions = self.model.predict(batch, verbose=0)
	        return [row[0] for row in predictions]

	    @staticmethod
	    def _split_sentences(text):
	        """Split text with NLTK when usable, else with the simple tokenizer."""
	        try:
	            import nltk
	            from nltk.tokenize import sent_tokenize
	            try:
	                return sent_tokenize(text)
	            except LookupError:
	                # Tokenizer data is missing: fetch it once and retry, rather
	                # than contacting the download server on every call.
	                nltk.download('punkt', quiet=True)
	                return sent_tokenize(text)
	        except Exception:
	            # NLTK missing or unusable: fall back to the regex-based splitter.
	            return simple_sentence_tokenize(text)

	    @staticmethod
	    def _extractive_fallback(sentences, max_sentences):
	        """Join the first, middle and last sentences in document order."""
	        if not sentences:
	            return ""

	        chosen = [sentences[0]]  # the first sentence is always kept
	        if max_sentences >= 2:
	            chosen.append(sentences[-1])
	        if max_sentences >= 3 and len(sentences) > 2:
	            middle = sentences[len(sentences) // 2]
	            if middle not in chosen:
	                chosen.insert(1, middle)

	        # Restore original order using each sentence's first occurrence.
	        ordered = sorted((sentences.index(sent), sent) for sent in chosen)
	        return " ".join(sent for _, sent in ordered)

	    def summarize(self, text, max_sentences=3):
	        """Summarize text with the model, or with the extractive heuristic.

	        Args:
	            text: The text to summarize.
	            max_sentences: Maximum number of sentences to keep; values
	                below 1 are treated as 1.

	        Returns:
	            The summary string; the cleaned text itself when it already has
	            at most ``max_sentences`` sentences; or a fixed error message
	            when the cleaned text is empty.
	        """
	        cleaned_text = clean_text(text)
	        if not cleaned_text:
	            return "Teks tidak valid atau kosong."

	        # A non-positive budget would make the [-max_sentences:] slice below
	        # select every sentence, so clamp it to a single sentence.
	        max_sentences = max(1, max_sentences)

	        sentences = self._split_sentences(cleaned_text)

	        # Already short enough: nothing to drop.
	        if len(sentences) <= max_sentences:
	            return cleaned_text

	        if self.model is not None and self.input_tokenizer is not None:
	            try:
	                importance_scores = self.predict_sentence_importance(sentences)
	                # Take the highest-scoring sentences, then put them back in
	                # their original order.
	                top_indices = sorted(np.argsort(importance_scores)[-max_sentences:])
	                return " ".join(sentences[i] for i in top_indices)
	            except Exception as e:
	                print(f"Error saat prediksi model: {e}")
	                # Fall through to the extractive heuristic below.

	        return self._extractive_fallback(sentences, max_sentences)

	def summarize_text(text, max_sentences=3):
	    """Summarize text with the first/middle/last heuristic (no model needed).

	    Args:
	        text: The text to summarize.
	        max_sentences: Sentence budget. Texts with at most this many
	            sentences are returned cleaned but otherwise unchanged.

	    Returns:
	        The selected sentences joined by spaces in document order, the
	        cleaned text when it is already short enough, or a fixed error
	        message when the cleaned text is empty.
	    """
	    cleaned = clean_text(text)
	    if not cleaned:
	        return "Teks tidak valid atau kosong."

	    sentences = simple_sentence_tokenize(cleaned)
	    total = len(sentences)

	    # Nothing to shorten.
	    if total <= max_sentences:
	        return cleaned

	    # The first sentence is always kept; the last and the middle one are
	    # added as the budget allows.
	    picked = [sentences[0]]
	    if max_sentences >= 2:
	        picked.append(sentences[-1])
	    if max_sentences >= 3 and total > 2:
	        middle = sentences[total // 2]
	        if middle not in picked:
	            picked.insert(1, middle)

	    # Order by each sentence's first occurrence in the text.
	    ranked = sorted((sentences.index(sent), sent) for sent in picked)
	    return " ".join(sent for _, sent in ranked)

	# Usage example
	if __name__ == "__main__":
	    # Sample text
	    sample_text = '''
	Pemerintah Indonesia telah mengumumkan rencana pembangunan ibu kota baru di Kalimantan Timur.
	Keputusan ini diambil setelah melalui studi yang panjang terkait berbagai aspek, termasuk
	ketahanan terhadap bencana, ketersediaan lahan, dan potensi ekonomi. Ibu kota baru ini diharapkan
	dapat mengurangi kepadatan di Jakarta dan mendistribusikan pembangunan ekonomi secara lebih merata.
	Proyek ambisius ini membutuhkan investasi besar dan akan dilaksanakan secara bertahap dalam
	jangka waktu beberapa tahun. Para ahli menyatakan bahwa perpindahan ibu kota ini juga akan
	membawa tantangan tersendiri, terutama dalam hal infrastruktur dan adaptasi masyarakat.
	'''

	    # Summarize with the model-free helper first.
	    print("\nTeks asli:\n", sample_text)
	    print("\nRingkasan sederhana:\n", summarize_text(sample_text))

	    # Then try the model-backed summarizer, if its files are present.
	    try:
	        # Look for the model and tokenizer files in the current directory.
	        files = os.listdir('.')
	        model_file = next(
	            (
	                f for f in files
	                if f.startswith('text_summarizer_model') and f.endswith(('.keras', '.h5'))
	            ),
	            None,
	        )
	        input_tokenizer_file = None
	        if 'input_tokenizer.pickle' in files:
	            input_tokenizer_file = 'input_tokenizer.pickle'

	        if not (model_file and input_tokenizer_file):
	            print("\nFile model atau tokenizer tidak ditemukan.")
	        else:
	            summarizer = TextSummarizer(
	                model_path=model_file,
	                input_tokenizer_path=input_tokenizer_file,
	            )
	            print("\nRingkasan dengan model:\n", summarizer.summarize(sample_text))
	    except Exception as e:
	        print(f"\nTidak dapat menggunakan model: {e}")