import re


def clean_text(text):
    """More robust text cleaning."""
    if not isinstance(text, str):
        return ""
    # Collapse runs of whitespace (spaces, tabs, newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Keep only word characters, whitespace, and common punctuation.
    text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)
    # Collapse repeated punctuation ("!!", "...", "?!") into a single period.
    text = re.sub(r'[.,!?;:]{2,}', '.', text)
    return text.strip()


def simple_sentence_tokenize(text):
    """Simple sentence tokenization without NLTK."""
    text = text.replace('\n', ' ').strip()
    # Split after sentence-ending punctuation followed by whitespace.
    sentences = [part.strip()
                 for part in re.split(r'(?<=[.!?])\s+', text)
                 if part.strip()]
    if not sentences:
        # No terminal punctuation found: treat the whole text as one sentence;
        # an empty input yields an empty list instead of [''].
        return [text] if text else []
    return sentences


def tokenize_words(text):
    """Simple word tokenization without NLTK."""
    text = text.lower()
    # Replace punctuation with spaces so adjoining words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # str.split() with no argument already drops empty strings.
    return text.split()
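

# Example usage: a minimal sketch showing the three helpers together.
# The sample string is illustrative only, not taken from any real dataset.
if __name__ == "__main__":
    raw = "Halo!!  Ini   contoh\tteks... Apakah berfungsi? Ya."
    cleaned = clean_text(raw)
    print(cleaned)
    # -> Halo. Ini contoh teks. Apakah berfungsi? Ya.
    print(simple_sentence_tokenize(cleaned))
    # -> ['Halo.', 'Ini contoh teks.', 'Apakah berfungsi?', 'Ya.']
    print(tokenize_words(cleaned))
    # -> ['halo', 'ini', 'contoh', 'teks', 'apakah', 'berfungsi', 'ya']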