import re


def clean_text(text):
    """More robust text cleaning."""
    if not isinstance(text, str):
        return ""
    # Collapse runs of whitespace (spaces, tabs, newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Keep only word characters, whitespace, and common punctuation.
    text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)
    # Collapse repeated punctuation ("!!", "...", "?!") into a single period.
    text = re.sub(r'[.,!?;:]{2,}', '.', text)
    return text.strip()


def simple_sentence_tokenize(text):
    """Simple sentence tokenization without NLTK."""
    text = text.replace('\n', ' ').strip()
    # Split after sentence-ending punctuation followed by whitespace.
    sentences = [part.strip()
                 for part in re.split(r'(?<=[.!?])\s+', text)
                 if part.strip()]
    if not sentences:
        # No terminal punctuation found: treat the whole text as one sentence;
        # an empty input yields an empty list instead of [''].
        return [text] if text else []
    return sentences


def tokenize_words(text):
    """Simple word tokenization without NLTK."""
    text = text.lower()
    # Replace punctuation with spaces so adjoining words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # str.split() with no argument already drops empty strings.
    return text.split()
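

# Example usage: a minimal sketch showing the three helpers together.
# The sample string is illustrative only, not taken from any real dataset.
if __name__ == "__main__":
    raw = "Halo!!  Ini   contoh\tteks... Apakah berfungsi? Ya."
    cleaned = clean_text(raw)
    print(cleaned)
    # -> Halo. Ini contoh teks. Apakah berfungsi? Ya.
    print(simple_sentence_tokenize(cleaned))
    # -> ['Halo.', 'Ini contoh teks.', 'Apakah berfungsi?', 'Ya.']
    print(tokenize_words(cleaned))
    # -> ['halo', 'ini', 'contoh', 'teks', 'apakah', 'berfungsi', 'ya']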