fransiskaarthaa committed · verified · Commit 63aa5b2 · 1 Parent(s): 52240c1

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+text_summarizer_model.keras filter=lfs diff=lfs merge=lfs -text
best_summarization_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:574447551ed9579846966f5f96b22ce8f1837f5f8f1b082f7864e95d44ccb167
+size 52147136
input_tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b12569dde48e72535933a34206633e856647a3a17601325e81263eeb36d9336
+size 1323630
output_tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d048a1685f9929ab91e6832a33eea0b07d2e05da99c4ee86794c0bc9467bc6b
+size 644986
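
The LFS pointer entries in this commit only record hashes; the binaries themselves sit in LFS storage. As a minimal, hypothetical sketch (not part of the commit), they could be fetched with huggingface_hub — the repo id "fransiskaarthaa/<repo-name>" below is a placeholder and must be replaced with the actual repository id:

# Hypothetical download sketch; the repo id is a placeholder, not taken from this commit.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="fransiskaarthaa/<repo-name>",
                             filename="text_summarizer_model.keras")
input_tok_path = hf_hub_download(repo_id="fransiskaarthaa/<repo-name>",
                                 filename="input_tokenizer.pickle")
output_tok_path = hf_hub_download(repo_id="fransiskaarthaa/<repo-name>",
                                  filename="output_tokenizer.pickle")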
summarizer.py ADDED
@@ -0,0 +1,253 @@
+
+import tensorflow as tf
+import numpy as np
+import pickle
+import re
+import os
+
+# Import the text-processing helpers if the module is available
+try:
+    from text_processing import clean_text, simple_sentence_tokenize, tokenize_words
+except ImportError:
+    # Inline fallback definitions when the module is not available
+    def clean_text(text):
+        """More robust text cleaning"""
+        if not isinstance(text, str):
+            return ""
+
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove special characters but keep punctuation
+        text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)
+
+        # Collapse repeated punctuation
+        text = re.sub(r'[.,!?;:]{2,}', '.', text)
+
+        return text.strip()
+
+    def simple_sentence_tokenize(text):
+        """Simple sentence tokenization without NLTK"""
+        # Clean the text first
+        text = text.replace('\n', ' ').strip()
+
+        # Split on common sentence-ending punctuation
+        sentences = []
+        for part in re.split(r'(?<=[.!?])\s+', text):
+            if part.strip():
+                sentences.append(part.strip())
+
+        # If no sentences were found, return the whole text as a single sentence
+        if not sentences:
+            return [text]
+
+        return sentences
+
+    def tokenize_words(text):
+        """Simple word tokenization without NLTK"""
+        text = text.lower()
+        # Clean the text
+        text = re.sub(r'[^\w\s]', ' ', text)
+        # Split into words
+        return [word for word in text.split() if word.strip()]
+
+class TextSummarizer:
+    def __init__(self, model_path=None, input_tokenizer_path=None, output_tokenizer_path=None):
+        """Initialize the text summarizer with optional model and tokenizers"""
+        self.model = None
+        self.input_tokenizer = None
+        self.output_tokenizer = None
+        self.max_input_len = 200
+
+        # Load the model and tokenizers if paths are provided
+        if model_path and os.path.exists(model_path) and input_tokenizer_path and os.path.exists(input_tokenizer_path):
+            self.load(model_path, input_tokenizer_path, output_tokenizer_path)
+
+    def load(self, model_path, input_tokenizer_path, output_tokenizer_path=None):
+        """Load the model and tokenizers from files"""
+        try:
+            # Load the model
+            self.model = tf.keras.models.load_model(model_path)
+            print(f"Model loaded from {model_path}")
+
+            # Load the input tokenizer
+            with open(input_tokenizer_path, 'rb') as handle:
+                self.input_tokenizer = pickle.load(handle)
+            print(f"Input tokenizer loaded from {input_tokenizer_path}")
+
+            # Load the output tokenizer if available
+            if output_tokenizer_path and os.path.exists(output_tokenizer_path):
+                with open(output_tokenizer_path, 'rb') as handle:
+                    self.output_tokenizer = pickle.load(handle)
+                print(f"Output tokenizer loaded from {output_tokenizer_path}")
+
+            return True
+        except Exception as e:
+            print(f"Error while loading the model and tokenizers: {e}")
+            return False
+
+    def predict_sentence_importance(self, sentences):
+        """Predict sentence importance scores with the model"""
+        if self.model is None or self.input_tokenizer is None:
+            raise ValueError("Model or tokenizer has not been loaded")
+
+        # Tokenize and pad every sentence
+        sequences = []
+        for sentence in sentences:
+            seq = self.input_tokenizer.texts_to_sequences([sentence])
+            if seq[0]:  # If not empty
+                padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
+                    seq, maxlen=self.max_input_len, padding='post'
+                )
+                sequences.append(padded_seq)
+            else:
+                # If tokenization fails, fall back to a zero vector
+                sequences.append(np.zeros((1, self.max_input_len)))
+
+        # Predict an importance score for each sentence
+        importance_scores = []
+        for seq in sequences:
+            score = self.model.predict(seq, verbose=0)[0][0]
+            importance_scores.append(score)
+
+        return importance_scores
+
+    def summarize(self, text, max_sentences=3):
+        """Summarize text with the model, or fall back to an extractive heuristic"""
+        # Preprocessing
+        cleaned_text = clean_text(text)
+        if not cleaned_text:
+            return "Text is invalid or empty."
+
+        # Sentence tokenization
+        try:
+            # Use NLTK if it is available
+            import nltk
+            from nltk.tokenize import sent_tokenize
+            nltk.download('punkt', quiet=True)
+            sentences = sent_tokenize(cleaned_text)
+        except Exception:
+            # Fall back to the simple tokenizer
+            sentences = simple_sentence_tokenize(cleaned_text)
+
+        # If the text is already short, return it as is
+        if len(sentences) <= max_sentences:
+            return cleaned_text
+
+        # Use the model to score sentence importance if it is available
+        if self.model is not None and self.input_tokenizer is not None:
+            try:
+                importance_scores = self.predict_sentence_importance(sentences)
+
+                # Take the indices of the highest-scoring sentences
+                top_indices = np.argsort(importance_scores)[-max_sentences:]
+                top_indices = sorted(top_indices)  # Keep the original order
+
+                # Collect the important sentences
+                summary_sentences = [sentences[i] for i in top_indices]
+
+                return " ".join(summary_sentences)
+
+            except Exception as e:
+                print(f"Error during model prediction: {e}")
+                # Fall back to the extractive strategy
+
+        # Simple extractive strategy (first, middle, last sentence)
+        summary_sentences = [sentences[0]]  # The first sentence is always kept
+
+        if max_sentences >= 2:
+            summary_sentences.append(sentences[-1])  # Last sentence
+
+        if max_sentences >= 3 and len(sentences) > 2:
+            # Add the middle sentence
+            middle_idx = len(sentences) // 2
+            if sentences[middle_idx] not in summary_sentences:
+                summary_sentences.insert(1, sentences[middle_idx])
+
+        # Re-order by original position in the text
+        positions = []
+        for sent in summary_sentences:
+            for i, s in enumerate(sentences):
+                if sent == s:
+                    positions.append(i)
+                    break
+
+        sorted_pairs = sorted(zip(positions, summary_sentences))
+        ordered_summary = [pair[1] for pair in sorted_pairs]
+
+        return " ".join(ordered_summary)
+
+def summarize_text(text, max_sentences=3):
+    """Convenience function that summarizes text without requiring a model"""
+    # Preprocessing
+    cleaned_text = clean_text(text)
+    if not cleaned_text:
+        return "Text is invalid or empty."
+
+    # Sentence tokenization
+    sentences = simple_sentence_tokenize(cleaned_text)
+
+    # If the text is already short, return it as is
+    if len(sentences) <= max_sentences:
+        return cleaned_text
+
+    # Simple extractive strategy (first, middle, last sentence)
+    summary_sentences = [sentences[0]]  # The first sentence is always kept
+
+    if max_sentences >= 2:
+        summary_sentences.append(sentences[-1])  # Last sentence
+
+    if max_sentences >= 3 and len(sentences) > 2:
+        # Add the middle sentence
+        middle_idx = len(sentences) // 2
+        if sentences[middle_idx] not in summary_sentences:
+            summary_sentences.insert(1, sentences[middle_idx])
+
+    # Re-order by original position in the text
+    positions = []
+    for sent in summary_sentences:
+        for i, s in enumerate(sentences):
+            if sent == s:
+                positions.append(i)
+                break
+
+    sorted_pairs = sorted(zip(positions, summary_sentences))
+    ordered_summary = [pair[1] for pair in sorted_pairs]
+
+    return " ".join(ordered_summary)
+
+# Example usage
+if __name__ == "__main__":
+    # Sample text (Indonesian)
+    sample_text = '''
+    Pemerintah Indonesia telah mengumumkan rencana pembangunan ibu kota baru di Kalimantan Timur.
+    Keputusan ini diambil setelah melalui studi yang panjang terkait berbagai aspek, termasuk
+    ketahanan terhadap bencana, ketersediaan lahan, dan potensi ekonomi. Ibu kota baru ini diharapkan
+    dapat mengurangi kepadatan di Jakarta dan mendistribusikan pembangunan ekonomi secara lebih merata.
+    Proyek ambisius ini membutuhkan investasi besar dan akan dilaksanakan secara bertahap dalam
+    jangka waktu beberapa tahun. Para ahli menyatakan bahwa perpindahan ibu kota ini juga akan
+    membawa tantangan tersendiri, terutama dalam hal infrastruktur dan adaptasi masyarakat.
+    '''
+
+    # Summarize the text with the simple, model-free function
+    print("\nOriginal text:\n", sample_text)
+    print("\nSimple summary:\n", summarize_text(sample_text))
+
+    # Try to load the model and summarize the text with it
+    try:
+        # Look for the model and tokenizer files in the current directory
+        files = os.listdir('.')
+        model_file = next((f for f in files if f.startswith('text_summarizer_model') and (f.endswith('.keras') or f.endswith('.h5'))), None)
+        input_tokenizer_file = 'input_tokenizer.pickle' if 'input_tokenizer.pickle' in files else None
+
+        if model_file and input_tokenizer_file:
+            summarizer = TextSummarizer(
+                model_path=model_file,
+                input_tokenizer_path=input_tokenizer_file
+            )
+
+            print("\nSummary with the model:\n", summarizer.summarize(sample_text))
+        else:
+            print("\nModel or tokenizer file not found.")
+    except Exception as e:
+        print(f"\nCould not use the model: {e}")
text_processing.py ADDED
@@ -0,0 +1,43 @@
+
+import re
+
+def clean_text(text):
+    """More robust text cleaning"""
+    if not isinstance(text, str):
+        return ""
+
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+
+    # Remove special characters but keep punctuation
+    text = re.sub(r'[^\w\s.,!?;:\-()]', '', text)
+
+    # Collapse repeated punctuation
+    text = re.sub(r'[.,!?;:]{2,}', '.', text)
+
+    return text.strip()
+
+def simple_sentence_tokenize(text):
+    """Simple sentence tokenization without NLTK"""
+    # Clean the text first
+    text = text.replace('\n', ' ').strip()
+
+    # Split on common sentence-ending punctuation
+    sentences = []
+    for part in re.split(r'(?<=[.!?])\s+', text):
+        if part.strip():
+            sentences.append(part.strip())
+
+    # If no sentences were found, return the whole text as a single sentence
+    if not sentences:
+        return [text]
+
+    return sentences
+
+def tokenize_words(text):
+    """Simple word tokenization without NLTK"""
+    text = text.lower()
+    # Clean the text
+    text = re.sub(r'[^\w\s]', ' ', text)
+    # Split into words
+    return [word for word in text.split() if word.strip()]
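
A small sketch of the standalone helpers above in isolation (the sample string is made up for illustration):

# Quick illustration of the text_processing helpers; the input string is arbitrary.
from text_processing import clean_text, simple_sentence_tokenize, tokenize_words

raw = "Ini   contoh!!  Teks dengan  spasi berlebih."
print(clean_text(raw))                            # "Ini contoh. Teks dengan spasi berlebih."
print(simple_sentence_tokenize(clean_text(raw)))  # ['Ini contoh.', 'Teks dengan spasi berlebih.']
print(tokenize_words(raw))                        # ['ini', 'contoh', 'teks', 'dengan', 'spasi', 'berlebih']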
text_summarizer_model.keras ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb6ae300f65676aee543b9cf392ed381eec2e851f5ae6e77ca6529c071668544
+size 52147778