Yermia committed on
Commit 7af8df4 · 1 Parent(s): e43a761

Fix text processor

Files changed (1)
  1. utils/text_processor.py +371 -127
utils/text_processor.py CHANGED
@@ -1,42 +1,270 @@
  from transformers import (
      AutoTokenizer,
      AutoModelForSeq2SeqLM,
-     AutoModelForTokenClassification,
      pipeline
  )
  from keybert import KeyBERT
- from summarizer import Summarizer
  import re
  import nltk
- nltk.download('punkt')

  class TextProcessor:
      def __init__(self):
-         # Initialize summarization model
-         self.summarizer = Summarizer('bert-base-multilingual-cased')

-         # Initialize KeyBERT for keyword extraction
-         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

-         # Initialize NER for action item detection
-         self.ner_pipeline = pipeline(
-             "ner",
-             model="cahya/bert-base-indonesian-NER",
-             aggregation_strategy="simple"
-         )

-         # Action item patterns
          self.action_patterns = [
-             r"akan\s+(\w+)",
-             r"harus\s+(\w+)",
-             r"perlu\s+(\w+)",
-             r"mohon\s+(\w+)",
-             r"tolong\s+(\w+)",
-             r"segera\s+(\w+)",
-             r"follow\s*up",
-             r"action\s*item",
-             r"to\s*do",
-             r"deadline"
          ]

          # Decision patterns
@@ -47,45 +275,73 @@ class TextProcessor:
              r"keputusan(?:nya)?\s+(.+)",
              r"final(?:isasi)?\s+(.+)"
          ]

      def summarize_transcript(self, transcript_segments, ratio=0.3):
-         """
-         Hierarchical summarization for long transcripts
-         """
-         # Combine text from all segments
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

-         # Chunking for long documents
-         chunks = self._create_chunks(full_text)

-         if len(chunks) == 1:
-             # Direct summarization for short documents
-             return self.summarizer(
-                 chunks[0],
-                 ratio=ratio,
-                 num_sentences=5
-             )
-         else:
-             # Hierarchical summarization
-             return self._hierarchical_summarization(chunks, ratio)

      def extract_key_information(self, transcript_segments):
-         """
-         Extract action items, decisions, and key topics
-         """
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

          # Extract keywords/topics
-         keywords = self.kw_model.extract_keywords(
-             full_text,
-             keyphrase_ngram_range=(1, 3),
-             stop_words='indonesian',
-             top_n=10,
-             use_mmr=True,
-             diversity=0.5
-         )
-
-         # Extract action items and decisions
          action_items = []
          decisions = []

@@ -95,8 +351,7 @@ class TextProcessor:
                  action_items.append({
                      'text': segment['text'],
                      'speaker': segment['speaker'],
-                     'timestamp': f"{segment['start']:.1f}s",
-                     'entities': self._extract_entities(segment['text'])
                  })

              # Check for decisions
@@ -113,60 +368,81 @@ class TextProcessor:
              'decisions': decisions
          }

-     def _create_chunks(self, text, max_length=3000):
-         """
-         Create overlapping chunks for long documents
-         """
-         sentences = nltk.sent_tokenize(text)
          chunks = []
          current_chunk = []
          current_length = 0

-         for sentence in sentences:
-             sentence_length = len(sentence)

-             if current_length + sentence_length > max_length and current_chunk:
                  chunks.append(' '.join(current_chunk))
-                 # Keep last 2 sentences for overlap
-                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
-                 current_length = sum(len(s) for s in current_chunk)
-
-             current_chunk.append(sentence)
-             current_length += sentence_length

          if current_chunk:
              chunks.append(' '.join(current_chunk))

          return chunks

-     def _hierarchical_summarization(self, chunks, ratio):
-         """
-         Two-level summarization for long documents
-         """
-         # Level 1: Summarize each chunk
-         chunk_summaries = []
-         for chunk in chunks:
-             summary = self.summarizer(
-                 chunk,
-                 ratio=0.4,  # Higher ratio for first level
-                 num_sentences=4
-             )
-             chunk_summaries.append(summary)

-         # Level 2: Summarize the summaries
-         combined_summary = ' '.join(chunk_summaries)
-         final_summary = self.summarizer(
-             combined_summary,
-             ratio=ratio,
-             num_sentences=6
-         )

-         return final_summary

      def _is_action_item(self, text):
-         """
-         Detect if text contains action item
-         """
          text_lower = text.lower()

          # Check patterns
@@ -184,43 +460,11 @@ class TextProcessor:
          return first_word in imperative_verbs

      def _is_decision(self, text):
-         """
-         Detect if text contains decision
-         """
          text_lower = text.lower()

          for pattern in self.decision_patterns:
              if re.search(pattern, text_lower):
                  return True

-         return False
-
-     def _extract_entities(self, text):
-         """
-         Extract named entities (person, date, etc)
-         """
-         entities = self.ner_pipeline(text)
-
-         return {
-             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
-             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
-             'dates': self._extract_dates(text)
-         }
-
-     def _extract_dates(self, text):
-         """
-         Extract date mentions
-         """
-         date_patterns = [
-             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
-             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
-             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
-             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
-         ]
-
-         dates = []
-         for pattern in date_patterns:
-             matches = re.findall(pattern, text.lower())
-             dates.extend(matches)
-
-         return dates
 
+ # from transformers import (
+ #     AutoTokenizer,
+ #     AutoModelForSeq2SeqLM,
+ #     AutoModelForTokenClassification,
+ #     pipeline
+ # )
+ # from keybert import KeyBERT
+ # from summarizer import Summarizer
+ # import re
+ # import nltk
+ # nltk.download('punkt')
+
+ # class TextProcessor:
+ #     def __init__(self):
+ #         # Initialize summarization model
+ #         self.summarizer = Summarizer('bert-base-multilingual-cased')
+
+ #         # Initialize KeyBERT for keyword extraction
+ #         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+
+ #         # Initialize NER for action item detection
+ #         self.ner_pipeline = pipeline(
+ #             "ner",
+ #             model="cahya/bert-base-indonesian-NER",
+ #             aggregation_strategy="simple"
+ #         )
+
+ #         # Action item patterns
+ #         self.action_patterns = [
+ #             r"akan\s+(\w+)",
+ #             r"harus\s+(\w+)",
+ #             r"perlu\s+(\w+)",
+ #             r"mohon\s+(\w+)",
+ #             r"tolong\s+(\w+)",
+ #             r"segera\s+(\w+)",
+ #             r"follow\s*up",
+ #             r"action\s*item",
+ #             r"to\s*do",
+ #             r"deadline"
+ #         ]
+
+ #         # Decision patterns
+ #         self.decision_patterns = [
+ #             r"(diputuskan|memutuskan)\s+(.+)",
+ #             r"(disepakati|menyepakati)\s+(.+)",
+ #             r"(setuju|persetujuan)\s+(.+)",
+ #             r"keputusan(?:nya)?\s+(.+)",
+ #             r"final(?:isasi)?\s+(.+)"
+ #         ]
+
+ #     def summarize_transcript(self, transcript_segments, ratio=0.3):
+ #         """
+ #         Hierarchical summarization for long transcripts
+ #         """
+ #         # Combine text from all segments
+ #         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+ #         # Chunking for long documents
+ #         chunks = self._create_chunks(full_text)
+
+ #         if len(chunks) == 1:
+ #             # Direct summarization for short documents
+ #             return self.summarizer(
+ #                 chunks[0],
+ #                 ratio=ratio,
+ #                 num_sentences=5
+ #             )
+ #         else:
+ #             # Hierarchical summarization
+ #             return self._hierarchical_summarization(chunks, ratio)
+
+ #     def extract_key_information(self, transcript_segments):
+ #         """
+ #         Extract action items, decisions, and key topics
+ #         """
+ #         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+ #         # Extract keywords/topics
+ #         keywords = self.kw_model.extract_keywords(
+ #             full_text,
+ #             keyphrase_ngram_range=(1, 3),
+ #             stop_words='indonesian',
+ #             top_n=10,
+ #             use_mmr=True,
+ #             diversity=0.5
+ #         )
+
+ #         # Extract action items and decisions
+ #         action_items = []
+ #         decisions = []
+
+ #         for segment in transcript_segments:
+ #             # Check for action items
+ #             if self._is_action_item(segment['text']):
+ #                 action_items.append({
+ #                     'text': segment['text'],
+ #                     'speaker': segment['speaker'],
+ #                     'timestamp': f"{segment['start']:.1f}s",
+ #                     'entities': self._extract_entities(segment['text'])
+ #                 })
+
+ #             # Check for decisions
+ #             if self._is_decision(segment['text']):
+ #                 decisions.append({
+ #                     'text': segment['text'],
+ #                     'speaker': segment['speaker'],
+ #                     'timestamp': f"{segment['start']:.1f}s"
+ #                 })
+
+ #         return {
+ #             'keywords': keywords,
+ #             'action_items': action_items,
+ #             'decisions': decisions
+ #         }
+
+ #     def _create_chunks(self, text, max_length=3000):
+ #         """
+ #         Create overlapping chunks for long documents
+ #         """
+ #         sentences = nltk.sent_tokenize(text)
+ #         chunks = []
+ #         current_chunk = []
+ #         current_length = 0
+
+ #         for sentence in sentences:
+ #             sentence_length = len(sentence)
+
+ #             if current_length + sentence_length > max_length and current_chunk:
+ #                 chunks.append(' '.join(current_chunk))
+ #                 # Keep last 2 sentences for overlap
+ #                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
+ #                 current_length = sum(len(s) for s in current_chunk)
+
+ #             current_chunk.append(sentence)
+ #             current_length += sentence_length
+
+ #         if current_chunk:
+ #             chunks.append(' '.join(current_chunk))
+
+ #         return chunks
+
+ #     def _hierarchical_summarization(self, chunks, ratio):
+ #         """
+ #         Two-level summarization for long documents
+ #         """
+ #         # Level 1: Summarize each chunk
+ #         chunk_summaries = []
+ #         for chunk in chunks:
+ #             summary = self.summarizer(
+ #                 chunk,
+ #                 ratio=0.4,  # Higher ratio for first level
+ #                 num_sentences=4
+ #             )
+ #             chunk_summaries.append(summary)
+
+ #         # Level 2: Summarize the summaries
+ #         combined_summary = ' '.join(chunk_summaries)
+ #         final_summary = self.summarizer(
+ #             combined_summary,
+ #             ratio=ratio,
+ #             num_sentences=6
+ #         )
+
+ #         return final_summary
+
+ #     def _is_action_item(self, text):
+ #         """
+ #         Detect if text contains action item
+ #         """
+ #         text_lower = text.lower()
+
+ #         # Check patterns
+ #         for pattern in self.action_patterns:
+ #             if re.search(pattern, text_lower):
+ #                 return True
+
+ #         # Check for imperative sentences
+ #         first_word = text.split()[0].lower() if text.split() else ""
+ #         imperative_verbs = [
+ #             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
+ #             'follow', 'prepare', 'send', 'contact', 'create'
+ #         ]
+
+ #         return first_word in imperative_verbs
+
+ #     def _is_decision(self, text):
+ #         """
+ #         Detect if text contains decision
+ #         """
+ #         text_lower = text.lower()
+
+ #         for pattern in self.decision_patterns:
+ #             if re.search(pattern, text_lower):
+ #                 return True
+
+ #         return False
+
+ #     def _extract_entities(self, text):
+ #         """
+ #         Extract named entities (person, date, etc)
+ #         """
+ #         entities = self.ner_pipeline(text)
+
+ #         return {
+ #             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
+ #             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
+ #             'dates': self._extract_dates(text)
+ #         }
+
+ #     def _extract_dates(self, text):
+ #         """
+ #         Extract date mentions
+ #         """
+ #         date_patterns = [
+ #             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
+ #             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
+ #             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
+ #             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
+ #         ]
+
+ #         dates = []
+ #         for pattern in date_patterns:
+ #             matches = re.findall(pattern, text.lower())
+ #             dates.extend(matches)
+
+ #         return dates
+
+
+
  from transformers import (
      AutoTokenizer,
      AutoModelForSeq2SeqLM,
      pipeline
  )
  from keybert import KeyBERT
  import re
  import nltk
+ from typing import List, Dict

  class TextProcessor:
      def __init__(self):
+         print("Initializing Text Processor...")

+         # Use transformers pipeline for summarization instead
+         try:
+             self.summarizer = pipeline(
+                 "summarization",
+                 model="sshleifer/distilbart-cnn-12-6",
+                 device=-1  # CPU
+             )
+         except:
+             # Fallback to simple extractive summarization
+             self.summarizer = None
+             print("Warning: Summarization model not loaded, using fallback")

+         # Initialize KeyBERT for keyword extraction
+         try:
+             self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+         except:
+             self.kw_model = None
+             print("Warning: KeyBERT not loaded")

+         # Action item patterns
          self.action_patterns = [
+             r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
+             r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
+             r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
          ]

          # Decision patterns

              r"keputusan(?:nya)?\s+(.+)",
              r"final(?:isasi)?\s+(.+)"
          ]
+
+         print("Text Processor ready!")

      def summarize_transcript(self, transcript_segments, ratio=0.3):
+         """Summarization with fallback methods"""
+         # Combine text from all segments
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

+         if not full_text.strip():
+             return "No content to summarize."

+         # Try using the summarization pipeline
+         if self.summarizer:
+             try:
+                 # Split into chunks if too long
+                 max_chunk_length = 1024
+                 if len(full_text) > max_chunk_length:
+                     chunks = self._split_into_chunks(full_text, max_chunk_length)
+                     summaries = []
+
+                     for chunk in chunks[:3]:  # Limit to first 3 chunks
+                         summary = self.summarizer(
+                             chunk,
+                             max_length=130,
+                             min_length=30,
+                             do_sample=False
+                         )[0]['summary_text']
+                         summaries.append(summary)
+
+                     return ' '.join(summaries)
+                 else:
+                     return self.summarizer(
+                         full_text,
+                         max_length=150,
+                         min_length=30,
+                         do_sample=False
+                     )[0]['summary_text']
+             except:
+                 pass
+
+         # Fallback: Simple extractive summarization
+         return self._simple_extractive_summary(full_text, ratio)

      def extract_key_information(self, transcript_segments):
+         """Extract action items, decisions, and key topics"""
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

          # Extract keywords/topics
+         keywords = []
+         if self.kw_model:
+             try:
+                 keywords = self.kw_model.extract_keywords(
+                     full_text,
+                     keyphrase_ngram_range=(1, 3),
+                     stop_words=None,
+                     top_n=10,
+                     use_mmr=True,
+                     diversity=0.5
+                 )
+             except:
+                 pass
+
+         # If KeyBERT fails, use simple frequency-based extraction
+         if not keywords:
+             keywords = self._extract_keywords_simple(full_text)
+
+         # Extract action items and decisions
          action_items = []
          decisions = []

                  action_items.append({
                      'text': segment['text'],
                      'speaker': segment['speaker'],
+                     'timestamp': f"{segment['start']:.1f}s"
                  })

              # Check for decisions

              'decisions': decisions
          }

+     def _split_into_chunks(self, text, max_length):
+         """Split text into chunks"""
+         words = text.split()
          chunks = []
          current_chunk = []
          current_length = 0

+         for word in words:
+             current_chunk.append(word)
+             current_length += len(word) + 1

+             if current_length >= max_length:
                  chunks.append(' '.join(current_chunk))
+                 current_chunk = []
+                 current_length = 0

          if current_chunk:
              chunks.append(' '.join(current_chunk))

          return chunks

+     def _simple_extractive_summary(self, text, ratio=0.3):
+         """Simple extractive summarization fallback"""
+         sentences = nltk.sent_tokenize(text)

+         if len(sentences) <= 3:
+             return text

+         # Calculate number of sentences to include
+         num_sentences = max(3, int(len(sentences) * ratio))
+
+         # Simple scoring: prefer sentences with more content words
+         scored_sentences = []
+         for i, sent in enumerate(sentences):
+             # Score based on length and position
+             score = len(sent.split())
+             if i < 3:  # Boost first sentences
+                 score *= 1.5
+             if i >= len(sentences) - 2:  # Boost last sentences
+                 score *= 1.2
+             scored_sentences.append((score, sent))
+
+         # Sort by score and select top sentences
+         scored_sentences.sort(reverse=True)
+         selected = [sent for _, sent in scored_sentences[:num_sentences]]
+
+         # Return in original order
+         return ' '.join([s for s in sentences if s in selected])
+
+     def _extract_keywords_simple(self, text):
+         """Simple keyword extraction fallback"""
+         # Remove common words
+         stopwords = {
+             'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
+             'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
+             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+             'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
+         }
+
+         # Count word frequency
+         words = re.findall(r'\b\w+\b', text.lower())
+         word_freq = {}
+
+         for word in words:
+             if len(word) > 3 and word not in stopwords:
+                 word_freq[word] = word_freq.get(word, 0) + 1
+
+         # Get top keywords
+         keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
+
+         # Format like KeyBERT output
+         return [(word, freq/len(words)) for word, freq in keywords]

      def _is_action_item(self, text):
+         """Detect if text contains action item"""
          text_lower = text.lower()

          # Check patterns

          return first_word in imperative_verbs

      def _is_decision(self, text):
+         """Detect if text contains decision"""
          text_lower = text.lower()

          for pattern in self.decision_patterns:
              if re.search(pattern, text_lower):
                  return True

+         return False
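
For reference, a minimal usage sketch of the updated class (not part of the commit). It assumes transcript segments shaped as dicts with 'text', 'speaker', and 'start' keys, as consumed by summarize_transcript and extract_key_information above; the import path and sample data below are hypothetical.

    # Hypothetical example: exercise the fixed TextProcessor and its fallbacks
    from utils.text_processor import TextProcessor  # assumed import path

    segments = [
        {'text': 'Kita akan kirim laporan besok.', 'speaker': 'SPEAKER_00', 'start': 12.3},
        {'text': 'Diputuskan anggaran proyek disetujui.', 'speaker': 'SPEAKER_01', 'start': 45.8},
    ]

    processor = TextProcessor()  # prints init/ready messages; sets models to None if loading fails
    print(processor.summarize_transcript(segments, ratio=0.3))

    info = processor.extract_key_information(segments)
    print(info['keywords'])      # KeyBERT pairs, or frequency-based fallback pairs
    print(info['action_items'])  # e.g. the "akan kirim" segment, with speaker and timestamp
    print(info['decisions'])     # e.g. the "Diputuskan" segment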