Yermia committed
Commit 5da9a16 · Parent: b162d3c

First init

app.py ADDED
@@ -0,0 +1,158 @@
+ import gradio as gr
+ from utils.speech_processor import SpeechProcessor
+ from utils.text_processor import TextProcessor
+ from utils.output_generator import OutputGenerator
+
+ # Initialize the pipeline stages once at import time
+ speech_processor = SpeechProcessor()
+ text_processor = TextProcessor()
+ output_generator = OutputGenerator()
+
+ def process_meeting(audio_file, language="id", summary_ratio=0.3):
+     """
+     Main pipeline for processing a meeting recording.
+     """
+     try:
+         # Step 1: Speech processing (ASR + speaker diarization)
+         gr.Info("🎤 Memproses audio...")
+         transcript_with_speakers = speech_processor.process_audio(
+             audio_file,
+             language=language
+         )
+
+         # Step 2: Text processing & summarization
+         gr.Info("📝 Membuat ringkasan...")
+         summary = text_processor.summarize_transcript(
+             transcript_with_speakers,
+             ratio=summary_ratio
+         )
+
+         # Step 3: Information extraction
+         gr.Info("🔍 Mengekstrak informasi penting...")
+         extracted_info = text_processor.extract_key_information(
+             transcript_with_speakers
+         )
+
+         # Step 4: Generate output
+         gr.Info("📄 Membuat notulensi...")
+         outputs = output_generator.generate_all_formats(
+             transcript_with_speakers,
+             summary,
+             extracted_info
+         )
+
+         return (
+             outputs['markdown'],
+             outputs['json'],
+             outputs['transcript_table'],
+             outputs['action_items_table'],
+             outputs['decisions_table']
+         )
+
+     except Exception as e:
+         # gr.Error is an exception type: it must be raised, not called,
+         # for the message to surface in the UI
+         raise gr.Error(f"Error: {str(e)}")
+
+ # Gradio Interface
+ with gr.Blocks(title="🤖 AI Meeting Minutes Generator") as demo:
+     gr.Markdown("""
+     # 🤖 AI Meeting Minutes Generator
+
+     Upload audio rapat Anda dan dapatkan notulensi otomatis dengan:
+     - 🎯 Identifikasi pembicara
+     - 📝 Ringkasan otomatis
+     - ✅ Action items
+     - 📊 Keputusan penting
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 label="Upload Audio Rapat",
+                 type="filepath",
+                 sources=["upload", "microphone"]
+             )
+
+             with gr.Row():
+                 language = gr.Dropdown(
+                     choices=[
+                         ("Indonesia", "id"),
+                         ("English", "en")
+                     ],
+                     value="id",
+                     label="Bahasa"
+                 )
+
+                 summary_ratio = gr.Slider(
+                     minimum=0.1,
+                     maximum=0.5,
+                     value=0.3,
+                     step=0.05,
+                     label="Rasio Ringkasan"
+                 )
+
+             process_btn = gr.Button("🚀 Proses Audio", variant="primary")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 📄 Notulensi (Markdown)")
+             markdown_output = gr.Textbox(
+                 label="Preview Notulensi",
+                 lines=20,
+                 max_lines=30
+             )
+
+             json_download = gr.File(
+                 label="📥 Download JSON"
+             )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 📊 Transkrip Lengkap")
+             transcript_table = gr.Dataframe(
+                 headers=["Waktu", "Pembicara", "Teks"],
+                 label="Transkrip dengan Pembicara"
+             )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### ✅ Action Items")
+             action_items_table = gr.Dataframe(
+                 headers=["Action Item", "Penanggung Jawab", "Timestamp"],
+                 label="Daftar Action Items"
+             )
+
+         with gr.Column():
+             gr.Markdown("### 📌 Keputusan")
+             decisions_table = gr.Dataframe(
+                 headers=["Keputusan", "Pembicara", "Timestamp"],
+                 label="Daftar Keputusan"
+             )
+
+     # Process button action
+     process_btn.click(
+         fn=process_meeting,
+         inputs=[audio_input, language, summary_ratio],
+         outputs=[
+             markdown_output,
+             json_download,
+             transcript_table,
+             action_items_table,
+             decisions_table
+         ]
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             ["examples/meeting_sample_id.wav", "id", 0.3],
+             ["examples/meeting_sample_en.wav", "en", 0.25]
+         ],
+         inputs=[audio_input, language, summary_ratio]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
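
The pipeline can also be exercised outside the UI by calling `process_meeting` directly. A minimal smoke-test sketch, assuming the repo root is on `sys.path` and a sample recording exists at the path already referenced by `gr.Examples` (importing `app` loads all models, so this is slow on first run):

```python
# Minimal smoke test for the pipeline, bypassing the Gradio UI.
# The sample path mirrors the one used in gr.Examples above.
from app import process_meeting

md, json_path, transcript_df, actions_df, decisions_df = process_meeting(
    "examples/meeting_sample_id.wav", language="id", summary_ratio=0.3
)
print(md[:300])          # preview of the generated minutes (Markdown)
print(json_path)         # temp-file path served by the gr.File component
print(transcript_df.head())
```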
models/config.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from dataclasses import dataclass
+
+ @dataclass
+ class ModelConfig:
+     # Whisper ASR
+     whisper_model: str = "openai/whisper-medium"
+     whisper_language: str = "id"
+
+     # Speaker diarization
+     diarization_model: str = "pyannote/speaker-diarization-3.1"
+     min_speakers: int = 1
+     max_speakers: int = 10
+
+     # Text processing
+     summarization_model: str = "bert-base-multilingual-cased"
+     ner_model: str = "cahya/bert-base-indonesian-NER"
+     keyword_model: str = "paraphrase-multilingual-MiniLM-L12-v2"
+
+     # Processing parameters
+     chunk_size: int = 3000
+     chunk_overlap: int = 200
+     summary_ratio: float = 0.3
+     max_summary_sentences: int = 6
+
+     # Output
+     output_formats: list = None
+
+     def __post_init__(self):
+         if self.output_formats is None:
+             self.output_formats = ["markdown", "json", "html"]
+
+         # Set HF token from environment
+         self.hf_token = os.environ.get("HF_TOKEN", None)
+
+ # Global config instance
+ config = ModelConfig()
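
Because `ModelConfig` is a plain dataclass, a deployment can override individual fields at construction time instead of editing the file. A sketch under that assumption (the smaller Whisper checkpoint and the values below are illustrative, not part of this commit):

```python
# Hypothetical override for a lighter-weight deployment; the field names
# are the ones defined in ModelConfig, the values are illustrative.
from models.config import ModelConfig

fast_config = ModelConfig(
    whisper_model="openai/whisper-small",
    max_speakers=4,
    summary_ratio=0.2,
)
print(fast_config.output_formats)    # ['markdown', 'json', 'html'] via __post_init__
print(fast_config.hf_token is None)  # True unless HF_TOKEN is set in the env
```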
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # Core dependencies
+ gradio==4.19.2
+ transformers==4.37.2
+ torch==2.1.2
+ torchaudio==2.1.2
+
+ # Audio processing
+ pyannote.audio==3.1.1
+ speechbrain==0.5.16
+ librosa==0.10.1
+ pydub==0.25.1
+
+ # NLP
+ keybert==0.8.3
+ bert-extractive-summarizer==0.10.1
+ nltk==3.8.1
+ sentencepiece==0.1.99
+
+ # Utils
+ pandas==2.1.4
+ markdown==3.5.2
+ python-dotenv==1.0.0
utils/output_generator.py ADDED
@@ -0,0 +1,265 @@
+ import json
+ import pandas as pd
+ from datetime import datetime
+ import tempfile
+
+ class OutputGenerator:
+     def __init__(self):
+         self.templates = {
+             'markdown': self._load_markdown_template(),
+             'html': self._load_html_template()
+         }
+
+     def generate_all_formats(self, transcript, summary, extracted_info):
+         """
+         Generate outputs in every supported format.
+         """
+         # Prepare data
+         meeting_data = {
+             'date': datetime.now().strftime('%d %B %Y'),
+             'time': datetime.now().strftime('%H:%M'),
+             'duration': self._calculate_duration(transcript),
+             'participants': self._extract_participants(transcript),
+             'summary': summary,
+             'keywords': extracted_info['keywords'],
+             'action_items': extracted_info['action_items'],
+             'decisions': extracted_info['decisions'],
+             'transcript': transcript
+         }
+
+         # Generate outputs
+         outputs = {
+             'markdown': self._generate_markdown(meeting_data),
+             'json': self._generate_json(meeting_data),
+             'transcript_table': self._generate_transcript_table(transcript),
+             'action_items_table': self._generate_action_items_table(
+                 extracted_info['action_items']
+             ),
+             'decisions_table': self._generate_decisions_table(
+                 extracted_info['decisions']
+             )
+         }
+
+         return outputs
+
+     def _generate_markdown(self, data):
+         """
+         Generate the meeting minutes in Markdown format.
+         """
+         markdown = f"""# 📋 Notulensi Rapat - {data['date']}
+
+ ## 📊 Informasi Rapat
+ - **Tanggal**: {data['date']}
+ - **Waktu**: {data['time']}
+ - **Durasi**: {data['duration']}
+ - **Peserta**: {', '.join(data['participants'])}
+
+ ## 📝 Ringkasan Eksekutif
+ {data['summary']}
+
+ ## 🎯 Topik Utama
+ {self._format_keywords(data['keywords'])}
+
+ ## ✅ Action Items
+ {self._format_action_items_md(data['action_items'])}
+
+ ## 📌 Keputusan Penting
+ {self._format_decisions_md(data['decisions'])}
+
+ ## 💬 Transkrip Lengkap
+ {self._format_transcript_md(data['transcript'])}
+
+ ---
+ *Dokumen ini dihasilkan secara otomatis menggunakan AI Meeting Minutes Generator*
+ """
+         return markdown
+
+     def _generate_json(self, data):
+         """
+         Generate the JSON output and save it to a file.
+         """
+         json_data = {
+             'metadata': {
+                 'generated_at': datetime.now().isoformat(),
+                 'version': '1.0'
+             },
+             'meeting_info': {
+                 'date': data['date'],
+                 'duration': data['duration'],
+                 'participants': data['participants']
+             },
+             'content': {
+                 'summary': data['summary'],
+                 'keywords': [kw[0] for kw in data['keywords'][:5]],
+                 'action_items': [
+                     {
+                         'description': item['text'],
+                         'assigned_to': item['speaker'],
+                         'timestamp': item['timestamp'],
+                         'mentioned_persons': item['entities']['persons'],
+                         'mentioned_dates': item['entities']['dates']
+                     }
+                     for item in data['action_items']
+                 ],
+                 'decisions': [
+                     {
+                         'description': dec['text'],
+                         'made_by': dec['speaker'],
+                         'timestamp': dec['timestamp']
+                     }
+                     for dec in data['decisions']
+                 ]
+             },
+             'full_transcript': [
+                 {
+                     'speaker': seg['speaker'],
+                     'start_time': seg['start'],
+                     'end_time': seg['end'],
+                     'text': seg['text']
+                 }
+                 for seg in data['transcript']
+             ]
+         }
+
+         # Save to a temporary file (UTF-8 so ensure_ascii=False round-trips)
+         temp_file = tempfile.NamedTemporaryFile(
+             mode='w',
+             suffix='.json',
+             delete=False,
+             encoding='utf-8'
+         )
+         json.dump(json_data, temp_file, indent=2, ensure_ascii=False)
+         temp_file.close()
+
+         return temp_file.name
+
+     def _generate_transcript_table(self, transcript):
+         """
+         Generate the transcript table for the Gradio Dataframe.
+         """
+         data = []
+         for seg in transcript:
+             data.append([
+                 f"{seg['start']:.1f}s - {seg['end']:.1f}s",
+                 seg['speaker'],
+                 seg['text']
+             ])
+
+         return pd.DataFrame(data, columns=['Waktu', 'Pembicara', 'Teks'])
+
+     def _generate_action_items_table(self, action_items):
+         """
+         Generate the action items table.
+         """
+         data = []
+         for item in action_items:
+             # Assign to mentioned persons, falling back to the speaker
+             assignees = item['entities']['persons'] if item['entities']['persons'] else [item['speaker']]
+
+             data.append([
+                 item['text'],
+                 ', '.join(assignees),
+                 item['timestamp']
+             ])
+
+         return pd.DataFrame(
+             data,
+             columns=['Action Item', 'Penanggung Jawab', 'Timestamp']
+         )
+
+     def _generate_decisions_table(self, decisions):
+         """
+         Generate the decisions table.
+         """
+         data = []
+         for dec in decisions:
+             data.append([
+                 dec['text'],
+                 dec['speaker'],
+                 dec['timestamp']
+             ])
+
+         return pd.DataFrame(
+             data,
+             columns=['Keputusan', 'Pembicara', 'Timestamp']
+         )
+
+     # Helper methods
+     def _calculate_duration(self, transcript):
+         if not transcript:
+             return "0:00"
+
+         total_seconds = transcript[-1]['end']
+         hours = int(total_seconds // 3600)
+         minutes = int((total_seconds % 3600) // 60)
+         seconds = int(total_seconds % 60)
+
+         if hours > 0:
+             return f"{hours}:{minutes:02d}:{seconds:02d}"
+         else:
+             return f"{minutes}:{seconds:02d}"
+
+     def _extract_participants(self, transcript):
+         speakers = list(set(seg['speaker'] for seg in transcript))
+         return sorted(speakers)
+
+     def _format_keywords(self, keywords):
+         return '\n'.join([f"- **{kw[0]}** (score: {kw[1]:.2f})"
+                           for kw in keywords[:5]])
+
+     def _format_action_items_md(self, action_items):
+         if not action_items:
+             return "*Tidak ada action items yang terdeteksi*"
+
+         formatted = []
+         for i, item in enumerate(action_items, 1):
+             assignees = item['entities']['persons'] if item['entities']['persons'] else [item['speaker']]
+             formatted.append(
+                 f"{i}. {item['text']}\n"
+                 f"   - **Penanggung Jawab**: {', '.join(assignees)}\n"
+                 f"   - **Waktu**: {item['timestamp']}"
+             )
+
+         return '\n\n'.join(formatted)
+
+     def _format_decisions_md(self, decisions):
+         if not decisions:
+             return "*Tidak ada keputusan yang terdeteksi*"
+
+         formatted = []
+         for i, dec in enumerate(decisions, 1):
+             formatted.append(
+                 f"{i}. {dec['text']}\n"
+                 f"   - **Diputuskan oleh**: {dec['speaker']}\n"
+                 f"   - **Waktu**: {dec['timestamp']}"
+             )
+
+         return '\n\n'.join(formatted)
+
+     def _format_transcript_md(self, transcript):
+         formatted = []
+         current_speaker = None
+
+         for seg in transcript:
+             if seg['speaker'] != current_speaker:
+                 formatted.append(f"\n**{seg['speaker']}** ({seg['start']:.1f}s):")
+                 current_speaker = seg['speaker']
+
+             formatted.append(f"> {seg['text']}")
+
+         return '\n'.join(formatted)
+
+     def _load_markdown_template(self):
+         # The template can be customized
+         return """# Meeting Minutes Template
+ {content}
+ """
+
+     def _load_html_template(self):
+         return """<!DOCTYPE html>
+ <html>
+ <head>
+ <style>
+ body { font-family: Arial, sans-serif; margin: 40px; }
+ h1 { color: #333; }
+ .metadata { background: #f0f0f0; padding: 15px; border-radius: 5px; }
+ .action-item { background: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 4px solid #4caf50; }
+ .decision { background: #e3f2fd; padding: 10px; margin: 10px 0; border-left: 4px solid #2196f3; }
+ </style>
+ </head>
+ <body>
+ {content}
+ </body>
+ </html>"""
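
The segment and extraction schemas `OutputGenerator` expects can be seen by running it on toy data. A sketch that uses only the dict keys the methods above actually read:

```python
# Toy data matching the keys read by generate_all_formats and its helpers.
from utils.output_generator import OutputGenerator

transcript = [
    {"start": 0.0, "end": 4.2, "speaker": "SPEAKER_00",
     "text": "Kita mulai rapat hari ini."},
    {"start": 4.2, "end": 9.8, "speaker": "SPEAKER_01",
     "text": "Tolong kirim laporan anggaran besok."},
]
extracted = {
    "keywords": [("anggaran", 0.71), ("laporan", 0.55)],
    "action_items": [{
        "text": "Tolong kirim laporan anggaran besok.",
        "speaker": "SPEAKER_01",
        "timestamp": "4.2s",
        "entities": {"persons": [], "organizations": [], "dates": ["besok"]},
    }],
    "decisions": [],
}

outputs = OutputGenerator().generate_all_formats(
    transcript, "Rapat membahas laporan anggaran.", extracted
)
print(outputs["markdown"][:200])   # Markdown minutes
print(outputs["json"])             # path to the JSON temp file
```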
utils/speech_processor.py ADDED
@@ -0,0 +1,134 @@
+ import os
+ import torch
+ import torchaudio
+ from transformers import (
+     WhisperProcessor,
+     WhisperForConditionalGeneration
+ )
+ from pyannote.audio import Pipeline
+ from pydub import AudioSegment
+ import tempfile
+
+ class SpeechProcessor:
+     def __init__(self):
+         # Load Whisper for ASR
+         self.whisper_processor = WhisperProcessor.from_pretrained(
+             "openai/whisper-medium"
+         )
+         self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
+             "openai/whisper-medium"
+         )
+
+         # Load speaker diarization (gated model; requires HF_TOKEN)
+         self.diarization_pipeline = Pipeline.from_pretrained(
+             "pyannote/speaker-diarization-3.1",
+             use_auth_token=os.environ.get("HF_TOKEN")
+         )
+
+     def process_audio(self, audio_path, language="id"):
+         """
+         Process an audio file: ASR plus speaker diarization.
+         """
+         # Convert to WAV if needed
+         audio_path = self._ensure_wav_format(audio_path)
+
+         # Load audio
+         waveform, sample_rate = torchaudio.load(audio_path)
+
+         # Speaker diarization
+         diarization = self.diarization_pipeline(audio_path)
+
+         # Transcribe each speaker turn
+         transcript_segments = []
+
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             # Extract segment audio
+             start_sample = int(turn.start * sample_rate)
+             end_sample = int(turn.end * sample_rate)
+             segment_waveform = waveform[:, start_sample:end_sample]
+
+             # ASR on segment
+             text = self._transcribe_segment(
+                 segment_waveform,
+                 sample_rate,
+                 language
+             )
+
+             transcript_segments.append({
+                 "start": round(turn.start, 2),
+                 "end": round(turn.end, 2),
+                 "speaker": speaker,
+                 "text": text
+             })
+
+         return self._merge_consecutive_segments(transcript_segments)
+
+     def _transcribe_segment(self, waveform, sample_rate, language):
+         """
+         Transcribe an audio segment with Whisper.
+         """
+         # Downmix to mono; Whisper expects single-channel input
+         if waveform.shape[0] > 1:
+             waveform = waveform.mean(dim=0, keepdim=True)
+
+         # Resample if needed
+         if sample_rate != 16000:
+             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+             waveform = resampler(waveform)
+
+         # Prepare input
+         input_features = self.whisper_processor(
+             waveform.squeeze().numpy(),
+             sampling_rate=16000,
+             return_tensors="pt"
+         ).input_features
+
+         # Generate transcription
+         forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
+             language=language,
+             task="transcribe"
+         )
+
+         with torch.no_grad():
+             predicted_ids = self.whisper_model.generate(
+                 input_features,
+                 forced_decoder_ids=forced_decoder_ids,
+                 max_length=448
+             )
+
+         transcription = self.whisper_processor.batch_decode(
+             predicted_ids,
+             skip_special_tokens=True
+         )[0]
+
+         return transcription.strip()
+
+     def _ensure_wav_format(self, audio_path):
+         """
+         Convert the audio to WAV format if needed.
+         """
+         if not audio_path.endswith('.wav'):
+             audio = AudioSegment.from_file(audio_path)
+             # NamedTemporaryFile instead of the deprecated, race-prone mktemp
+             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                 wav_path = tmp.name
+             audio.export(wav_path, format='wav')
+             return wav_path
+         return audio_path
+
+     def _merge_consecutive_segments(self, segments):
+         """
+         Merge consecutive segments from the same speaker.
+         """
+         if not segments:
+             return segments
+
+         merged = [segments[0]]
+
+         for current in segments[1:]:
+             last = merged[-1]
+
+             # Merge if same speaker and the gap is under one second
+             if (last['speaker'] == current['speaker'] and
+                     current['start'] - last['end'] < 1.0):
+                 last['end'] = current['end']
+                 last['text'] += ' ' + current['text']
+             else:
+                 merged.append(current)
+
+         return merged
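
Run in isolation, this stage yields speaker-attributed segments. A sketch assuming `HF_TOKEN` is set in the environment (the pyannote pipeline is gated and its terms must be accepted on the Hub) and the sample path from `gr.Examples` exists:

```python
# Standalone ASR + diarization; heavy model downloads happen on first run.
from utils.speech_processor import SpeechProcessor

processor = SpeechProcessor()
segments = processor.process_audio("examples/meeting_sample_id.wav", language="id")
for seg in segments[:5]:
    print(f"[{seg['start']:6.1f}s - {seg['end']:6.1f}s] {seg['speaker']}: {seg['text']}")
```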
utils/text_processor.py ADDED
@@ -0,0 +1,226 @@
+ from transformers import pipeline
+ from keybert import KeyBERT
+ from summarizer import Summarizer
+ import re
+ import nltk
+ nltk.download('punkt', quiet=True)
+
+ class TextProcessor:
+     def __init__(self):
+         # Initialize summarization model
+         self.summarizer = Summarizer('bert-base-multilingual-cased')
+
+         # Initialize KeyBERT for keyword extraction
+         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+
+         # Initialize NER for action item detection
+         self.ner_pipeline = pipeline(
+             "ner",
+             model="cahya/bert-base-indonesian-NER",
+             aggregation_strategy="simple"
+         )
+
+         # Action item patterns
+         self.action_patterns = [
+             r"akan\s+(\w+)",
+             r"harus\s+(\w+)",
+             r"perlu\s+(\w+)",
+             r"mohon\s+(\w+)",
+             r"tolong\s+(\w+)",
+             r"segera\s+(\w+)",
+             r"follow\s*up",
+             r"action\s*item",
+             r"to\s*do",
+             r"deadline"
+         ]
+
+         # Decision patterns
+         self.decision_patterns = [
+             r"(diputuskan|memutuskan)\s+(.+)",
+             r"(disepakati|menyepakati)\s+(.+)",
+             r"(setuju|persetujuan)\s+(.+)",
+             r"keputusan(?:nya)?\s+(.+)",
+             r"final(?:isasi)?\s+(.+)"
+         ]
+
+     def summarize_transcript(self, transcript_segments, ratio=0.3):
+         """
+         Hierarchical summarization for long transcripts.
+         """
+         # Join the text from all segments
+         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+         # Chunk long documents
+         chunks = self._create_chunks(full_text)
+
+         if len(chunks) == 1:
+             # Direct summarization for short documents; num_sentences is
+             # omitted because it would override the requested ratio
+             return self.summarizer(chunks[0], ratio=ratio)
+         else:
+             # Hierarchical summarization
+             return self._hierarchical_summarization(chunks, ratio)
+
+     def extract_key_information(self, transcript_segments):
+         """
+         Extract action items, decisions, and key topics.
+         """
+         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+         # Extract keywords/topics. sklearn's CountVectorizer only ships an
+         # English stop-word list, so Indonesian stop words would have to be
+         # supplied as an explicit list rather than a language name.
+         keywords = self.kw_model.extract_keywords(
+             full_text,
+             keyphrase_ngram_range=(1, 3),
+             stop_words=None,
+             top_n=10,
+             use_mmr=True,
+             diversity=0.5
+         )
+
+         # Extract action items and decisions
+         action_items = []
+         decisions = []
+
+         for segment in transcript_segments:
+             # Check for action items
+             if self._is_action_item(segment['text']):
+                 action_items.append({
+                     'text': segment['text'],
+                     'speaker': segment['speaker'],
+                     'timestamp': f"{segment['start']:.1f}s",
+                     'entities': self._extract_entities(segment['text'])
+                 })
+
+             # Check for decisions
+             if self._is_decision(segment['text']):
+                 decisions.append({
+                     'text': segment['text'],
+                     'speaker': segment['speaker'],
+                     'timestamp': f"{segment['start']:.1f}s"
+                 })
+
+         return {
+             'keywords': keywords,
+             'action_items': action_items,
+             'decisions': decisions
+         }
+
+     def _create_chunks(self, text, max_length=3000):
+         """
+         Create overlapping chunks for long documents.
+         """
+         sentences = nltk.sent_tokenize(text)
+         chunks = []
+         current_chunk = []
+         current_length = 0
+
+         for sentence in sentences:
+             sentence_length = len(sentence)
+
+             if current_length + sentence_length > max_length and current_chunk:
+                 chunks.append(' '.join(current_chunk))
+                 # Keep the last 2 sentences for overlap
+                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
+                 current_length = sum(len(s) for s in current_chunk)
+
+             current_chunk.append(sentence)
+             current_length += sentence_length
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+     def _hierarchical_summarization(self, chunks, ratio):
+         """
+         Two-level summarization for long documents.
+         """
+         # Level 1: fixed-length summary per chunk
+         chunk_summaries = []
+         for chunk in chunks:
+             summary = self.summarizer(chunk, num_sentences=4)
+             chunk_summaries.append(summary)
+
+         # Level 2: summarize the concatenated chunk summaries at the
+         # requested ratio
+         combined_summary = ' '.join(chunk_summaries)
+         final_summary = self.summarizer(combined_summary, ratio=ratio)
+
+         return final_summary
+
+     def _is_action_item(self, text):
+         """
+         Detect whether the text contains an action item.
+         """
+         text_lower = text.lower()
+
+         # Check patterns
+         for pattern in self.action_patterns:
+             if re.search(pattern, text_lower):
+                 return True
+
+         # Check for imperative sentences
+         words = text.split()
+         first_word = words[0].lower() if words else ""
+         imperative_verbs = [
+             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
+             'follow', 'prepare', 'send', 'contact', 'create'
+         ]
+
+         return first_word in imperative_verbs
+
+     def _is_decision(self, text):
+         """
+         Detect whether the text contains a decision.
+         """
+         text_lower = text.lower()
+
+         for pattern in self.decision_patterns:
+             if re.search(pattern, text_lower):
+                 return True
+
+         return False
+
+     def _extract_entities(self, text):
+         """
+         Extract named entities (person, organization, date).
+         """
+         entities = self.ner_pipeline(text)
+
+         return {
+             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
+             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
+             'dates': self._extract_dates(text)
+         }
+
+     def _extract_dates(self, text):
+         """
+         Extract date mentions.
+         """
+         date_patterns = [
+             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
+             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
+             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
+             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
+         ]
+
+         dates = []
+         for pattern in date_patterns:
+             matches = re.findall(pattern, text.lower())
+             dates.extend(matches)
+
+         return dates
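
The rule-based detectors can be sanity-checked without loading any models by applying the same regexes to sample sentences. A sketch with illustrative inputs (a subset of the patterns defined in `__init__` above):

```python
# The same pattern style used by _is_action_item / _is_decision, applied directly.
import re

action_patterns = [r"akan\s+(\w+)", r"tolong\s+(\w+)", r"deadline"]
decision_patterns = [r"(diputuskan|memutuskan)\s+(.+)", r"(disepakati|menyepakati)\s+(.+)"]

samples = [
    "Tolong kirim laporan sebelum deadline Jumat.",   # should match an action pattern
    "Diputuskan bahwa anggaran naik 10 persen.",      # should match a decision pattern
    "Cuaca hari ini cukup cerah.",                    # should match neither
]
for s in samples:
    lower = s.lower()
    is_action = any(re.search(p, lower) for p in action_patterns)
    is_decision = any(re.search(p, lower) for p in decision_patterns)
    print(f"{s!r}: action={is_action}, decision={is_decision}")
```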