Spaces:
Sleeping
Sleeping
First init
Browse files- app.py +158 -0
- models/config.py +37 -0
- requirements.txt +22 -0
- utils/output_generator.py +265 -0
- utils/speech_processor.py +134 -0
- utils/text_processor.py +226 -0
app.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from utils.speech_processor import SpeechProcessor
|
4 |
+
from utils.text_processor import TextProcessor
|
5 |
+
from utils.output_generator import OutputGenerator
|
6 |
+
import tempfile
|
7 |
+
import os
|
8 |
+
|
9 |
+
# Build the three pipeline stages once at module import; process_meeting()
# below reuses these module-level instances on every call.
speech_processor = SpeechProcessor()
text_processor = TextProcessor()
output_generator = OutputGenerator()
|
13 |
+
|
14 |
+
def process_meeting(audio_file, language="id", summary_ratio=0.3):
    """Run the full meeting-minutes pipeline on one audio recording.

    Stages, in order: speech processing, summarization, key-information
    extraction, and output generation. A ``gr.Info`` toast is shown before
    each stage.

    Args:
        audio_file: Path of the audio to process, as supplied by the
            ``gr.Audio(type="filepath")`` input. Falsy when nothing was given.
        language: Language code forwarded to ``speech_processor.process_audio``.
        summary_ratio: Ratio forwarded to ``text_processor.summarize_transcript``.

    Returns:
        A 5-tuple ``(markdown, json, transcript_table, action_items_table,
        decisions_table)`` read from the output generator's result dict.

    Raises:
        gr.Error: If no audio was supplied or any stage fails.
    """
    if not audio_file:
        raise gr.Error("Silakan unggah atau rekam audio terlebih dahulu.")

    try:
        # Step 1: Speech Processing
        gr.Info("🎤 Memproses audio...")
        transcript_with_speakers = speech_processor.process_audio(
            audio_file,
            language=language
        )

        # Step 2: Text Processing & Summarization
        gr.Info("📝 Membuat ringkasan...")
        summary = text_processor.summarize_transcript(
            transcript_with_speakers,
            ratio=summary_ratio
        )

        # Step 3: Information Extraction
        gr.Info("🔍 Mengekstrak informasi penting...")
        extracted_info = text_processor.extract_key_information(
            transcript_with_speakers
        )

        # Step 4: Generate Output
        gr.Info("📄 Membuat notulensi...")
        outputs = output_generator.generate_all_formats(
            transcript_with_speakers,
            summary,
            extracted_info
        )
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        # gr.Error is an exception: it is only shown to the user when raised.
        # Merely constructing it (as before) silently discarded the message.
        raise gr.Error(f"Error: {str(e)}") from e

    return (
        outputs['markdown'],
        outputs['json'],
        outputs['transcript_table'],
        outputs['action_items_table'],
        outputs['decisions_table']
    )
|
58 |
+
|
59 |
+
# Gradio interface: declares the page layout and wires the button to
# process_meeting(). User-facing labels are in Indonesian.
# NOTE(review): the nesting of the Row/Column containers below was inferred
# from statement order — confirm against the intended layout.
with gr.Blocks(title="🤖 AI Meeting Minutes Generator") as demo:
    # Page header and feature list.
    gr.Markdown("""
# 🤖 AI Meeting Minutes Generator

Upload audio rapat Anda dan dapatkan notulensi otomatis dengan:
- 🎯 Identifikasi pembicara
- 📝 Ringkasan otomatis
- ✅ Action items
- 📊 Keputusan penting
""")

    # Inputs: audio source, language, summary ratio, and the trigger button.
    with gr.Row():
        with gr.Column():
            # type="filepath" means process_meeting receives a path string.
            audio_input = gr.Audio(
                label="Upload Audio Rapat",
                type="filepath",
                sources=["upload", "microphone"]
            )

            with gr.Row():
                # Choices are (label, value) pairs; the value is passed on
                # to process_meeting as ``language``.
                language = gr.Dropdown(
                    choices=[
                        ("Indonesia", "id"),
                        ("English", "en")
                    ],
                    value="id",
                    label="Bahasa"
                )

                summary_ratio = gr.Slider(
                    minimum=0.1,
                    maximum=0.5,
                    value=0.3,
                    step=0.05,
                    label="Rasio Ringkasan"
                )

            process_btn = gr.Button("🚀 Proses Audio", variant="primary")

    # Output: Markdown minutes shown as plain text, plus the JSON file.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📄 Notulensi (Markdown)")
            markdown_output = gr.Textbox(
                label="Preview Notulensi",
                lines=20,
                max_lines=30
            )

            json_download = gr.File(
                label="📥 Download JSON"
            )

    # Output: full transcript table.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📊 Transkrip Lengkap")
            transcript_table = gr.Dataframe(
                headers=["Waktu", "Pembicara", "Teks"],
                label="Transkrip dengan Pembicara"
            )

    # Output: action items and decisions in two columns.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### ✅ Action Items")
            action_items_table = gr.Dataframe(
                headers=["Action Item", "Penanggung Jawab", "Timestamp"],
                label="Daftar Action Items"
            )

        with gr.Column():
            gr.Markdown("### 📌 Keputusan")
            decisions_table = gr.Dataframe(
                headers=["Keputusan", "Pembicara", "Timestamp"],
                label="Daftar Keputusan"
            )

    # Process button action: the output list order matches the 5-tuple
    # returned by process_meeting.
    process_btn.click(
        fn=process_meeting,
        inputs=[audio_input, language, summary_ratio],
        outputs=[
            markdown_output,
            json_download,
            transcript_table,
            action_items_table,
            decisions_table
        ]
    )

    # Examples: each row supplies (audio path, language, summary ratio).
    gr.Examples(
        examples=[
            ["examples/meeting_sample_id.wav", "id", 0.3],
            ["examples/meeting_sample_en.wav", "en", 0.25]
        ],
        inputs=[audio_input, language, summary_ratio]
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
models/config.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass
|
3 |
+
|
4 |
+
@dataclass
class ModelConfig:
    """Model identifiers and processing parameters for the pipeline.

    Every field has a default, so ``ModelConfig()`` yields a usable
    configuration. ``hf_token`` is not a dataclass field; it is set in
    ``__post_init__`` from the ``HF_TOKEN`` environment variable.
    """

    # Whisper ASR
    whisper_model: str = "openai/whisper-medium"
    whisper_language: str = "id"

    # Speaker Diarization
    diarization_model: str = "pyannote/speaker-diarization-3.1"
    min_speakers: int = 1
    max_speakers: int = 10

    # Text Processing
    summarization_model: str = "bert-base-multilingual-cased"
    ner_model: str = "cahya/bert-base-indonesian-NER"
    keyword_model: str = "paraphrase-multilingual-MiniLM-L12-v2"

    # Processing Parameters
    chunk_size: int = 3000
    chunk_overlap: int = 200
    summary_ratio: float = 0.3
    max_summary_sentences: int = 6

    # Output; None means "use the default list built in __post_init__".
    output_formats: list = None

    def __post_init__(self):
        """Fill in the default output formats and read the HF token."""
        formats = self.output_formats
        # A fresh list per instance, so instances never share one mutable default.
        self.output_formats = ["markdown", "json", "html"] if formats is None else formats

        # Set HF token from environment (None when the variable is unset).
        self.hf_token = os.environ.get("HF_TOKEN")


# Global config instance
config = ModelConfig()
|
requirements.txt
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Core dependencies
|
2 |
+
gradio==4.19.2
|
3 |
+
transformers==4.37.2
|
4 |
+
torch==2.1.2
|
5 |
+
torchaudio==2.1.2
|
6 |
+
|
7 |
+
# Audio processing
|
8 |
+
pyannote.audio==3.1.1
|
9 |
+
speechbrain==0.5.16
|
10 |
+
librosa==0.10.1
|
11 |
+
pydub==0.25.1
|
12 |
+
|
13 |
+
# NLP
|
14 |
+
keybert==0.8.3
|
15 |
+
bert-extractive-summarizer==0.10.1
|
16 |
+
nltk==3.8.1
|
17 |
+
sentencepiece==0.1.99
|
18 |
+
|
19 |
+
# Utils
|
20 |
+
pandas==2.1.4
|
21 |
+
markdown==3.5.2
|
22 |
+
python-dotenv==1.0.0
|
utils/output_generator.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
from datetime import datetime
|
4 |
+
import tempfile
|
5 |
+
|
6 |
+
class OutputGenerator:
    """Render meeting results as Markdown, a JSON file, and table DataFrames.

    Transcript segments are dicts with ``speaker``, ``start``, ``end`` and
    ``text`` keys. Action items and decisions are dicts with ``text``,
    ``speaker`` and ``timestamp`` keys; action items also carry an
    ``entities`` dict with ``persons`` and ``dates`` lists.
    """

    def __init__(self):
        # Loaded eagerly; no method in this class reads the templates yet.
        self.templates = {
            'markdown': self._load_markdown_template(),
            'html': self._load_html_template()
        }

    def generate_all_formats(self, transcript, summary, extracted_info):
        """Generate every output format for one meeting.

        Args:
            transcript: List of transcript segment dicts.
            summary: Summary text placed in the Markdown and JSON outputs.
            extracted_info: Dict with ``keywords``, ``action_items`` and
                ``decisions`` entries.

        Returns:
            Dict with keys ``markdown`` (str), ``json`` (path of the written
            JSON file), ``transcript_table``, ``action_items_table`` and
            ``decisions_table`` (pandas DataFrames).
        """
        # Sample the clock once so date and time cannot straddle midnight.
        now = datetime.now()
        meeting_data = {
            'date': now.strftime('%d %B %Y'),
            'time': now.strftime('%H:%M'),
            'duration': self._calculate_duration(transcript),
            'participants': self._extract_participants(transcript),
            'summary': summary,
            'keywords': extracted_info['keywords'],
            'action_items': extracted_info['action_items'],
            'decisions': extracted_info['decisions'],
            'transcript': transcript
        }

        return {
            'markdown': self._generate_markdown(meeting_data),
            'json': self._generate_json(meeting_data),
            'transcript_table': self._generate_transcript_table(transcript),
            'action_items_table': self._generate_action_items_table(
                extracted_info['action_items']
            ),
            'decisions_table': self._generate_decisions_table(
                extracted_info['decisions']
            )
        }

    def _generate_markdown(self, data):
        """Return the meeting minutes as a Markdown document string."""
        markdown = f"""# 📋 Notulensi Rapat - {data['date']}

## 📊 Informasi Rapat
- **Tanggal**: {data['date']}
- **Waktu**: {data['time']}
- **Durasi**: {data['duration']}
- **Peserta**: {', '.join(data['participants'])}

## 📝 Ringkasan Eksekutif
{data['summary']}

## 🎯 Topik Utama
{self._format_keywords(data['keywords'])}

## ✅ Action Items
{self._format_action_items_md(data['action_items'])}

## 📌 Keputusan Penting
{self._format_decisions_md(data['decisions'])}

## 💬 Transkrip Lengkap
{self._format_transcript_md(data['transcript'])}

---
*Dokumen ini dihasilkan secara otomatis menggunakan AI Meeting Minutes Generator*
"""
        return markdown

    def _generate_json(self, data):
        """Write the meeting data to a temporary ``.json`` file.

        Returns:
            Path of the written file. It is created with ``delete=False``,
            so removing it is the caller's responsibility.
        """
        json_data = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'version': '1.0'
            },
            'meeting_info': {
                'date': data['date'],
                'duration': data['duration'],
                'participants': data['participants']
            },
            'content': {
                'summary': data['summary'],
                # Only the five highest-ranked keyword strings are exported.
                'keywords': [kw[0] for kw in data['keywords'][:5]],
                'action_items': [
                    {
                        'description': item['text'],
                        'assigned_to': item['speaker'],
                        'timestamp': item['timestamp'],
                        'mentioned_persons': item['entities']['persons'],
                        'mentioned_dates': item['entities']['dates']
                    }
                    for item in data['action_items']
                ],
                'decisions': [
                    {
                        'description': dec['text'],
                        'made_by': dec['speaker'],
                        'timestamp': dec['timestamp']
                    }
                    for dec in data['decisions']
                ]
            },
            'full_transcript': [
                {
                    'speaker': seg['speaker'],
                    'start_time': seg['start'],
                    'end_time': seg['end'],
                    'text': seg['text']
                }
                for seg in data['transcript']
            ]
        }

        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 explicitly rather than with the platform default.
        with tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.json',
            delete=False,
            encoding='utf-8'
        ) as temp_file:
            json.dump(json_data, temp_file, indent=2, ensure_ascii=False)

        return temp_file.name

    def _generate_transcript_table(self, transcript):
        """Return the transcript as a DataFrame (time range, speaker, text)."""
        rows = [
            [
                f"{seg['start']:.1f}s - {seg['end']:.1f}s",
                seg['speaker'],
                seg['text']
            ]
            for seg in transcript
        ]
        return pd.DataFrame(rows, columns=['Waktu', 'Pembicara', 'Teks'])

    def _generate_action_items_table(self, action_items):
        """Return action items as a DataFrame (text, assignees, timestamp)."""
        rows = [
            [
                item['text'],
                ', '.join(self._resolve_assignees(item)),
                item['timestamp']
            ]
            for item in action_items
        ]
        return pd.DataFrame(
            rows,
            columns=['Action Item', 'Penanggung Jawab', 'Timestamp']
        )

    def _generate_decisions_table(self, decisions):
        """Return decisions as a DataFrame (text, speaker, timestamp)."""
        rows = [
            [dec['text'], dec['speaker'], dec['timestamp']]
            for dec in decisions
        ]
        return pd.DataFrame(
            rows,
            columns=['Keputusan', 'Pembicara', 'Timestamp']
        )

    # Helper methods
    @staticmethod
    def _resolve_assignees(item):
        """Return the persons named in an action item, else its speaker."""
        return item['entities']['persons'] or [item['speaker']]

    def _calculate_duration(self, transcript):
        """Format the last segment's end time as ``M:SS`` or ``H:MM:SS``."""
        if not transcript:
            return "0:00"

        total_seconds = transcript[-1]['end']
        hours = int(total_seconds // 3600)
        minutes = int((total_seconds % 3600) // 60)
        seconds = int(total_seconds % 60)

        if hours > 0:
            return f"{hours}:{minutes:02d}:{seconds:02d}"
        return f"{minutes}:{seconds:02d}"

    def _extract_participants(self, transcript):
        """Return the distinct speaker labels, sorted."""
        return sorted({seg['speaker'] for seg in transcript})

    def _format_keywords(self, keywords):
        """Format the top five ``(keyword, score)`` pairs as a bullet list."""
        if not keywords:
            # Same placeholder style as the action-item and decision sections.
            return "*Tidak ada topik utama yang terdeteksi*"

        return '\n'.join(f"- **{kw[0]}** (score: {kw[1]:.2f})"
                         for kw in keywords[:5])

    def _format_action_items_md(self, action_items):
        """Format action items as a numbered Markdown list."""
        if not action_items:
            return "*Tidak ada action items yang terdeteksi*"

        formatted = []
        for i, item in enumerate(action_items, 1):
            assignees = self._resolve_assignees(item)
            formatted.append(f"{i}. {item['text']}\n   - **Penanggung Jawab**: {', '.join(assignees)}\n   - **Waktu**: {item['timestamp']}")

        return '\n\n'.join(formatted)

    def _format_decisions_md(self, decisions):
        """Format decisions as a numbered Markdown list."""
        if not decisions:
            return "*Tidak ada keputusan yang terdeteksi*"

        formatted = []
        for i, dec in enumerate(decisions, 1):
            formatted.append(f"{i}. {dec['text']}\n   - **Diputuskan oleh**: {dec['speaker']}\n   - **Waktu**: {dec['timestamp']}")

        return '\n\n'.join(formatted)

    def _format_transcript_md(self, transcript):
        """Format the transcript as block quotes grouped by speaker.

        A speaker header is emitted only when the speaker changes.
        """
        formatted = []
        current_speaker = None

        for seg in transcript:
            if seg['speaker'] != current_speaker:
                formatted.append(f"\n**{seg['speaker']}** ({seg['start']:.1f}s):")
                current_speaker = seg['speaker']

            formatted.append(f"> {seg['text']}")

        return '\n'.join(formatted)

    def _load_markdown_template(self):
        """Return the Markdown wrapper template (customizable)."""
        return """# Meeting Minutes Template
{content}
"""

    def _load_html_template(self):
        """Return the HTML wrapper template.

        NOTE(review): the CSS braces are not escaped, so ``str.format`` on
        this template would fail; fill ``{content}`` with ``str.replace``.
        """
        return """<!DOCTYPE html>
<html>
<head>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
h1 { color: #333; }
.metadata { background: #f0f0f0; padding: 15px; border-radius: 5px; }
.action-item { background: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 4px solid #4caf50; }
.decision { background: #e3f2fd; padding: 10px; margin: 10px 0; border-left: 4px solid #2196f3; }
</style>
</head>
<body>
{content}
</body>
</html>"""
|
utils/speech_processor.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import tempfile

import librosa
import numpy as np
import torch
import torchaudio
from pyannote.audio import Pipeline
from pydub import AudioSegment
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    pipeline
)
|
13 |
+
|
14 |
+
class SpeechProcessor:
    """Speaker-attributed transcription: pyannote diarization plus Whisper ASR."""

    def __init__(self):
        """Load the Whisper model and the diarization pipeline.

        Raises:
            RuntimeError: If the diarization pipeline could not be loaded.
        """
        # Load Whisper for ASR
        self.whisper_processor = WhisperProcessor.from_pretrained(
            "openai/whisper-medium"
        )
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-medium"
        )

        # Load speaker diarization; the token is read from HF_TOKEN.
        self.diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN")
        )
        if self.diarization_pipeline is None:
            # NOTE(review): pyannote appears to return None instead of raising
            # when a gated model cannot be fetched — confirm for the pinned
            # version. Failing here beats a "NoneType is not callable" later.
            raise RuntimeError(
                "Could not load pyannote/speaker-diarization-3.1; "
                "check that HF_TOKEN is set and has access to the model."
            )

    def process_audio(self, audio_path, language="id"):
        """Diarize and transcribe an audio file.

        Args:
            audio_path: Path of the audio file; non-WAV input is converted
                to a temporary WAV first.
            language: Language code passed to Whisper's decoder prompt.

        Returns:
            List of dicts with ``start``, ``end`` (seconds, rounded to two
            decimals), ``speaker`` and ``text``; consecutive turns of the
            same speaker less than one second apart are merged.
        """
        # Convert to WAV if needed
        wav_path = self._ensure_wav_format(audio_path)

        try:
            # assumes waveform is (channels, samples) — consistent with the
            # slicing below
            waveform, sample_rate = torchaudio.load(wav_path)

            # Speaker diarization
            diarization = self.diarization_pipeline(wav_path)

            # Process each speaker segment
            transcript_segments = []
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                start_sample = int(turn.start * sample_rate)
                end_sample = int(turn.end * sample_rate)
                segment_waveform = waveform[:, start_sample:end_sample]

                # A turn that maps to zero samples has nothing to transcribe.
                if segment_waveform.shape[-1] == 0:
                    continue

                text = self._transcribe_segment(
                    segment_waveform,
                    sample_rate,
                    language
                )

                transcript_segments.append({
                    "start": round(turn.start, 2),
                    "end": round(turn.end, 2),
                    "speaker": speaker,
                    "text": text
                })
        finally:
            # Remove the temporary WAV we created; never the caller's file.
            if wav_path != audio_path:
                try:
                    os.remove(wav_path)
                except OSError:
                    pass

        return self._merge_consecutive_segments(transcript_segments)

    def _transcribe_segment(self, waveform, sample_rate, language):
        """Transcribe one audio segment with Whisper.

        Args:
            waveform: Audio tensor; a 2-D input is treated as
                (channels, samples) and averaged down to mono.
            sample_rate: Sample rate of ``waveform`` in Hz.
            language: Language code for the decoder prompt.

        Returns:
            The stripped transcription text.
        """
        # squeeze() only collapses a single channel, so stereo input would
        # reach Whisper as a 2-row array; average the channels instead.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)

        # Resample if needed (the processor is called with 16 kHz below)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Prepare input
        input_features = self.whisper_processor(
            waveform.numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features

        # Generate transcription
        forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
            language=language,
            task="transcribe"
        )

        predicted_ids = self.whisper_model.generate(
            input_features,
            forced_decoder_ids=forced_decoder_ids,
            max_length=448
        )

        transcription = self.whisper_processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        return transcription.strip()

    def _ensure_wav_format(self, audio_path):
        """Return a WAV path for ``audio_path``, converting if necessary.

        Paths ending in ``.wav`` (any letter case) are returned unchanged.
        Otherwise the audio is exported to a new temporary WAV file whose
        path is returned; the caller is responsible for deleting it.
        """
        if audio_path.lower().endswith('.wav'):
            return audio_path

        audio = AudioSegment.from_file(audio_path)
        # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
        # returns a name, which another process could claim first.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            wav_path = tmp.name
        audio.export(wav_path, format='wav')
        return wav_path

    def _merge_consecutive_segments(self, segments):
        """Merge adjacent segments of the same speaker.

        Two neighbours are merged when they share a speaker and the gap
        between them is under one second. The input dicts are not modified.

        Returns:
            A new list of segment dicts.
        """
        merged = []

        for current in segments:
            last = merged[-1] if merged else None

            # Merge if same speaker and close in time
            if (last is not None
                    and last['speaker'] == current['speaker']
                    and current['start'] - last['end'] < 1.0):
                last['end'] = current['end']
                last['text'] += ' ' + current['text']
            else:
                # Copy so later merges never write into the caller's dicts.
                merged.append(dict(current))

        return merged
|
utils/text_processor.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
AutoTokenizer,
|
3 |
+
AutoModelForSeq2SeqLM,
|
4 |
+
AutoModelForTokenClassification,
|
5 |
+
pipeline
|
6 |
+
)
|
7 |
+
from keybert import KeyBERT
|
8 |
+
from summarizer import Summarizer
|
9 |
+
import re
|
10 |
+
import nltk
|
11 |
+
nltk.download('punkt')
|
12 |
+
|
13 |
+
class TextProcessor:
    """Summarize transcripts and extract keywords, action items and decisions.

    Transcript segments are dicts with ``text``, ``speaker`` and ``start``
    keys. Pattern matching is done on lower-cased text.
    """

    # Phrases that mark an action item. Each is anchored on a word boundary
    # so that "akan" does not fire inside words such as "digunakan" or "makan".
    ACTION_PATTERNS = (
        r"\bakan\s+(\w+)",
        r"\bharus\s+(\w+)",
        r"\bperlu\s+(\w+)",
        r"\bmohon\s+(\w+)",
        r"\btolong\s+(\w+)",
        r"\bsegera\s+(\w+)",
        r"\bfollow\s*up\b",
        r"\baction\s*item",
        r"\bto\s*do\b",
        r"\bdeadline\b",
    )

    # Phrases that mark a decision, anchored the same way.
    DECISION_PATTERNS = (
        r"\b(diputuskan|memutuskan)\s+(.+)",
        r"\b(disepakati|menyepakati)\s+(.+)",
        r"\b(setuju|persetujuan)\s+(.+)",
        r"\bkeputusan(?:nya)?\s+(.+)",
        r"\bfinal(?:isasi)?\s+(.+)",
    )

    # Leading verbs that make a sentence count as an imperative.
    IMPERATIVE_VERBS = frozenset({
        'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
        'follow', 'prepare', 'send', 'contact', 'create',
    })

    # Date mentions. "minggu" is not reported as a day name when followed by
    # "depan", because the relative pattern already reports "minggu depan".
    DATE_PATTERNS = (
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',
        r'\b(senin|selasa|rabu|kamis|jumat|sabtu|minggu(?!\s+depan))\b',
        r'\b(besok|lusa|minggu\s+depan|bulan\s+depan)\b',
        r'\b(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)\b',
    )

    def __init__(self):
        # Initialize summarization model
        self.summarizer = Summarizer('bert-base-multilingual-cased')

        # Initialize KeyBERT for keyword extraction
        self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

        # Initialize NER for action item detection
        self.ner_pipeline = pipeline(
            "ner",
            model="cahya/bert-base-indonesian-NER",
            aggregation_strategy="simple"
        )

        # Kept as instance lists so callers can still read or extend them.
        self.action_patterns = list(self.ACTION_PATTERNS)
        self.decision_patterns = list(self.DECISION_PATTERNS)

        # Stop-word list for keyword extraction (None disables filtering).
        self.stop_words = self._load_stop_words()

    def _load_stop_words(self):
        """Return NLTK's Indonesian stop words, or ``None`` if unavailable.

        The corpus is downloaded on first use. A list is needed because
        scikit-learn's only built-in stop list name is ``'english'``.
        """
        try:
            from nltk.corpus import stopwords
            try:
                return stopwords.words('indonesian')
            except LookupError:
                nltk.download('stopwords', quiet=True)
                return stopwords.words('indonesian')
        except (LookupError, OSError):
            # Offline or corpus missing: extract keywords without filtering.
            return None

    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """Summarize a transcript, hierarchically when it is long.

        Args:
            transcript_segments: List of segment dicts; their ``text``
                values are joined with spaces.
            ratio: Summary ratio passed to the summarizer.

        Returns:
            The summary text, or ``""`` when the transcript has no text.
        """
        # Join the text of all segments
        full_text = ' '.join(seg['text'] for seg in transcript_segments)
        if not full_text.strip():
            return ""

        # Chunk long documents
        chunks = self._create_chunks(full_text)
        if not chunks:
            return ""

        if len(chunks) == 1:
            # Short document: summarize directly. num_sentences is not passed
            # because it would override ratio in the summarizer.
            return self.summarizer(chunks[0], ratio=ratio)

        # Hierarchical summarization
        return self._hierarchical_summarization(chunks, ratio)

    def extract_key_information(self, transcript_segments):
        """Extract keywords, action items and decisions from a transcript.

        Returns:
            Dict with ``keywords`` (as returned by KeyBERT), ``action_items``
            (dicts with ``text``, ``speaker``, ``timestamp``, ``entities``)
            and ``decisions`` (dicts with ``text``, ``speaker``,
            ``timestamp``).
        """
        full_text = ' '.join(seg['text'] for seg in transcript_segments)

        # Extract keywords/topics
        if full_text.strip():
            keywords = self.kw_model.extract_keywords(
                full_text,
                keyphrase_ngram_range=(1, 3),
                stop_words=getattr(self, 'stop_words', None),
                top_n=10,
                use_mmr=True,
                diversity=0.5
            )
        else:
            keywords = []

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s",
                    'entities': self._extract_entities(segment['text'])
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _create_chunks(self, text, max_length=3000):
        """Split text into overlapping chunks along sentence boundaries.

        A chunk is closed when adding the next sentence would exceed
        ``max_length`` characters. When the closed chunk had more than two
        sentences, its last two are carried into the next chunk.
        """
        sentences = nltk.sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                # Keep last 2 sentences for overlap
                current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
                current_length = sum(len(s) for s in current_chunk)

            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _hierarchical_summarization(self, chunks, ratio):
        """Summarize each chunk, then summarize the joined chunk summaries."""
        # Level 1: summarize each chunk with a higher ratio.
        chunk_summaries = [self.summarizer(chunk, ratio=0.4) for chunk in chunks]

        # Level 2: summarize the summaries with the requested ratio.
        combined_summary = ' '.join(chunk_summaries)
        return self.summarizer(combined_summary, ratio=ratio)

    def _is_action_item(self, text):
        """Return True if the text matches an action pattern or starts with an imperative verb."""
        text_lower = text.lower()

        # Check patterns
        if any(re.search(pattern, text_lower) for pattern in self.action_patterns):
            return True

        # Check for imperative sentences; the regex skips leading punctuation
        # and stops at trailing punctuation, so "Kirim," still yields "kirim".
        first_word = re.match(r"\W*(\w+)", text_lower)
        return bool(first_word) and first_word.group(1) in self.IMPERATIVE_VERBS

    def _is_decision(self, text):
        """Return True if the text matches any decision pattern."""
        text_lower = text.lower()
        return any(re.search(pattern, text_lower) for pattern in self.decision_patterns)

    def _extract_entities(self, text):
        """Extract persons, organizations and date mentions from the text.

        Returns:
            Dict with ``persons`` and ``organizations`` (from the NER
            pipeline's ``PER`` / ``ORG`` groups) and ``dates``.
        """
        entities = self.ner_pipeline(text)

        return {
            'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
            'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
            'dates': self._extract_dates(text)
        }

    def _extract_dates(self, text):
        """Return the date mentions found in the text.

        Matches are lower-cased, listed in pattern order, and each distinct
        mention is reported once.
        """
        text_lower = text.lower()
        dates = []
        for pattern in self.DATE_PATTERNS:
            for match in re.findall(pattern, text_lower):
                if match not in dates:
                    dates.append(match)

        return dates
|