Fix text processor
utils/text_processor.py (CHANGED: +371, -127)
@@ -1,42 +1,270 @@
 from transformers import (
     AutoTokenizer,
     AutoModelForSeq2SeqLM,
-    AutoModelForTokenClassification,
     pipeline
 )
 from keybert import KeyBERT
-from summarizer import Summarizer
 import re
 import nltk
-nltk.download('punkt')

 class TextProcessor:
     def __init__(self):
-        # Initialize summarization model
-        self.summarizer = Summarizer('bert-base-multilingual-cased')
-
-        # Initialize KeyBERT for keyword extraction
-        self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
-
-        # Initialize NER for action item detection
-        self.ner_pipeline = pipeline(
-            "ner",
-            model="cahya/bert-base-indonesian-NER",
-            aggregation_strategy="simple"
-        )
-
-        # Action item patterns
         self.action_patterns = [
-            r"akan\s+(\w+)",
-            r"harus\s+(\w+)",
-            r"perlu\s+(\w+)",
-            r"mohon\s+(\w+)",
-            r"tolong\s+(\w+)",
-            r"segera\s+(\w+)",
-            r"follow\s*up",
-            r"action\s*item",
-            r"to\s*do",
-            r"deadline"
         ]

         # Decision patterns
@@ -47,45 +275,73 @@ class TextProcessor:
         r"keputusan(?:nya)?\s+(.+)",
         r"final(?:isasi)?\s+(.+)"
     ]

     def summarize_transcript(self, transcript_segments, ratio=0.3):
-        """
-        Hierarchical summarization untuk transcript panjang
-        """
-        # Gabungkan text dari semua segments
         full_text = ' '.join([seg['text'] for seg in transcript_segments])

-        # Chunking untuk dokumen panjang
-        chunks = self._create_chunks(full_text)
-
-        if len(chunks) == 1:
-            # Direct summarization untuk dokumen pendek
-            return self.summarizer(
-                chunks[0],
-                ratio=ratio,
-                num_sentences=5
-            )
-        else:
-            # Hierarchical summarization
-            return self._hierarchical_summarization(chunks, ratio)

     def extract_key_information(self, transcript_segments):
-        """
-        Extract action items, decisions, dan key topics
-        """
         full_text = ' '.join([seg['text'] for seg in transcript_segments])

         # Extract keywords/topics
-        keywords = self.kw_model.extract_keywords(
-            full_text,
-            keyphrase_ngram_range=(1, 3),
-            stop_words='indonesian',
-            top_n=10,
-            use_mmr=True,
-            diversity=0.5
-        )
-
-        # Extract action items dan decisions
         action_items = []
         decisions = []

@@ -95,8 +351,7 @@
             action_items.append({
                 'text': segment['text'],
                 'speaker': segment['speaker'],
-                'timestamp': f"{segment['start']:.1f}s",
-                'entities': self._extract_entities(segment['text'])
             })

         # Check for decisions
@@ -113,60 +368,81 @@
             'decisions': decisions
         }

-    def _create_chunks(self, text, max_length=3000):
-        """
-        Create overlapping chunks for long documents
-        """
-        sentences = nltk.sent_tokenize(text)
         chunks = []
         current_chunk = []
         current_length = 0

-        for sentence in sentences:
-            sentence_length = len(sentence)
-
-            if current_length + sentence_length > max_length and current_chunk:
                 chunks.append(' '.join(current_chunk))
-                # Keep last 2 sentences for overlap
-                current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
-                current_length = sum(len(s) for s in current_chunk)
-
-            current_chunk.append(sentence)
-            current_length += sentence_length

         if current_chunk:
             chunks.append(' '.join(current_chunk))

         return chunks

-    def _hierarchical_summarization(self, chunks, ratio):
-        """
-        Two-level summarization for long documents
-        """
-        # Level 1: Summarize each chunk
-        chunk_summaries = []
-        for chunk in chunks:
-            summary = self.summarizer(
-                chunk,
-                ratio=0.4,  # Higher ratio for first level
-                num_sentences=4
-            )
-            chunk_summaries.append(summary)
-
-        # Level 2: Summarize the summaries
-        combined_summary = ' '.join(chunk_summaries)
-        final_summary = self.summarizer(
-            combined_summary,
-            ratio=ratio,
-            num_sentences=6
-        )
-
-        return final_summary

     def _is_action_item(self, text):
-        """
-        Detect if text contains action item
-        """
         text_lower = text.lower()

         # Check patterns
@@ -184,43 +460,11 @@ class TextProcessor:
         return first_word in imperative_verbs

     def _is_decision(self, text):
-        """
-        Detect if text contains decision
-        """
         text_lower = text.lower()

         for pattern in self.decision_patterns:
             if re.search(pattern, text_lower):
                 return True

-        return False
-
-    def _extract_entities(self, text):
-        """
-        Extract named entities (person, date, etc)
-        """
-        entities = self.ner_pipeline(text)
-
-        return {
-            'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
-            'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
-            'dates': self._extract_dates(text)
-        }
-
-    def _extract_dates(self, text):
-        """
-        Extract date mentions
-        """
-        date_patterns = [
-            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
-            r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
-            r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
-            r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
-        ]
-
-        dates = []
-        for pattern in date_patterns:
-            matches = re.findall(pattern, text.lower())
-            dates.extend(matches)
-
-        return dates
+# from transformers import (
+#     AutoTokenizer,
+#     AutoModelForSeq2SeqLM,
+#     AutoModelForTokenClassification,
+#     pipeline
+# )
+# from keybert import KeyBERT
+# from summarizer import Summarizer
+# import re
+# import nltk
+# nltk.download('punkt')
+
+# class TextProcessor:
+#     def __init__(self):
+#         # Initialize summarization model
+#         self.summarizer = Summarizer('bert-base-multilingual-cased')
+
+#         # Initialize KeyBERT for keyword extraction
+#         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+
+#         # Initialize NER for action item detection
+#         self.ner_pipeline = pipeline(
+#             "ner",
+#             model="cahya/bert-base-indonesian-NER",
+#             aggregation_strategy="simple"
+#         )
+
+#         # Action item patterns
+#         self.action_patterns = [
+#             r"akan\s+(\w+)",
+#             r"harus\s+(\w+)",
+#             r"perlu\s+(\w+)",
+#             r"mohon\s+(\w+)",
+#             r"tolong\s+(\w+)",
+#             r"segera\s+(\w+)",
+#             r"follow\s*up",
+#             r"action\s*item",
+#             r"to\s*do",
+#             r"deadline"
+#         ]
+
+#         # Decision patterns
+#         self.decision_patterns = [
+#             r"(diputuskan|memutuskan)\s+(.+)",
+#             r"(disepakati|menyepakati)\s+(.+)",
+#             r"(setuju|persetujuan)\s+(.+)",
+#             r"keputusan(?:nya)?\s+(.+)",
+#             r"final(?:isasi)?\s+(.+)"
+#         ]
+
+#     def summarize_transcript(self, transcript_segments, ratio=0.3):
+#         """
+#         Hierarchical summarization untuk transcript panjang
+#         """
+#         # Gabungkan text dari semua segments
+#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+#         # Chunking untuk dokumen panjang
+#         chunks = self._create_chunks(full_text)
+
+#         if len(chunks) == 1:
+#             # Direct summarization untuk dokumen pendek
+#             return self.summarizer(
+#                 chunks[0],
+#                 ratio=ratio,
+#                 num_sentences=5
+#             )
+#         else:
+#             # Hierarchical summarization
+#             return self._hierarchical_summarization(chunks, ratio)
+
+#     def extract_key_information(self, transcript_segments):
+#         """
+#         Extract action items, decisions, dan key topics
+#         """
+#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+#         # Extract keywords/topics
+#         keywords = self.kw_model.extract_keywords(
+#             full_text,
+#             keyphrase_ngram_range=(1, 3),
+#             stop_words='indonesian',
+#             top_n=10,
+#             use_mmr=True,
+#             diversity=0.5
+#         )
+
+#         # Extract action items dan decisions
+#         action_items = []
+#         decisions = []
+
+#         for segment in transcript_segments:
+#             # Check for action items
+#             if self._is_action_item(segment['text']):
+#                 action_items.append({
+#                     'text': segment['text'],
+#                     'speaker': segment['speaker'],
+#                     'timestamp': f"{segment['start']:.1f}s",
+#                     'entities': self._extract_entities(segment['text'])
+#                 })
+
+#             # Check for decisions
+#             if self._is_decision(segment['text']):
+#                 decisions.append({
+#                     'text': segment['text'],
+#                     'speaker': segment['speaker'],
+#                     'timestamp': f"{segment['start']:.1f}s"
+#                 })
+
+#         return {
+#             'keywords': keywords,
+#             'action_items': action_items,
+#             'decisions': decisions
+#         }
+
+#     def _create_chunks(self, text, max_length=3000):
+#         """
+#         Create overlapping chunks for long documents
+#         """
+#         sentences = nltk.sent_tokenize(text)
+#         chunks = []
+#         current_chunk = []
+#         current_length = 0
+
+#         for sentence in sentences:
+#             sentence_length = len(sentence)
+
+#             if current_length + sentence_length > max_length and current_chunk:
+#                 chunks.append(' '.join(current_chunk))
+#                 # Keep last 2 sentences for overlap
+#                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
+#                 current_length = sum(len(s) for s in current_chunk)
+
+#             current_chunk.append(sentence)
+#             current_length += sentence_length
+
+#         if current_chunk:
+#             chunks.append(' '.join(current_chunk))
+
+#         return chunks
+
+#     def _hierarchical_summarization(self, chunks, ratio):
+#         """
+#         Two-level summarization for long documents
+#         """
+#         # Level 1: Summarize each chunk
+#         chunk_summaries = []
+#         for chunk in chunks:
+#             summary = self.summarizer(
+#                 chunk,
+#                 ratio=0.4, # Higher ratio for first level
+#                 num_sentences=4
+#             )
+#             chunk_summaries.append(summary)
+
+#         # Level 2: Summarize the summaries
+#         combined_summary = ' '.join(chunk_summaries)
+#         final_summary = self.summarizer(
+#             combined_summary,
+#             ratio=ratio,
+#             num_sentences=6
+#         )
+
+#         return final_summary
+
+#     def _is_action_item(self, text):
+#         """
+#         Detect if text contains action item
+#         """
+#         text_lower = text.lower()
+
+#         # Check patterns
+#         for pattern in self.action_patterns:
+#             if re.search(pattern, text_lower):
+#                 return True
+
+#         # Check for imperative sentences
+#         first_word = text.split()[0].lower() if text.split() else ""
+#         imperative_verbs = [
+#             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
+#             'follow', 'prepare', 'send', 'contact', 'create'
+#         ]
+
+#         return first_word in imperative_verbs
+
+#     def _is_decision(self, text):
+#         """
+#         Detect if text contains decision
+#         """
+#         text_lower = text.lower()
+
+#         for pattern in self.decision_patterns:
+#             if re.search(pattern, text_lower):
+#                 return True
+
+#         return False
+
+#     def _extract_entities(self, text):
+#         """
+#         Extract named entities (person, date, etc)
+#         """
+#         entities = self.ner_pipeline(text)
+
+#         return {
+#             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
+#             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
+#             'dates': self._extract_dates(text)
+#         }
+
+#     def _extract_dates(self, text):
+#         """
+#         Extract date mentions
+#         """
+#         date_patterns = [
+#             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
+#             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
+#             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
+#             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
+#         ]
+
+#         dates = []
+#         for pattern in date_patterns:
+#             matches = re.findall(pattern, text.lower())
+#             dates.extend(matches)
+
+#         return dates
+
+
 from transformers import (
     AutoTokenizer,
     AutoModelForSeq2SeqLM,
     pipeline
 )
 from keybert import KeyBERT
 import re
 import nltk
+from typing import List, Dict

 class TextProcessor:
     def __init__(self):
+        print("Initializing Text Processor...")

+        # Use transformers pipeline for summarization instead
+        try:
+            self.summarizer = pipeline(
+                "summarization",
+                model="sshleifer/distilbart-cnn-12-6",
+                device=-1  # CPU
+            )
+        except:
+            # Fallback to simple extractive summarization
+            self.summarizer = None
+            print("Warning: Summarization model not loaded, using fallback")

+        # Initialize KeyBERT for keyword extraction
+        try:
+            self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+        except:
+            self.kw_model = None
+            print("Warning: KeyBERT not loaded")

+        # Action item patterns
         self.action_patterns = [
+            r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
+            r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
+            r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
         ]

         # Decision patterns

         r"keputusan(?:nya)?\s+(.+)",
         r"final(?:isasi)?\s+(.+)"
     ]
+
+        print("Text Processor ready!")

     def summarize_transcript(self, transcript_segments, ratio=0.3):
+        """Summarization with fallback methods"""
+        # Combine text from all segments
         full_text = ' '.join([seg['text'] for seg in transcript_segments])

+        if not full_text.strip():
+            return "No content to summarize."

+        # Try using the summarization pipeline
+        if self.summarizer:
+            try:
+                # Split into chunks if too long
+                max_chunk_length = 1024
+                if len(full_text) > max_chunk_length:
+                    chunks = self._split_into_chunks(full_text, max_chunk_length)
+                    summaries = []
+
+                    for chunk in chunks[:3]:  # Limit to first 3 chunks
+                        summary = self.summarizer(
+                            chunk,
+                            max_length=130,
+                            min_length=30,
+                            do_sample=False
+                        )[0]['summary_text']
+                        summaries.append(summary)
+
+                    return ' '.join(summaries)
+                else:
+                    return self.summarizer(
+                        full_text,
+                        max_length=150,
+                        min_length=30,
+                        do_sample=False
+                    )[0]['summary_text']
+            except:
+                pass
+
+        # Fallback: Simple extractive summarization
+        return self._simple_extractive_summary(full_text, ratio)

     def extract_key_information(self, transcript_segments):
+        """Extract action items, decisions, and key topics"""
         full_text = ' '.join([seg['text'] for seg in transcript_segments])

         # Extract keywords/topics
+        keywords = []
+        if self.kw_model:
+            try:
+                keywords = self.kw_model.extract_keywords(
+                    full_text,
+                    keyphrase_ngram_range=(1, 3),
+                    stop_words=None,
+                    top_n=10,
+                    use_mmr=True,
+                    diversity=0.5
+                )
+            except:
+                pass
+
+        # If KeyBERT fails, use simple frequency-based extraction
+        if not keywords:
+            keywords = self._extract_keywords_simple(full_text)
+
+        # Extract action items and decisions
         action_items = []
         decisions = []

             action_items.append({
                 'text': segment['text'],
                 'speaker': segment['speaker'],
+                'timestamp': f"{segment['start']:.1f}s"
             })

         # Check for decisions

             'decisions': decisions
         }

+    def _split_into_chunks(self, text, max_length):
+        """Split text into chunks"""
+        words = text.split()
         chunks = []
         current_chunk = []
         current_length = 0

+        for word in words:
+            current_chunk.append(word)
+            current_length += len(word) + 1

+            if current_length >= max_length:
                 chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_length = 0

         if current_chunk:
             chunks.append(' '.join(current_chunk))

         return chunks

+    def _simple_extractive_summary(self, text, ratio=0.3):
+        """Simple extractive summarization fallback"""
+        sentences = nltk.sent_tokenize(text)

+        if len(sentences) <= 3:
+            return text

+        # Calculate number of sentences to include
+        num_sentences = max(3, int(len(sentences) * ratio))
+
+        # Simple scoring: prefer sentences with more content words
+        scored_sentences = []
+        for i, sent in enumerate(sentences):
+            # Score based on length and position
+            score = len(sent.split())
+            if i < 3:  # Boost first sentences
+                score *= 1.5
+            if i >= len(sentences) - 2:  # Boost last sentences
+                score *= 1.2
+            scored_sentences.append((score, sent))
+
+        # Sort by score and select top sentences
+        scored_sentences.sort(reverse=True)
+        selected = [sent for _, sent in scored_sentences[:num_sentences]]
+
+        # Return in original order
+        return ' '.join([s for s in sentences if s in selected])
+
+    def _extract_keywords_simple(self, text):
+        """Simple keyword extraction fallback"""
+        # Remove common words
+        stopwords = {
+            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
+            'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+            'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
+        }
+
+        # Count word frequency
+        words = re.findall(r'\b\w+\b', text.lower())
+        word_freq = {}
+
+        for word in words:
+            if len(word) > 3 and word not in stopwords:
+                word_freq[word] = word_freq.get(word, 0) + 1
+
+        # Get top keywords
+        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
+
+        # Format like KeyBERT output
+        return [(word, freq/len(words)) for word, freq in keywords]

     def _is_action_item(self, text):
+        """Detect if text contains action item"""
         text_lower = text.lower()

         # Check patterns

         return first_word in imperative_verbs

     def _is_decision(self, text):
+        """Detect if text contains decision"""
         text_lower = text.lower()

         for pattern in self.decision_patterns:
             if re.search(pattern, text_lower):
                 return True

+        return False
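
A minimal usage sketch of the refactored class. The sample segments and the import path utils.text_processor are illustrative assumptions, not part of the commit; transformers, keybert and nltk need to be installed for the non-fallback paths.

from utils.text_processor import TextProcessor  # assumed import path

# Hypothetical diarized transcript segments with the keys the class expects
segments = [
    {'text': 'Kita akan siapkan laporan penjualan minggu depan.', 'speaker': 'SPEAKER_00', 'start': 12.4},
    {'text': 'Diputuskan bahwa anggaran marketing naik sepuluh persen.', 'speaker': 'SPEAKER_01', 'start': 47.9},
]

processor = TextProcessor()
print(processor.summarize_transcript(segments, ratio=0.3))

info = processor.extract_key_information(segments)
print(info['keywords'])      # list of (keyword, score) pairs
print(info['action_items'])  # dicts with 'text', 'speaker', 'timestamp'
print(info['decisions'])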
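
The fallback helpers also run on their own when the model loads fail and self.summarizer / self.kw_model end up as None. A small sketch below; the sample text is invented, and NLTK's punkt data is assumed to be present already, since the new __init__ no longer calls nltk.download('punkt').

text = ("Tim akan menyiapkan draft proposal minggu ini. "
        "Deadline pengumpulan adalah hari jumat. "
        "Mohon kirim feedback sebelum meeting berikutnya. "
        "Keputusannya, proposal difinalisasi bulan depan.")

print(processor._simple_extractive_summary(text, ratio=0.5))  # keeps the top-scored sentences in original order
print(processor._extract_keywords_simple(text))               # frequency-based (word, relative frequency) pairs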