from transformers import pipeline
from keybert import KeyBERT
import re
import nltk
from typing import Dict, List

# Sentence tokenizer data is required by the extractive fallback
nltk.download('punkt', quiet=True)


class TextProcessor:
    def __init__(self):
        print("Initializing Text Processor...")

        # Use the transformers summarization pipeline
        try:
            self.summarizer = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-12-6",
                device=-1  # CPU
            )
        except Exception:
            # Fall back to simple extractive summarization
            self.summarizer = None
            print("Warning: Summarization model not loaded, using fallback")

        # Initialize KeyBERT for keyword extraction
        try:
            self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        except Exception:
            self.kw_model = None
            print("Warning: KeyBERT not loaded")

        # Action item patterns (Indonesian and English cues)
        self.action_patterns = [
            r"akan\s+(\w+)",
            r"harus\s+(\w+)",
            r"perlu\s+(\w+)",
            r"mohon\s+(\w+)",
            r"tolong\s+(\w+)",
            r"segera\s+(\w+)",
            r"follow\s*up",
            r"action\s*item",
            r"to\s*do",
            r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]

        print("Text Processor ready!")

    def summarize_transcript(self, transcript_segments: List[Dict], ratio: float = 0.3) -> str:
        """Summarization with fallback methods."""
        # Combine text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        if not full_text.strip():
            return "No content to summarize."
        # Try the neural summarization pipeline first
        if self.summarizer:
            try:
                # Split into chunks if the text is too long for the model
                max_chunk_length = 1024
                if len(full_text) > max_chunk_length:
                    chunks = self._split_into_chunks(full_text, max_chunk_length)
                    summaries = []
                    for chunk in chunks[:3]:  # Limit to the first 3 chunks
                        summary = self.summarizer(
                            chunk,
                            max_length=130,
                            min_length=30,
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                    return ' '.join(summaries)
                else:
                    return self.summarizer(
                        full_text,
                        max_length=150,
                        min_length=30,
                        do_sample=False
                    )[0]['summary_text']
            except Exception:
                pass

        # Fallback: simple extractive summarization
        return self._simple_extractive_summary(full_text, ratio)

    def extract_key_information(self, transcript_segments: List[Dict]) -> Dict:
        """Extract action items, decisions, and key topics."""
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics
        keywords = []
        if self.kw_model:
            try:
                keywords = self.kw_model.extract_keywords(
                    full_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=None,
                    top_n=10,
                    use_mmr=True,
                    diversity=0.5
                )
            except Exception:
                pass

        # If KeyBERT fails, use simple frequency-based extraction
        if not keywords:
            keywords = self._extract_keywords_simple(full_text)

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _split_into_chunks(self, text: str, max_length: int) -> List[str]:
        """Split text into word-based chunks of roughly max_length characters."""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1
            if current_length >= max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _simple_extractive_summary(self, text: str, ratio: float = 0.3) -> str:
        """Simple extractive summarization fallback."""
        sentences = nltk.sent_tokenize(text)

        if len(sentences) <= 3:
            return text

        # Calculate the number of sentences to include
        num_sentences = max(3, int(len(sentences) * ratio))

        # Simple scoring: prefer longer sentences, boost the opening and closing ones
        scored_sentences = []
        for i, sent in enumerate(sentences):
            score = len(sent.split())
            if i < 3:  # Boost the first sentences
                score *= 1.5
            if i >= len(sentences) - 2:  # Boost the last sentences
                score *= 1.2
            scored_sentences.append((score, sent))

        # Sort by score and select the top sentences
        scored_sentences.sort(reverse=True)
        selected = {sent for _, sent in scored_sentences[:num_sentences]}

        # Return the selected sentences in their original order
        return ' '.join([s for s in sentences if s in selected])

    def _extract_keywords_simple(self, text: str) -> List[tuple]:
        """Simple frequency-based keyword extraction fallback."""
        # Common Indonesian and English stopwords to ignore
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
            'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
        }

        # Count word frequency, ignoring short words and stopwords
        words = re.findall(r'\b\w+\b', text.lower())
        word_freq = {}
        for word in words:
            if len(word) > 3 and word not in stopwords:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Get the top keywords by frequency
        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]

        # Format like KeyBERT output: (keyword, score) pairs
        return [(word, freq / len(words)) for word, freq in keywords]

    def _is_action_item(self, text: str) -> bool:
        """Detect whether the text contains an action item."""
        text_lower = text.lower()

        # Check the action patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]

        return first_word in imperative_verbs

    def _is_decision(self, text: str) -> bool:
        """Detect whether the text contains a decision."""
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False
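

# --- Usage sketch ------------------------------------------------------------
# A minimal, illustrative example of driving TextProcessor end to end. The
# sample segments below are hypothetical placeholder data; real transcript
# segments are assumed to be dicts with 'text', 'speaker', and 'start' keys,
# matching what the methods above consume.
if __name__ == "__main__":
    processor = TextProcessor()

    sample_segments = [
        {'text': 'Kita akan kirim laporan penjualan besok pagi.', 'speaker': 'SPEAKER_00', 'start': 0.0},
        {'text': 'Diputuskan bahwa rapat berikutnya hari senin.', 'speaker': 'SPEAKER_01', 'start': 5.2},
        {'text': 'Tolong siapkan slide untuk presentasi klien.', 'speaker': 'SPEAKER_00', 'start': 11.8},
    ]

    print("Summary:", processor.summarize_transcript(sample_segments))

    info = processor.extract_key_information(sample_segments)
    print("Keywords:", info['keywords'])
    print("Action items:", [item['text'] for item in info['action_items']])
    print("Decisions:", [d['text'] for d in info['decisions']])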