Spaces:

Yermia
/

meeting-minutes-ai

Sleeping

File size: 16,070 Bytes

# from transformers import (
#     AutoTokenizer, 
#     AutoModelForSeq2SeqLM,
#     AutoModelForTokenClassification,
#     pipeline
# )
# from keybert import KeyBERT
# from summarizer import Summarizer
# import re
# import nltk
# nltk.download('punkt')

# class TextProcessor:
#     def __init__(self):
#         # Initialize summarization model
#         self.summarizer = Summarizer('bert-base-multilingual-cased')
        
#         # Initialize KeyBERT for keyword extraction
#         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        
#         # Initialize NER for action item detection
#         self.ner_pipeline = pipeline(
#             "ner",
#             model="cahya/bert-base-indonesian-NER",
#             aggregation_strategy="simple"
#         )
        
#                 # Action item patterns
#         self.action_patterns = [
#             r"akan\s+(\w+)",
#             r"harus\s+(\w+)",
#             r"perlu\s+(\w+)",
#             r"mohon\s+(\w+)",
#             r"tolong\s+(\w+)",
#             r"segera\s+(\w+)",
#             r"follow\s*up",
#             r"action\s*item",
#             r"to\s*do",
#             r"deadline"
#         ]
        
#         # Decision patterns
#         self.decision_patterns = [
#             r"(diputuskan|memutuskan)\s+(.+)",
#             r"(disepakati|menyepakati)\s+(.+)",
#             r"(setuju|persetujuan)\s+(.+)",
#             r"keputusan(?:nya)?\s+(.+)",
#             r"final(?:isasi)?\s+(.+)"
#         ]
    
#     def summarize_transcript(self, transcript_segments, ratio=0.3):
#         """
#         Hierarchical summarization for long transcripts
#         """
#         # Combine the text from all segments
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
        
#         # Chunking for long documents
#         chunks = self._create_chunks(full_text)
        
#         if len(chunks) == 1:
#             # Direct summarization for short documents
#             return self.summarizer(
#                 chunks[0], 
#                 ratio=ratio,
#                 num_sentences=5
#             )
#         else:
#             # Hierarchical summarization
#             return self._hierarchical_summarization(chunks, ratio)
    
#     def extract_key_information(self, transcript_segments):
#         """
#         Extract action items, decisions, and key topics
#         """
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
        
#         # Extract keywords/topics
#         keywords = self.kw_model.extract_keywords(
#             full_text,
#             keyphrase_ngram_range=(1, 3),
#             stop_words='indonesian',
#             top_n=10,
#             use_mmr=True,
#             diversity=0.5
#         )
        
#         # Extract action items and decisions
#         action_items = []
#         decisions = []
        
#         for segment in transcript_segments:
#             # Check for action items
#             if self._is_action_item(segment['text']):
#                 action_items.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s",
#                     'entities': self._extract_entities(segment['text'])
#                 })
            
#             # Check for decisions
#             if self._is_decision(segment['text']):
#                 decisions.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s"
#                 })
        
#         return {
#             'keywords': keywords,
#             'action_items': action_items,
#             'decisions': decisions
#         }
    
#     def _create_chunks(self, text, max_length=3000):
#         """
#         Create overlapping chunks for long documents
#         """
#         sentences = nltk.sent_tokenize(text)
#         chunks = []
#         current_chunk = []
#         current_length = 0
        
#         for sentence in sentences:
#             sentence_length = len(sentence)
            
#             if current_length + sentence_length > max_length and current_chunk:
#                 chunks.append(' '.join(current_chunk))
#                 # Keep last 2 sentences for overlap
#                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
#                 current_length = sum(len(s) for s in current_chunk)
            
#             current_chunk.append(sentence)
#             current_length += sentence_length
        
#         if current_chunk:
#             chunks.append(' '.join(current_chunk))
        
#         return chunks
    
#     def _hierarchical_summarization(self, chunks, ratio):
#         """
#         Two-level summarization for long documents
#         """
#         # Level 1: Summarize each chunk
#         chunk_summaries = []
#         for chunk in chunks:
#             summary = self.summarizer(
#                 chunk,
#                 ratio=0.4,  # Higher ratio for first level
#                 num_sentences=4
#             )
#             chunk_summaries.append(summary)
        
#         # Level 2: Summarize the summaries
#         combined_summary = ' '.join(chunk_summaries)
#         final_summary = self.summarizer(
#             combined_summary,
#             ratio=ratio,
#             num_sentences=6
#         )
        
#         return final_summary
    
#     def _is_action_item(self, text):
#         """
#         Detect if text contains action item
#         """
#         text_lower = text.lower()
        
#         # Check patterns
#         for pattern in self.action_patterns:
#             if re.search(pattern, text_lower):
#                 return True
        
#         # Check for imperative sentences
#         first_word = text.split()[0].lower() if text.split() else ""
#         imperative_verbs = [
#             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
#             'follow', 'prepare', 'send', 'contact', 'create'
#         ]
        
#         return first_word in imperative_verbs
    
#     def _is_decision(self, text):
#         """
#         Detect if text contains decision
#         """
#         text_lower = text.lower()
        
#         for pattern in self.decision_patterns:
#             if re.search(pattern, text_lower):
#                 return True
        
#         return False
    
#     def _extract_entities(self, text):
#         """
#         Extract named entities (person, date, etc)
#         """
#         entities = self.ner_pipeline(text)
        
#         return {
#             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
#             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
#             'dates': self._extract_dates(text)
#         }
    
#     def _extract_dates(self, text):
#         """
#         Extract date mentions
#         """
#         date_patterns = [
#             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
#             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
#             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
#             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
#         ]
        
#         dates = []
#         for pattern in date_patterns:
#             matches = re.findall(pattern, text.lower())
#             dates.extend(matches)
        
#         return dates



from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    pipeline
)
from keybert import KeyBERT
import re
import nltk
from typing import List, Dict

class TextProcessor:
    def __init__(self):
        print("Initializing Text Processor...")
        
        # Use transformers pipeline for summarization instead
        try:
            self.summarizer = pipeline(
                "summarization", 
                model="sshleifer/distilbart-cnn-12-6",
                device=-1  # CPU
            )
        except:
            # Fallback to simple extractive summarization
            self.summarizer = None
            print("Warning: Summarization model not loaded, using fallback")
        
        # Initialize KeyBERT for keyword extraction
        try:
            self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        except:
            self.kw_model = None
            print("Warning: KeyBERT not loaded")
        
        # Action item patterns
        self.action_patterns = [
            r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
            r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
            r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
        ]
        
        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]
        
        print("Text Processor ready!")
    
    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """Summarization with fallback methods"""
        # Combine text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])
        
        if not full_text.strip():
            return "No content to summarize."
        
        # Try using the summarization pipeline
        if self.summarizer:
            try:
                # Split into chunks if too long
                max_chunk_length = 1024
                if len(full_text) > max_chunk_length:
                    chunks = self._split_into_chunks(full_text, max_chunk_length)
                    summaries = []
                    
                    for chunk in chunks[:3]:  # Limit to first 3 chunks
                        summary = self.summarizer(
                            chunk, 
                            max_length=130, 
                            min_length=30, 
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                    
                    return ' '.join(summaries)
                else:
                    return self.summarizer(
                        full_text, 
                        max_length=150, 
                        min_length=30, 
                        do_sample=False
                    )[0]['summary_text']
            except:
                pass
        
        # Fallback: Simple extractive summarization
        return self._simple_extractive_summary(full_text, ratio)
    
    def extract_key_information(self, transcript_segments):
        """Extract action items, decisions, and key topics"""
        full_text = ' '.join([seg['text'] for seg in transcript_segments])
        
        # Extract keywords/topics
        keywords = []
        if self.kw_model:
            try:
                keywords = self.kw_model.extract_keywords(
                    full_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=None,
                    top_n=10,
                    use_mmr=True,
                    diversity=0.5
                )
            except:
                pass
        
        # If KeyBERT fails, use simple frequency-based extraction
        if not keywords:
            keywords = self._extract_keywords_simple(full_text)
        
        # Extract action items and decisions
        action_items = []
        decisions = []
        
        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })
            
            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })
        
        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }
    
    def _split_into_chunks(self, text, max_length):
        """Split text into chunks"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        
        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1
            
            if current_length >= max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
        
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        
        return chunks
    
    def _simple_extractive_summary(self, text, ratio=0.3):
        """Simple extractive summarization fallback"""
        sentences = nltk.sent_tokenize(text)
        
        if len(sentences) <= 3:
            return text
        
        # Calculate number of sentences to include
        num_sentences = max(3, int(len(sentences) * ratio))
        
        # Simple scoring: prefer sentences with more content words
        scored_sentences = []
        for i, sent in enumerate(sentences):
            # Score based on length and position
            score = len(sent.split())
            if i < 3:  # Boost first sentences
                score *= 1.5
            if i >= len(sentences) - 2:  # Boost last sentences
                score *= 1.2
            scored_sentences.append((score, sent))
        
        # Sort by score and select top sentences
        scored_sentences.sort(reverse=True)
        selected = [sent for _, sent in scored_sentences[:num_sentences]]
        
        # Return in original order
        return ' '.join([s for s in sentences if s in selected])
    
    def _extract_keywords_simple(self, text):
        """Simple keyword extraction fallback"""
        # Remove common words
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah', 
            'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 
            'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
        }
        
        # Count word frequency
        words = re.findall(r'\b\w+\b', text.lower())
        word_freq = {}
        
        for word in words:
            if len(word) > 3 and word not in stopwords:
                word_freq[word] = word_freq.get(word, 0) + 1
        
        # Get top keywords
        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
        
        # Format like KeyBERT output
        return [(word, freq/len(words)) for word, freq in keywords]
    
    def _is_action_item(self, text):
        """Detect if text contains action item"""
        text_lower = text.lower()
        
        # Check patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True
        
        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]
        
        return first_word in imperative_verbs
    
    def _is_decision(self, text):
        """Detect if text contains decision"""
        text_lower = text.lower()
        
        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True
        
        return False