Yermia committed on
Commit 7af8df4 · 1 Parent(s): e43a761

Fix text processor

Files changed (1)
  1. utils/text_processor.py +371 -127
utils/text_processor.py CHANGED
@@ -1,42 +1,270 @@
  from transformers import (
      AutoTokenizer,
      AutoModelForSeq2SeqLM,
-     AutoModelForTokenClassification,
      pipeline
  )
  from keybert import KeyBERT
- from summarizer import Summarizer
  import re
  import nltk
- nltk.download('punkt')

  class TextProcessor:
      def __init__(self):
-         # Initialize summarization model
-         self.summarizer = Summarizer('bert-base-multilingual-cased')

-         # Initialize KeyBERT for keyword extraction
-         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

-         # Initialize NER for action item detection
-         self.ner_pipeline = pipeline(
-             "ner",
-             model="cahya/bert-base-indonesian-NER",
-             aggregation_strategy="simple"
-         )

-         # Action item patterns
          self.action_patterns = [
-             r"akan\s+(\w+)",
-             r"harus\s+(\w+)",
-             r"perlu\s+(\w+)",
-             r"mohon\s+(\w+)",
-             r"tolong\s+(\w+)",
-             r"segera\s+(\w+)",
-             r"follow\s*up",
-             r"action\s*item",
-             r"to\s*do",
-             r"deadline"
          ]

          # Decision patterns
@@ -47,45 +275,73 @@ class TextProcessor:
              r"keputusan(?:nya)?\s+(.+)",
              r"final(?:isasi)?\s+(.+)"
          ]

      def summarize_transcript(self, transcript_segments, ratio=0.3):
-         """
-         Hierarchical summarization for long transcripts
-         """
-         # Combine text from all segments
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

-         # Chunking for long documents
-         chunks = self._create_chunks(full_text)

-         if len(chunks) == 1:
-             # Direct summarization for short documents
-             return self.summarizer(
-                 chunks[0],
-                 ratio=ratio,
-                 num_sentences=5
-             )
-         else:
-             # Hierarchical summarization
-             return self._hierarchical_summarization(chunks, ratio)

      def extract_key_information(self, transcript_segments):
-         """
-         Extract action items, decisions, and key topics
-         """
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

          # Extract keywords/topics
-         keywords = self.kw_model.extract_keywords(
-             full_text,
-             keyphrase_ngram_range=(1, 3),
-             stop_words='indonesian',
-             top_n=10,
-             use_mmr=True,
-             diversity=0.5
-         )
-
-         # Extract action items and decisions
          action_items = []
          decisions = []

@@ -95,8 +351,7 @@ class TextProcessor:
                  action_items.append({
                      'text': segment['text'],
                      'speaker': segment['speaker'],
-                     'timestamp': f"{segment['start']:.1f}s",
-                     'entities': self._extract_entities(segment['text'])
                  })

              # Check for decisions
@@ -113,60 +368,81 @@ class TextProcessor:
              'decisions': decisions
          }

-     def _create_chunks(self, text, max_length=3000):
-         """
-         Create overlapping chunks for long documents
-         """
-         sentences = nltk.sent_tokenize(text)
          chunks = []
          current_chunk = []
          current_length = 0

-         for sentence in sentences:
-             sentence_length = len(sentence)

-             if current_length + sentence_length > max_length and current_chunk:
                  chunks.append(' '.join(current_chunk))
-                 # Keep last 2 sentences for overlap
-                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
-                 current_length = sum(len(s) for s in current_chunk)
-
-             current_chunk.append(sentence)
-             current_length += sentence_length

          if current_chunk:
              chunks.append(' '.join(current_chunk))

          return chunks

-     def _hierarchical_summarization(self, chunks, ratio):
-         """
-         Two-level summarization for long documents
-         """
-         # Level 1: Summarize each chunk
-         chunk_summaries = []
-         for chunk in chunks:
-             summary = self.summarizer(
-                 chunk,
-                 ratio=0.4,  # Higher ratio for first level
-                 num_sentences=4
-             )
-             chunk_summaries.append(summary)

-         # Level 2: Summarize the summaries
-         combined_summary = ' '.join(chunk_summaries)
-         final_summary = self.summarizer(
-             combined_summary,
-             ratio=ratio,
-             num_sentences=6
-         )

-         return final_summary

      def _is_action_item(self, text):
-         """
-         Detect if text contains action item
-         """
          text_lower = text.lower()

          # Check patterns
@@ -184,43 +460,11 @@ class TextProcessor:
          return first_word in imperative_verbs

      def _is_decision(self, text):
-         """
-         Detect if text contains decision
-         """
          text_lower = text.lower()

          for pattern in self.decision_patterns:
              if re.search(pattern, text_lower):
                  return True

-         return False
-
-     def _extract_entities(self, text):
-         """
-         Extract named entities (person, date, etc)
-         """
-         entities = self.ner_pipeline(text)
-
-         return {
-             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
-             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
-             'dates': self._extract_dates(text)
-         }
-
-     def _extract_dates(self, text):
-         """
-         Extract date mentions
-         """
-         date_patterns = [
-             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
-             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
-             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
-             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
-         ]
-
-         dates = []
-         for pattern in date_patterns:
-             matches = re.findall(pattern, text.lower())
-             dates.extend(matches)
-
-         return dates
 
+ # from transformers import (
+ #     AutoTokenizer,
+ #     AutoModelForSeq2SeqLM,
+ #     AutoModelForTokenClassification,
+ #     pipeline
+ # )
+ # from keybert import KeyBERT
+ # from summarizer import Summarizer
+ # import re
+ # import nltk
+ # nltk.download('punkt')
+
+ # class TextProcessor:
+ #     def __init__(self):
+ #         # Initialize summarization model
+ #         self.summarizer = Summarizer('bert-base-multilingual-cased')
+
+ #         # Initialize KeyBERT for keyword extraction
+ #         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+
+ #         # Initialize NER for action item detection
+ #         self.ner_pipeline = pipeline(
+ #             "ner",
+ #             model="cahya/bert-base-indonesian-NER",
+ #             aggregation_strategy="simple"
+ #         )
+
+ #         # Action item patterns
+ #         self.action_patterns = [
+ #             r"akan\s+(\w+)",
+ #             r"harus\s+(\w+)",
+ #             r"perlu\s+(\w+)",
+ #             r"mohon\s+(\w+)",
+ #             r"tolong\s+(\w+)",
+ #             r"segera\s+(\w+)",
+ #             r"follow\s*up",
+ #             r"action\s*item",
+ #             r"to\s*do",
+ #             r"deadline"
+ #         ]
+
+ #         # Decision patterns
+ #         self.decision_patterns = [
+ #             r"(diputuskan|memutuskan)\s+(.+)",
+ #             r"(disepakati|menyepakati)\s+(.+)",
+ #             r"(setuju|persetujuan)\s+(.+)",
+ #             r"keputusan(?:nya)?\s+(.+)",
+ #             r"final(?:isasi)?\s+(.+)"
+ #         ]
+
+ #     def summarize_transcript(self, transcript_segments, ratio=0.3):
+ #         """
+ #         Hierarchical summarization for long transcripts
+ #         """
+ #         # Combine text from all segments
+ #         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+ #         # Chunking for long documents
+ #         chunks = self._create_chunks(full_text)
+
+ #         if len(chunks) == 1:
+ #             # Direct summarization for short documents
+ #             return self.summarizer(
+ #                 chunks[0],
+ #                 ratio=ratio,
+ #                 num_sentences=5
+ #             )
+ #         else:
+ #             # Hierarchical summarization
+ #             return self._hierarchical_summarization(chunks, ratio)
+
+ #     def extract_key_information(self, transcript_segments):
+ #         """
+ #         Extract action items, decisions, and key topics
+ #         """
+ #         full_text = ' '.join([seg['text'] for seg in transcript_segments])
+
+ #         # Extract keywords/topics
+ #         keywords = self.kw_model.extract_keywords(
+ #             full_text,
+ #             keyphrase_ngram_range=(1, 3),
+ #             stop_words='indonesian',
+ #             top_n=10,
+ #             use_mmr=True,
+ #             diversity=0.5
+ #         )
+
+ #         # Extract action items and decisions
+ #         action_items = []
+ #         decisions = []
+
+ #         for segment in transcript_segments:
+ #             # Check for action items
+ #             if self._is_action_item(segment['text']):
+ #                 action_items.append({
+ #                     'text': segment['text'],
+ #                     'speaker': segment['speaker'],
+ #                     'timestamp': f"{segment['start']:.1f}s",
+ #                     'entities': self._extract_entities(segment['text'])
+ #                 })
+
+ #             # Check for decisions
+ #             if self._is_decision(segment['text']):
+ #                 decisions.append({
+ #                     'text': segment['text'],
+ #                     'speaker': segment['speaker'],
+ #                     'timestamp': f"{segment['start']:.1f}s"
+ #                 })
+
+ #         return {
+ #             'keywords': keywords,
+ #             'action_items': action_items,
+ #             'decisions': decisions
+ #         }
+
+ #     def _create_chunks(self, text, max_length=3000):
+ #         """
+ #         Create overlapping chunks for long documents
+ #         """
+ #         sentences = nltk.sent_tokenize(text)
+ #         chunks = []
+ #         current_chunk = []
+ #         current_length = 0
+
+ #         for sentence in sentences:
+ #             sentence_length = len(sentence)
+
+ #             if current_length + sentence_length > max_length and current_chunk:
+ #                 chunks.append(' '.join(current_chunk))
+ #                 # Keep last 2 sentences for overlap
+ #                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
+ #                 current_length = sum(len(s) for s in current_chunk)
+
+ #             current_chunk.append(sentence)
+ #             current_length += sentence_length
+
+ #         if current_chunk:
+ #             chunks.append(' '.join(current_chunk))
+
+ #         return chunks
+
+ #     def _hierarchical_summarization(self, chunks, ratio):
+ #         """
+ #         Two-level summarization for long documents
+ #         """
+ #         # Level 1: Summarize each chunk
+ #         chunk_summaries = []
+ #         for chunk in chunks:
+ #             summary = self.summarizer(
+ #                 chunk,
+ #                 ratio=0.4,  # Higher ratio for first level
+ #                 num_sentences=4
+ #             )
+ #             chunk_summaries.append(summary)
+
+ #         # Level 2: Summarize the summaries
+ #         combined_summary = ' '.join(chunk_summaries)
+ #         final_summary = self.summarizer(
+ #             combined_summary,
+ #             ratio=ratio,
+ #             num_sentences=6
+ #         )
+
+ #         return final_summary
+
+ #     def _is_action_item(self, text):
+ #         """
+ #         Detect if text contains action item
+ #         """
+ #         text_lower = text.lower()
+
+ #         # Check patterns
+ #         for pattern in self.action_patterns:
+ #             if re.search(pattern, text_lower):
+ #                 return True
+
+ #         # Check for imperative sentences
+ #         first_word = text.split()[0].lower() if text.split() else ""
+ #         imperative_verbs = [
+ #             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
+ #             'follow', 'prepare', 'send', 'contact', 'create'
+ #         ]
+
+ #         return first_word in imperative_verbs
+
+ #     def _is_decision(self, text):
+ #         """
+ #         Detect if text contains decision
+ #         """
+ #         text_lower = text.lower()
+
+ #         for pattern in self.decision_patterns:
+ #             if re.search(pattern, text_lower):
+ #                 return True
+
+ #         return False
+
+ #     def _extract_entities(self, text):
+ #         """
+ #         Extract named entities (person, date, etc)
+ #         """
+ #         entities = self.ner_pipeline(text)
+
+ #         return {
+ #             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
+ #             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
+ #             'dates': self._extract_dates(text)
+ #         }
+
+ #     def _extract_dates(self, text):
+ #         """
+ #         Extract date mentions
+ #         """
+ #         date_patterns = [
+ #             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
+ #             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
+ #             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
+ #             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
+ #         ]
+
+ #         dates = []
+ #         for pattern in date_patterns:
+ #             matches = re.findall(pattern, text.lower())
+ #             dates.extend(matches)
+
+ #         return dates
+
+
+
  from transformers import (
      AutoTokenizer,
      AutoModelForSeq2SeqLM,
      pipeline
  )
  from keybert import KeyBERT
  import re
  import nltk
+ from typing import List, Dict

  class TextProcessor:
      def __init__(self):
+         print("Initializing Text Processor...")

+         # Use transformers pipeline for summarization instead
+         try:
+             self.summarizer = pipeline(
+                 "summarization",
+                 model="sshleifer/distilbart-cnn-12-6",
+                 device=-1  # CPU
+             )
+         except:
+             # Fallback to simple extractive summarization
+             self.summarizer = None
+             print("Warning: Summarization model not loaded, using fallback")

+         # Initialize KeyBERT for keyword extraction
+         try:
+             self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
+         except:
+             self.kw_model = None
+             print("Warning: KeyBERT not loaded")

+         # Action item patterns
          self.action_patterns = [
+             r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
+             r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
+             r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
          ]

          # Decision patterns

              r"keputusan(?:nya)?\s+(.+)",
              r"final(?:isasi)?\s+(.+)"
          ]
+
+         print("Text Processor ready!")

      def summarize_transcript(self, transcript_segments, ratio=0.3):
+         """Summarization with fallback methods"""
+         # Combine text from all segments
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

+         if not full_text.strip():
+             return "No content to summarize."

+         # Try using the summarization pipeline
+         if self.summarizer:
+             try:
+                 # Split into chunks if too long
+                 max_chunk_length = 1024
+                 if len(full_text) > max_chunk_length:
+                     chunks = self._split_into_chunks(full_text, max_chunk_length)
+                     summaries = []
+
+                     for chunk in chunks[:3]:  # Limit to first 3 chunks
+                         summary = self.summarizer(
+                             chunk,
+                             max_length=130,
+                             min_length=30,
+                             do_sample=False
+                         )[0]['summary_text']
+                         summaries.append(summary)
+
+                     return ' '.join(summaries)
+                 else:
+                     return self.summarizer(
+                         full_text,
+                         max_length=150,
+                         min_length=30,
+                         do_sample=False
+                     )[0]['summary_text']
+             except:
+                 pass
+
+         # Fallback: Simple extractive summarization
+         return self._simple_extractive_summary(full_text, ratio)

      def extract_key_information(self, transcript_segments):
+         """Extract action items, decisions, and key topics"""
          full_text = ' '.join([seg['text'] for seg in transcript_segments])

          # Extract keywords/topics
+         keywords = []
+         if self.kw_model:
+             try:
+                 keywords = self.kw_model.extract_keywords(
+                     full_text,
+                     keyphrase_ngram_range=(1, 3),
+                     stop_words=None,
+                     top_n=10,
+                     use_mmr=True,
+                     diversity=0.5
+                 )
+             except:
+                 pass
+
+         # If KeyBERT fails, use simple frequency-based extraction
+         if not keywords:
+             keywords = self._extract_keywords_simple(full_text)
+
+         # Extract action items and decisions
          action_items = []
          decisions = []

                  action_items.append({
                      'text': segment['text'],
                      'speaker': segment['speaker'],
+                     'timestamp': f"{segment['start']:.1f}s"
                  })

              # Check for decisions

              'decisions': decisions
          }

+     def _split_into_chunks(self, text, max_length):
+         """Split text into chunks"""
+         words = text.split()
          chunks = []
          current_chunk = []
          current_length = 0

+         for word in words:
+             current_chunk.append(word)
+             current_length += len(word) + 1

+             if current_length >= max_length:
                  chunks.append(' '.join(current_chunk))
+                 current_chunk = []
+                 current_length = 0

          if current_chunk:
              chunks.append(' '.join(current_chunk))

          return chunks

+     def _simple_extractive_summary(self, text, ratio=0.3):
+         """Simple extractive summarization fallback"""
+         sentences = nltk.sent_tokenize(text)

+         if len(sentences) <= 3:
+             return text

+         # Calculate number of sentences to include
+         num_sentences = max(3, int(len(sentences) * ratio))
+
+         # Simple scoring: prefer sentences with more content words
+         scored_sentences = []
+         for i, sent in enumerate(sentences):
+             # Score based on length and position
+             score = len(sent.split())
+             if i < 3:  # Boost first sentences
+                 score *= 1.5
+             if i >= len(sentences) - 2:  # Boost last sentences
+                 score *= 1.2
+             scored_sentences.append((score, sent))
+
+         # Sort by score and select top sentences
+         scored_sentences.sort(reverse=True)
+         selected = [sent for _, sent in scored_sentences[:num_sentences]]
+
+         # Return in original order
+         return ' '.join([s for s in sentences if s in selected])
+
+     def _extract_keywords_simple(self, text):
+         """Simple keyword extraction fallback"""
+         # Remove common words
+         stopwords = {
+             'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
+             'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
+             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+             'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
+         }
+
+         # Count word frequency
+         words = re.findall(r'\b\w+\b', text.lower())
+         word_freq = {}
+
+         for word in words:
+             if len(word) > 3 and word not in stopwords:
+                 word_freq[word] = word_freq.get(word, 0) + 1
+
+         # Get top keywords
+         keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
+
+         # Format like KeyBERT output
+         return [(word, freq/len(words)) for word, freq in keywords]

      def _is_action_item(self, text):
+         """Detect if text contains action item"""
          text_lower = text.lower()

          # Check patterns

          return first_word in imperative_verbs

      def _is_decision(self, text):
+         """Detect if text contains decision"""
          text_lower = text.lower()

          for pattern in self.decision_patterns:
              if re.search(pattern, text_lower):
                  return True

+         return False
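
For reference, a minimal usage sketch of the updated class (not part of the commit). It assumes transcript segments shaped as dicts with 'text', 'speaker', and 'start' keys, as consumed by summarize_transcript and extract_key_information above; the import path and sample data below are hypothetical.

    # Hypothetical example: exercise the fixed TextProcessor and its fallbacks
    from utils.text_processor import TextProcessor  # assumed import path

    segments = [
        {'text': 'Kita akan kirim laporan besok.', 'speaker': 'SPEAKER_00', 'start': 12.3},
        {'text': 'Diputuskan anggaran proyek disetujui.', 'speaker': 'SPEAKER_01', 'start': 45.8},
    ]

    processor = TextProcessor()  # prints init/ready messages; sets models to None if loading fails
    print(processor.summarize_transcript(segments, ratio=0.3))

    info = processor.extract_key_information(segments)
    print(info['keywords'])      # KeyBERT pairs, or frequency-based fallback pairs
    print(info['action_items'])  # e.g. the "akan kirim" segment, with speaker and timestamp
    print(info['decisions'])     # e.g. the "Diputuskan" segment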