Update chunker.py

chunker.py  CHANGED  (+31 -10)
@@ -1,4 +1,4 @@
-# chunker.py -
+# chunker.py - Enhanced with better fallback handling
 from transformers import AutoTokenizer
 from typing import List
 import logging
@@ -19,13 +19,22 @@ class DocumentChunker:
             print(f"✅ Tokenizer loaded successfully")
         except Exception as e:
             print(f"⚠️ Failed to load tokenizer {model_name}, using backup...")
-            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-            print("✅ Fallback tokenizer loaded")
+            try:
+                # Fallback to a more common tokenizer
+                self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+                print("✅ Fallback tokenizer loaded")
+            except Exception as fallback_error:
+                print(f"❌ All tokenizers failed: {fallback_error}")
+                self.tokenizer = None
 
     def chunk_by_tokens(self, text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]:
-        """Fast token-window chunking"""
+        """Fast token-window chunking with enhanced fallback"""
         self.initialize_tokenizer()
 
+        # If no tokenizer available, use character-based fallback
+        if not self.tokenizer:
+            return self._fallback_chunk(text, max_tokens, stride)
+
         try:
             # Encode text to tokens
             ids = self.tokenizer.encode(text, add_special_tokens=False)
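
Between this hunk and the next, the diff elides the body of chunk_by_tokens (new lines 41-64), which does the actual token-window split over the encoded ids. For orientation only, here is a minimal sketch of the usual shape of such a loop; the helper name _window_tokens and its exact logic are assumptions, not the file's actual code:

    def _window_tokens(tokenizer, ids, max_tokens, stride):
        # Assumed shape of the elided token-window loop (not the real lines 41-64):
        # slide a max_tokens-wide window over ids, overlapping windows by `stride` tokens.
        chunks = []
        i = 0
        while i < len(ids):
            window = ids[i : i + max_tokens]
            chunks.append(tokenizer.decode(window, skip_special_tokens=True))
            if i + max_tokens >= len(ids):
                break
            i += max_tokens - stride
        return chunks
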
@@ -56,7 +65,7 @@ class DocumentChunker:
             return self._fallback_chunk(text, max_tokens, stride)
 
     def _fallback_chunk(self, text: str, max_tokens: int, stride: int) -> List[str]:
-        """Fallback chunking method"""
+        """Enhanced fallback chunking method"""
         # Rough estimation: 1 token ≈ 4 characters for English text
         max_chars = max_tokens * 4
         stride_chars = stride * 4
@@ -67,14 +76,26 @@ class DocumentChunker:
 
         while i < text_length:
             j = min(i + max_chars, text_length)
-            chunk = text[i:j].strip()
 
-            if chunk:
-                chunks.append(chunk)
+            # Try to break at sentence boundaries
+            chunk = text[i:j]
+            if j < text_length:
+                # Look for sentence end within the last 200 characters
+                last_period = chunk.rfind('.')
+                last_exclamation = chunk.rfind('!')
+                last_question = chunk.rfind('?')
+                sentence_end = max(last_period, last_exclamation, last_question)
+
+                if sentence_end > len(chunk) - 200:  # If sentence end is reasonably close
+                    j = i + sentence_end + 1
+                    chunk = text[i:j]
+
+            if chunk.strip():
+                chunks.append(chunk.strip())
 
-            if j == text_length:
+            if j >= text_length:
                 break
             i = max(j - stride_chars, 0)
 
-        logger.info(f"✂️ Created {len(chunks)} chunks using fallback method")
+        logger.info(f"✂️ Created {len(chunks)} chunks using enhanced fallback method")
         return chunks
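
The new fallback loop is easiest to sanity-check in isolation. Below is a standalone sketch of the same logic; the free-function name and the demo text are illustrative, since in chunker.py this lives inside DocumentChunker:

    from typing import List

    def fallback_chunk(text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]:
        """Character-window chunking that snaps each window to a sentence end."""
        max_chars = max_tokens * 4      # rough 1 token ≈ 4 characters heuristic
        stride_chars = stride * 4
        chunks: List[str] = []
        i, text_length = 0, len(text)
        while i < text_length:
            j = min(i + max_chars, text_length)
            chunk = text[i:j]
            if j < text_length:
                # Snap to the last '.', '!' or '?' if it falls in the final 200 chars.
                sentence_end = max(chunk.rfind(p) for p in ".!?")
                if sentence_end > len(chunk) - 200:
                    j = i + sentence_end + 1
                    chunk = text[i:j]
            if chunk.strip():
                chunks.append(chunk.strip())
            if j >= text_length:
                break
            i = max(j - stride_chars, 0)
        return chunks

    print(len(fallback_chunk("First sentence. " * 500, max_tokens=100, stride=10)))

One property worth noting: the boundary snap can pull j back by just under 200 characters, so each iteration still advances as long as max_chars minus that pull-back exceeds stride_chars; with the defaults (6,400-character windows, 200-character stride) that holds comfortably.
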
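Hypothetical usage, assuming a no-argument DocumentChunker constructor (the __init__ is not shown in this diff). With the new guard, chunk_by_tokens degrades to the character-based path instead of raising when every tokenizer download fails:

    chunker = DocumentChunker()   # constructor signature is an assumption
    text = "A long report. " * 2000
    chunks = chunker.chunk_by_tokens(text, max_tokens=800, stride=40)
    print(f"{len(chunks)} chunks; first one starts: {chunks[0][:40]!r}")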