Update chunker.py
chunker.py CHANGED (+54 -14)
@@ -1,3 +1,4 @@
+# chunker.py - Document chunking with token-based splitting
 from transformers import AutoTokenizer
 from typing import List
 import logging
@@ -13,28 +14,67 @@ class DocumentChunker:
         """Initialize tokenizer if not already done"""
         if self.tokenizer is None:
             print(f"🤖 Loading tokenizer: {model_name}")
-
+            try:
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                print(f"✅ Tokenizer loaded successfully")
+            except Exception as e:
+                print(f"⚠️ Failed to load tokenizer {model_name}, using backup...")
+                # Fallback to a more common tokenizer
+                self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

     def chunk_by_tokens(self, text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]:
         """Fast token-window chunking without NLTK dependency"""
         self.initialize_tokenizer()

-
-
-
-
+        try:
+            # Encode text to tokens
+            ids = self.tokenizer.encode(text, add_special_tokens=False)
+            chunks = []
+            i, n = 0, len(ids)
+
+            logger.info(f"📊 Total tokens: {n}, creating chunks with max_tokens: {max_tokens}")
+
+            while i < n:
+                j = min(i + max_tokens, n)
+                chunk_ids = ids[i:j]
+                chunk_text = self.tokenizer.decode(chunk_ids, skip_special_tokens=True)
+
+                # Only add non-empty chunks
+                if chunk_text.strip():
+                    chunks.append(chunk_text.strip())
+
+                if j == n:
+                    break
+                i = max(j - stride, 0)  # Overlap to avoid cutting mid-sentence
+
+            logger.info(f"✂️ Created {len(chunks)} chunks")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"❌ Chunking failed: {e}")
+            # Fallback to simple text splitting if tokenization fails
+            return self._fallback_chunk(text, max_tokens, stride)
+
+    def _fallback_chunk(self, text: str, max_tokens: int, stride: int) -> List[str]:
+        """Fallback chunking method using character-based estimation"""
+        # Rough estimation: 1 token ≈ 4 characters for English text
+        max_chars = max_tokens * 4
+        stride_chars = stride * 4

-
+        chunks = []
+        i = 0
+        text_length = len(text)

-        while i < n:
-            j = min(i + max_tokens, n)
-
-
-
+        while i < text_length:
+            j = min(i + max_chars, text_length)
+            chunk = text[i:j].strip()
+
+            if chunk:
+                chunks.append(chunk)

-            if j == n:
+            if j == text_length:
                 break
-            i = max(j - stride, 0)
+            i = max(j - stride_chars, 0)

-        logger.info(f"✂️ Created {len(chunks)} chunks")
+        logger.info(f"✂️ Created {len(chunks)} chunks using fallback method")
         return chunks
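
For context, a minimal usage sketch of the updated chunker follows. It assumes chunker.py is importable as chunker, that DocumentChunker can be constructed without arguments, and that initialize_tokenizer() falls back to a default model_name; none of these details appear in this diff, so treat them as assumptions. The input path is a placeholder.

import logging

from chunker import DocumentChunker

# Surface the chunker's logger.info output (token totals, chunk counts)
logging.basicConfig(level=logging.INFO)

chunker = DocumentChunker()  # assumed no-arg constructor; not shown in this diff

with open("document.txt", encoding="utf-8") as f:  # placeholder path
    text = f.read()

# 1600-token windows, overlapping by 50 tokens
chunks = chunker.chunk_by_tokens(text, max_tokens=1600, stride=50)

for idx, chunk in enumerate(chunks):
    print(f"chunk {idx}: {len(chunk)} characters")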
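The window arithmetic in chunk_by_tokens advances the start index by max_tokens - stride on each pass, so consecutive chunks share stride tokens of overlap. A standalone sketch of the same loop over dummy token ids (no tokenizer required) makes the boundaries concrete:

# Same windowing arithmetic as chunk_by_tokens, applied to dummy token ids
ids = list(range(4000))          # stand-in for 4000 token ids
max_tokens, stride = 1600, 50

i, windows = 0, []
while i < len(ids):
    j = min(i + max_tokens, len(ids))
    windows.append((i, j))       # each window overlaps the previous one by `stride` tokens
    if j == len(ids):
        break
    i = max(j - stride, 0)

print(windows)                   # [(0, 1600), (1550, 3150), (3100, 4000)]

The character-based _fallback_chunk follows the same pattern under its 4-characters-per-token estimate, i.e. 6400-character windows with a 200-character overlap for the default max_tokens=1600 and stride=50.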