Update chunker.py

chunker.py  CHANGED  (+31 -10)
@@ -1,4 +1,4 @@
-# chunker.py -
+# chunker.py - Enhanced with better fallback handling
 from transformers import AutoTokenizer
 from typing import List
 import logging
@@ -19,13 +19,22 @@ class DocumentChunker:
             print(f"✅ Tokenizer loaded successfully")
         except Exception as e:
             print(f"⚠️ Failed to load tokenizer {model_name}, using backup...")
-            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-            print("✅ Fallback tokenizer loaded")
+            try:
+                # Fallback to a more common tokenizer
+                self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+                print("✅ Fallback tokenizer loaded")
+            except Exception as fallback_error:
+                print(f"❌ All tokenizers failed: {fallback_error}")
+                self.tokenizer = None
 
     def chunk_by_tokens(self, text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]:
-        """Fast token-window chunking"""
+        """Fast token-window chunking with enhanced fallback"""
         self.initialize_tokenizer()
 
+        # If no tokenizer available, use character-based fallback
+        if not self.tokenizer:
+            return self._fallback_chunk(text, max_tokens, stride)
+
         try:
             # Encode text to tokens
             ids = self.tokenizer.encode(text, add_special_tokens=False)
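
Between this hunk and the next, the diff elides the body of chunk_by_tokens (new lines 41-64), which does the actual token-window split over the encoded ids. For orientation only, here is a minimal sketch of the usual shape of such a loop; the helper name _window_tokens and its exact logic are assumptions, not the file's actual code:

    def _window_tokens(tokenizer, ids, max_tokens, stride):
        # Assumed shape of the elided token-window loop (not the real lines 41-64):
        # slide a max_tokens-wide window over ids, overlapping windows by `stride` tokens.
        chunks = []
        i = 0
        while i < len(ids):
            window = ids[i : i + max_tokens]
            chunks.append(tokenizer.decode(window, skip_special_tokens=True))
            if i + max_tokens >= len(ids):
                break
            i += max_tokens - stride
        return chunks
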
@@ -56,7 +65,7 @@ class DocumentChunker:
             return self._fallback_chunk(text, max_tokens, stride)
 
     def _fallback_chunk(self, text: str, max_tokens: int, stride: int) -> List[str]:
-        """Fallback chunking method"""
+        """Enhanced fallback chunking method"""
         # Rough estimation: 1 token ≈ 4 characters for English text
         max_chars = max_tokens * 4
         stride_chars = stride * 4
@@ -67,14 +76,26 @@ class DocumentChunker:
 
         while i < text_length:
             j = min(i + max_chars, text_length)
-            chunk = text[i:j].strip()
 
-            if chunk:
-                chunks.append(chunk)
+            # Try to break at sentence boundaries
+            chunk = text[i:j]
+            if j < text_length:
+                # Look for sentence end within the last 200 characters
+                last_period = chunk.rfind('.')
+                last_exclamation = chunk.rfind('!')
+                last_question = chunk.rfind('?')
+                sentence_end = max(last_period, last_exclamation, last_question)
+
+                if sentence_end > len(chunk) - 200:  # If sentence end is reasonably close
+                    j = i + sentence_end + 1
+                    chunk = text[i:j]
+
+            if chunk.strip():
+                chunks.append(chunk.strip())
 
-            if j == text_length:
+            if j >= text_length:
                 break
             i = max(j - stride_chars, 0)
 
-        logger.info(f"✂️ Created {len(chunks)} chunks using fallback method")
+        logger.info(f"✂️ Created {len(chunks)} chunks using enhanced fallback method")
         return chunks
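
The new fallback loop is easiest to sanity-check in isolation. Below is a standalone sketch of the same logic; the free-function name and the demo text are illustrative, since in chunker.py this lives inside DocumentChunker:

    from typing import List

    def fallback_chunk(text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]:
        """Character-window chunking that snaps each window to a sentence end."""
        max_chars = max_tokens * 4      # rough 1 token ≈ 4 characters heuristic
        stride_chars = stride * 4
        chunks: List[str] = []
        i, text_length = 0, len(text)
        while i < text_length:
            j = min(i + max_chars, text_length)
            chunk = text[i:j]
            if j < text_length:
                # Snap to the last '.', '!' or '?' if it falls in the final 200 chars.
                sentence_end = max(chunk.rfind(p) for p in ".!?")
                if sentence_end > len(chunk) - 200:
                    j = i + sentence_end + 1
                    chunk = text[i:j]
            if chunk.strip():
                chunks.append(chunk.strip())
            if j >= text_length:
                break
            i = max(j - stride_chars, 0)
        return chunks

    print(len(fallback_chunk("First sentence. " * 500, max_tokens=100, stride=10)))

One property worth noting: the boundary snap can pull j back by just under 200 characters, so each iteration still advances as long as max_chars minus that pull-back exceeds stride_chars; with the defaults (6,400-character windows, 200-character stride) that holds comfortably.
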
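Hypothetical usage, assuming a no-argument DocumentChunker constructor (the __init__ is not shown in this diff). With the new guard, chunk_by_tokens degrades to the character-based path instead of raising when every tokenizer download fails:

    chunker = DocumentChunker()   # constructor signature is an assumption
    text = "A long report. " * 2000
    chunks = chunker.chunk_by_tokens(text, max_tokens=800, stride=40)
    print(f"{len(chunks)} chunks; first one starts: {chunks[0][:40]!r}")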