spacesedan committed on
Commit
204ba37
·
1 Parent(s): 750c1cd

more chunking updates

Browse files
Files changed (1) hide show
  1. app.py +17 -4
app.py CHANGED
@@ -24,13 +24,20 @@ class SummarizationResponseItem(BaseModel):
24
  class BatchSummarizationResponse(BaseModel):
25
  summaries: List[SummarizationResponseItem]
26
 
27
- def chunk_text(text, max_tokens=700):
 
 
 
 
28
  tokens = tokenizer.encode(text, truncation=False)
29
  chunks = []
30
 
31
  for i in range(0, len(tokens), max_tokens):
32
- chunk = tokens[i:i + max_tokens]
33
- chunks.append(tokenizer.decode(chunk, skip_special_tokens=True))
 
 
 
34
 
35
  return chunks
36
 
@@ -44,8 +51,14 @@ async def summarize_batch(request: BatchSummarizationRequest):
44
  all_chunks.extend(chunks)
45
  chunk_map.extend([item.content_id] * len(chunks))
46
 
 
 
 
 
 
 
47
  summaries = summarizer(
48
- all_chunks,
49
  max_length=150,
50
  min_length=30,
51
  truncation=True,
 
24
  class BatchSummarizationResponse(BaseModel):
25
  summaries: List[SummarizationResponseItem]
26
 
27
+ # Ensure no chunk ever exceeds model token limit
28
+ MAX_MODEL_TOKENS = 1024
29
+ SAFE_CHUNK_SIZE = 700
30
+
31
+ def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
32
  tokens = tokenizer.encode(text, truncation=False)
33
  chunks = []
34
 
35
  for i in range(0, len(tokens), max_tokens):
36
+ chunk_tokens = tokens[i:i + max_tokens]
37
+ if len(chunk_tokens) > MAX_MODEL_TOKENS:
38
+ chunk_tokens = chunk_tokens[:MAX_MODEL_TOKENS]
39
+ chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
40
+ chunks.append(chunk)
41
 
42
  return chunks
43
 
 
51
  all_chunks.extend(chunks)
52
  chunk_map.extend([item.content_id] * len(chunks))
53
 
54
+ # Final safety pass to enforce 1024 token limit
55
+ safe_chunks = [
56
+ tokenizer.decode(tokenizer.encode(chunk, truncation=False)[:MAX_MODEL_TOKENS], skip_special_tokens=True)
57
+ for chunk in all_chunks
58
+ ]
59
+
60
  summaries = summarizer(
61
+ safe_chunks,
62
  max_length=150,
63
  min_length=30,
64
  truncation=True,