spacesedan committed on
Commit eb54abc · 1 Parent(s): 0dedb70
Files changed (1)
  1. app.py +8 -7
app.py CHANGED
@@ -39,8 +39,6 @@ def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
 
     for i in range(0, len(tokens), max_tokens):
         chunk_tokens = tokens[i:i + max_tokens]
-        if len(chunk_tokens) > MAX_MODEL_TOKENS:
-            chunk_tokens = chunk_tokens[:MAX_MODEL_TOKENS]
         chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
         chunks.append(chunk)
 
@@ -57,11 +55,14 @@ async def summarize_batch(request: BatchSummarizationRequest):
         all_chunks.extend(chunks)
         chunk_map.extend([item.content_id] * len(chunks))
 
-    # Final safety pass to enforce 1024 token limit
-    safe_chunks = [
-        tokenizer.decode(tokenizer.encode(chunk, truncation=False)[:MAX_MODEL_TOKENS], skip_special_tokens=True)
-        for chunk in all_chunks
-    ]
+    # Final safety pass to enforce 1024 token limit after decoding
+    safe_chunks = []
+    for chunk in all_chunks:
+        encoded = tokenizer.encode(chunk, truncation=False)
+        if len(encoded) > MAX_MODEL_TOKENS:
+            logger.warning(f"[TRUNCATING] Chunk exceeded max tokens ({len(encoded)}), trimming to {MAX_MODEL_TOKENS} tokens")
+            encoded = encoded[:MAX_MODEL_TOKENS]
+        safe_chunks.append(tokenizer.decode(encoded, skip_special_tokens=True))
 
     summaries = summarizer(
         safe_chunks,
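
For context: the change matters because decoding a token slice and re-encoding the resulting string is not guaranteed to round-trip to the same length (special tokens and merge boundaries can add tokens), so the per-chunk truncation inside chunk_text was not a reliable guarantee by itself. Below is a minimal, standalone sketch of the behavior the new loop enforces; the enforce_token_limit wrapper, the facebook/bart-large-cnn checkpoint, and the logging setup are illustrative assumptions, not part of the commit.

import logging

from transformers import AutoTokenizer

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

MAX_MODEL_TOKENS = 1024  # assumed model limit, per the "1024 token limit" comment in the diff

# Assumed checkpoint; any tokenizer with a 1024-token model limit illustrates the same point.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def enforce_token_limit(chunks: list[str]) -> list[str]:
    """Re-encode each decoded chunk and hard-trim anything over the limit,
    mirroring the safety pass added in this commit."""
    safe_chunks = []
    for chunk in chunks:
        # Encode without truncation so we can see the true token count.
        encoded = tokenizer.encode(chunk, truncation=False)
        if len(encoded) > MAX_MODEL_TOKENS:
            logger.warning(
                f"[TRUNCATING] Chunk exceeded max tokens ({len(encoded)}), "
                f"trimming to {MAX_MODEL_TOKENS} tokens"
            )
            encoded = encoded[:MAX_MODEL_TOKENS]
        safe_chunks.append(tokenizer.decode(encoded, skip_special_tokens=True))
    return safe_chunks

Running oversized text through this pass logs the warning and returns chunks the summarizer can accept, which is the failure mode the one-shot list comprehension handled silently before this commit.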