Commit · eb54abc
1 Parent(s): 0dedb70
why
app.py CHANGED
@@ -39,8 +39,6 @@ def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
 
     for i in range(0, len(tokens), max_tokens):
         chunk_tokens = tokens[i:i + max_tokens]
-        if len(chunk_tokens) > MAX_MODEL_TOKENS:
-            chunk_tokens = chunk_tokens[:MAX_MODEL_TOKENS]
         chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
         chunks.append(chunk)
 
@@ -57,11 +55,14 @@ async def summarize_batch(request: BatchSummarizationRequest):
         all_chunks.extend(chunks)
         chunk_map.extend([item.content_id] * len(chunks))
 
-    # Final safety pass to enforce 1024 token limit
-    safe_chunks = [
-        ...
-        ...
-    ]
+    # Final safety pass to enforce 1024 token limit after decoding
+    safe_chunks = []
+    for chunk in all_chunks:
+        encoded = tokenizer.encode(chunk, truncation=False)
+        if len(encoded) > MAX_MODEL_TOKENS:
+            logger.warning(f"[TRUNCATING] Chunk exceeded max tokens ({len(encoded)}), trimming to {MAX_MODEL_TOKENS} tokens")
+            encoded = encoded[:MAX_MODEL_TOKENS]
+        safe_chunks.append(tokenizer.decode(encoded, skip_special_tokens=True))
 
     summaries = summarizer(
         safe_chunks,
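
The new comment explains why the truncation moved out of chunk_text: the 1024-token limit has to hold after decoding. Decoding a slice of token IDs and re-encoding the resulting text is not length-preserving, since encode() re-adds the special tokens that skip_special_tokens=True stripped and can re-merge subwords at the cut boundary. Below is a minimal sketch of that drift, assuming a Hugging Face tokenizer; the facebook/bart-large-cnn checkpoint is an assumption, as the diff only shows a shared tokenizer.

# Sketch of why a post-decode pass is needed: decode -> re-encode round
# trips can return more tokens than the slice they started from.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")  # assumed model
MAX_MODEL_TOKENS = 1024  # from the "1024 token limit" comment in the diff

text = "some long document text " * 400
token_ids = tokenizer.encode(text)       # encode() adds special tokens
window = token_ids[:MAX_MODEL_TOKENS]    # slice to the model limit
decoded = tokenizer.decode(window, skip_special_tokens=True)

# Re-encoding re-adds special tokens, so the count can exceed the slice.
recount = len(tokenizer.encode(decoded, truncation=False))
print(f"{recount} tokens after round trip vs {len(window)} in the slice")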
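
Read on its own, the added loop is a self-contained guard that can be lifted out and unit-tested. Here is a sketch under the same assumptions (a Hugging Face tokenizer and a standard logging logger); the helper name enforce_token_limit is hypothetical, since the commit inlines this logic directly in summarize_batch.

import logging
from typing import List

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)
MAX_MODEL_TOKENS = 1024  # from the "1024 token limit" comment in the diff
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")  # assumed model


def enforce_token_limit(chunks: List[str]) -> List[str]:
    # Hypothetical helper mirroring the commit's inline safety pass:
    # re-encode each decoded chunk and hard-trim any that exceed the limit.
    safe_chunks = []
    for chunk in chunks:
        encoded = tokenizer.encode(chunk, truncation=False)
        if len(encoded) > MAX_MODEL_TOKENS:
            logger.warning(
                f"[TRUNCATING] Chunk exceeded max tokens ({len(encoded)}), "
                f"trimming to {MAX_MODEL_TOKENS} tokens"
            )
            encoded = encoded[:MAX_MODEL_TOKENS]
        safe_chunks.append(tokenizer.decode(encoded, skip_special_tokens=True))
    return safe_chunks

The trade-off the commit accepts is worth noting: slicing encoded can still cut a chunk mid-sentence, and the trimmed IDs are decoded once more before reaching summarizer, so the pass trades a little fidelity at the boundary for a guarantee that no chunk trips the model's length check.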