Spaces:
Running
Running
Commit
·
204ba37
1
Parent(s):
750c1cd
more chunking updates
Browse files
app.py
CHANGED
@@ -24,13 +24,20 @@ class SummarizationResponseItem(BaseModel):
|
|
24 |
class BatchSummarizationResponse(BaseModel):
|
25 |
summaries: List[SummarizationResponseItem]
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
tokens = tokenizer.encode(text, truncation=False)
|
29 |
chunks = []
|
30 |
|
31 |
for i in range(0, len(tokens), max_tokens):
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
34 |
|
35 |
return chunks
|
36 |
|
@@ -44,8 +51,14 @@ async def summarize_batch(request: BatchSummarizationRequest):
|
|
44 |
all_chunks.extend(chunks)
|
45 |
chunk_map.extend([item.content_id] * len(chunks))
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
summaries = summarizer(
|
48 |
-
|
49 |
max_length=150,
|
50 |
min_length=30,
|
51 |
truncation=True,
|
|
|
class BatchSummarizationResponse(BaseModel):
    """Response body for the batch endpoint: one item per summarized input.

    `summaries` pairs each generated summary with its originating content
    (see SummarizationResponseItem, defined earlier in this file).
    """

    summaries: List[SummarizationResponseItem]
|
26 |
|
# Hard upper bound imposed by the summarization model's context window.
MAX_MODEL_TOKENS = 1024
# Default chunk size, kept well below the limit to leave headroom for
# special tokens added by the tokenizer.
SAFE_CHUNK_SIZE = 700


def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
    """Split *text* into decoded chunks of at most ``max_tokens`` tokens.

    The effective window is clamped to ``MAX_MODEL_TOKENS`` so the loop
    step and the slice width always agree. The previous version stepped
    by ``max_tokens`` but truncated each window to ``MAX_MODEL_TOKENS``,
    which silently discarded every token between positions
    ``MAX_MODEL_TOKENS`` and ``max_tokens`` of each window whenever a
    caller passed ``max_tokens > MAX_MODEL_TOKENS``. Clamping the step
    preserves all input tokens while still guaranteeing no chunk exceeds
    the model limit.

    Args:
        text: Raw input text; may be empty (returns an empty list).
        max_tokens: Requested chunk size; values above MAX_MODEL_TOKENS
            are clamped rather than truncated lossily.

    Returns:
        List of text chunks, each decoding to at most the model's
        token limit.
    """
    # Clamp once, outside the loop, so step == slice width and no
    # tokens are ever dropped between consecutive windows.
    window = min(max_tokens, MAX_MODEL_TOKENS)
    # NOTE(review): relies on the module-level `tokenizer` defined
    # earlier in app.py; truncation=False keeps the full token stream.
    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    for start in range(0, len(tokens), window):
        window_tokens = tokens[start:start + window]
        chunks.append(tokenizer.decode(window_tokens, skip_special_tokens=True))
    return chunks
|
43 |
|
|
|
51 |
all_chunks.extend(chunks)
|
52 |
chunk_map.extend([item.content_id] * len(chunks))
|
53 |
|
54 |
+
# Final safety pass to enforce 1024 token limit
|
55 |
+
safe_chunks = [
|
56 |
+
tokenizer.decode(tokenizer.encode(chunk, truncation=False)[:MAX_MODEL_TOKENS], skip_special_tokens=True)
|
57 |
+
for chunk in all_chunks
|
58 |
+
]
|
59 |
+
|
60 |
summaries = summarizer(
|
61 |
+
safe_chunks,
|
62 |
max_length=150,
|
63 |
min_length=30,
|
64 |
truncation=True,
|