Commit · d1754e4
Parent(s): a67ba36

truncates
app.py CHANGED

@@ -29,7 +29,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Token constraints
 MAX_MODEL_TOKENS = 1024
-SAFE_CHUNK_SIZE = 600 #
+SAFE_CHUNK_SIZE = 600 # Safe for aggregation
+TRUNCATED_TOKENS = MAX_MODEL_TOKENS - 2 # Leave room for special tokens
 
 # Pydantic schemas
 class SummarizationItem(BaseModel):
@@ -79,6 +80,14 @@ def split_sentences(text: str, max_sentence_tokens: int = SAFE_CHUNK_SIZE) -> li
 
     return split_results
 
+# Truncate text safely at token-level
+def truncate_text(text: str, max_tokens: int = TRUNCATED_TOKENS) -> str:
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    if len(tokens) <= max_tokens:
+        return text
+    truncated = tokens[:max_tokens]
+    return tokenizer.decode(truncated, skip_special_tokens=True)
+
 # Chunking based on token length
 def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
     sentences = split_sentences(text)
@@ -121,7 +130,7 @@ async def summarize_batch(request: BatchSummarizationRequest):
         logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")
 
         for chunk in chunks:
-            all_chunks.append(chunk)
+            all_chunks.append(truncate_text(chunk)) # ✅ enforce max length
             chunk_map.append(item.content_id)
 
     if not all_chunks:
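Why reserve MAX_MODEL_TOKENS - 2? Seq2seq tokenizers such as BART's wrap every input in special tokens (e.g. BOS and EOS), so a chunk truncated to exactly 1024 content tokens would overflow the model once those are added. A minimal standalone sketch of the idea follows; the checkpoint name is an assumption for illustration, since app.py defines its own model_name earlier:

from transformers import AutoTokenizer

# Assumed checkpoint for illustration only; the Space's real model_name may differ.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

MAX_MODEL_TOKENS = 1024
TRUNCATED_TOKENS = MAX_MODEL_TOKENS - 2  # headroom for the BOS/EOS pair

def truncate_text(text: str, max_tokens: int = TRUNCATED_TOKENS) -> str:
    # Count tokens without special tokens, cut at the budget, decode back to text.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text
    return tokenizer.decode(tokens[:max_tokens], skip_special_tokens=True)

safe = truncate_text("word " * 5000)
# Re-tokenizing WITH special tokens should now fit within the 1024-token limit.
# (Decode/re-encode is not guaranteed token-for-token stable for every tokenizer,
# so treat this as a sanity check rather than a hard proof.)
print(len(tokenizer(safe, add_special_tokens=True)["input_ids"]))  # expected <= 1024

In the commit itself, truncate_text is applied as a final guard on every chunk before aggregation, so even if sentence splitting or chunk_text overshoots SAFE_CHUNK_SIZE, nothing longer than TRUNCATED_TOKENS reaches the model.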