spacesedan committed on
Commit
d1754e4
·
1 Parent(s): a67ba36
Files changed (1) hide show
  1. app.py +11 -2
app.py CHANGED
@@ -29,7 +29,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
29
 
30
  # Token constraints
31
  MAX_MODEL_TOKENS = 1024
32
- SAFE_CHUNK_SIZE = 600 # Reduced to leave room for special tokens
 
33
 
34
  # Pydantic schemas
35
  class SummarizationItem(BaseModel):
@@ -79,6 +80,14 @@ def split_sentences(text: str, max_sentence_tokens: int = SAFE_CHUNK_SIZE) -> li
79
 
80
  return split_results
81
 
 
 
 
 
 
 
 
 
82
  # Chunking based on token length
83
  def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
84
  sentences = split_sentences(text)
@@ -121,7 +130,7 @@ async def summarize_batch(request: BatchSummarizationRequest):
121
  logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")
122
 
123
  for chunk in chunks:
124
- all_chunks.append(chunk)
125
  chunk_map.append(item.content_id)
126
 
127
  if not all_chunks:
 
29
 
30
  # Token constraints
31
  MAX_MODEL_TOKENS = 1024
32
+ SAFE_CHUNK_SIZE = 600 # Safe for aggregation
33
+ TRUNCATED_TOKENS = MAX_MODEL_TOKENS - 2 # Leave room for special tokens
34
 
35
  # Pydantic schemas
36
  class SummarizationItem(BaseModel):
 
80
 
81
  return split_results
82
 
83
+ # Truncate text safely at token-level
84
def truncate_text(text: str, max_tokens: int = TRUNCATED_TOKENS) -> str:
    """Return *text* cut down so it tokenizes to at most *max_tokens* tokens.

    The text is tokenized without special tokens using the module-level
    ``tokenizer``. If it already fits, the original string is returned
    untouched (no decode round trip, so whitespace is preserved exactly).

    Args:
        text: Input text to bound.
        max_tokens: Maximum number of non-special tokens allowed in the
            result. Must be non-negative.

    Returns:
        ``text`` itself when it fits, otherwise a decoded prefix of it whose
        re-tokenized length does not exceed ``max_tokens``.

    Raises:
        ValueError: If ``max_tokens`` is negative. A negative bound would
            otherwise slice tokens off the *end* (``tokens[:-n]``) and
            silently return text of the wrong length.
    """
    if max_tokens < 0:
        raise ValueError(f"max_tokens must be non-negative, got {max_tokens}")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text

    # Decoding a token prefix and re-encoding the resulting string is not
    # guaranteed to reproduce the same ids (sub-word merges can differ at the
    # cut point), so verify the bound and back off one token at a time until
    # the decoded text really fits.
    keep = max_tokens
    while keep > 0:
        candidate = tokenizer.decode(tokens[:keep], skip_special_tokens=True)
        recount = len(tokenizer.encode(candidate, add_special_tokens=False))
        if recount <= max_tokens:
            return candidate
        keep -= 1
    return ""
90
+
91
  # Chunking based on token length
92
  def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
93
  sentences = split_sentences(text)
 
130
  logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")
131
 
132
  for chunk in chunks:
133
+ all_chunks.append(truncate_text(chunk)) # ✅ enforce max length
134
  chunk_map.append(item.content_id)
135
 
136
  if not all_chunks: