spacesedan committed
Commit 71a1190 · 1 Parent(s): fcdc986
Files changed (1)
  1. app.py +6 -17
app.py CHANGED
@@ -39,6 +39,8 @@ def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
 
     for i in range(0, len(tokens), max_tokens):
         chunk_tokens = tokens[i:i + max_tokens]
+        if len(chunk_tokens) > MAX_MODEL_TOKENS:
+            chunk_tokens = chunk_tokens[:MAX_MODEL_TOKENS]
         chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
         chunks.append(chunk)
 
@@ -56,25 +58,12 @@ async def summarize_batch(request: BatchSummarizationRequest):
         all_chunks.extend(chunks)
         chunk_map.extend([item.content_id] * len(chunks))
 
-    # Retokenize and only allow chunks that are safely below the max token limit
-    safe_chunks = []
-    safe_chunk_map = []
-    for content_id, chunk in zip(chunk_map, all_chunks):
-        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_MODEL_TOKENS)
-        token_count = encoded["input_ids"].shape[1]
-        if token_count > MAX_MODEL_TOKENS:
-            logger.warning(f"[SKIP] content_id={content_id} Chunk too long after truncation: {token_count} tokens")
-            continue
-        decoded = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
-        safe_chunks.append(decoded)
-        safe_chunk_map.append(content_id)
-
-    if not safe_chunks:
-        logger.error("No valid chunks after token filtering. Returning empty response.")
+    if not all_chunks:
+        logger.error("No valid chunks after chunking. Returning empty response.")
         return {"summaries": []}
 
     summaries = summarizer(
-        safe_chunks,
+        all_chunks,
         max_length=150,
         min_length=30,
         truncation=True,
@@ -84,7 +73,7 @@ async def summarize_batch(request: BatchSummarizationRequest):
 
     # Aggregate summaries back per content_id
     summary_map = {}
-    for content_id, result in zip(safe_chunk_map, summaries):
+    for content_id, result in zip(chunk_map, summaries):
         summary_map.setdefault(content_id, []).append(result["summary_text"])
 
     response_items = [
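For context, here is a minimal, self-contained sketch of the two code paths this commit touches, assuming the app builds its tokenizer and summarizer with the Hugging Face transformers pipeline API. MODEL_NAME, the MAX_MODEL_TOKENS and SAFE_CHUNK_SIZE values, and the summarize_items wrapper are illustrative assumptions, not taken from app.py.

from typing import List

from transformers import AutoTokenizer, pipeline

MODEL_NAME = "facebook/bart-large-cnn"  # assumed; the diff does not name the model
MAX_MODEL_TOKENS = 1024                 # assumed hard context limit of the model
SAFE_CHUNK_SIZE = 900                   # assumed margin below the hard limit

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
summarizer = pipeline("summarization", model=MODEL_NAME, tokenizer=tokenizer)

def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
    """Split text into token windows, clamping each one to the model limit."""
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks: List[str] = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        # Post-commit behavior: hard-truncate an oversized window here
        # instead of retokenizing and skipping it downstream.
        if len(chunk_tokens) > MAX_MODEL_TOKENS:
            chunk_tokens = chunk_tokens[:MAX_MODEL_TOKENS]
        chunks.append(tokenizer.decode(chunk_tokens, skip_special_tokens=True))
    return chunks

def summarize_items(items: List[tuple]) -> dict:
    """items is a list of (content_id, text) pairs; mirrors the simplified batch path."""
    all_chunks: List[str] = []
    chunk_map: List[str] = []
    for content_id, text in items:
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_map.extend([content_id] * len(chunks))

    if not all_chunks:
        return {"summaries": []}

    summaries = summarizer(all_chunks, max_length=150, min_length=30, truncation=True)

    # Aggregate chunk summaries back per content_id, as the commit does.
    summary_map: dict = {}
    for content_id, result in zip(chunk_map, summaries):
        summary_map.setdefault(content_id, []).append(result["summary_text"])
    return summary_map

Note the design trade-off: because chunk_text defaults max_tokens to a value below the model limit, the new clamp is mostly defensive and only fires when a caller passes a larger window. The practical win is in summarize_batch, which no longer re-encodes every chunk or silently drops oversized ones; at worst a chunk loses its tail tokens rather than disappearing from the output.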