acecalisto3 committed
Commit d2d1f59 · verified · 1 Parent(s): 7ae78cf

Update app.py

Files changed (1)
  1. app.py +33 -17
app.py CHANGED
@@ -289,13 +289,13 @@ class EnhancedFileProcessor:
         return dataset
 
     def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
         try:
-            # Convert data to JSON string
+            # Convert data to JSON bytes
             json_str = json.dumps(data, ensure_ascii=False)
-            total_length = len(json_str)
-
-            # Calculate overhead for metadata
+            json_bytes = json_str.encode('utf-8')
+            total_length = len(json_bytes)
+
+            # Calculate metadata overhead in bytes
             metadata_template = {
                 "chunk_index": 0,
                 "total_chunks": 1,
@@ -303,21 +303,37 @@ class EnhancedFileProcessor:
                 "chunk_hash": "",
                 "data": ""
             }
-            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety
-
-            # Calculate effective chunk size
-            effective_chunk_size = max_size - overhead
-
-            if total_length <= effective_chunk_size:
-                # Data fits in one chunk
+            overhead_bytes = len(json.dumps(metadata_template).encode('utf-8')) + 20  # Add padding
+
+            effective_chunk_size = max_size - overhead_bytes
+
+            if effective_chunk_size <= 0:
+                raise ValueError("Max size is too small after accounting for metadata overhead")
+
+            chunks = []
+            start = 0
+            while start < total_length:
+                end = start + effective_chunk_size
+                # Ensure valid Unicode by decoding
+                chunk_str = json_bytes[start:end].decode('utf-8', errors='replace')
                 chunk = {
-                    "chunk_index": 0,
-                    "total_chunks": 1,
+                    "chunk_index": len(chunks),
+                    "total_chunks": -1,  # To be set later
                     "total_length": total_length,
-                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
-                    "data": json_str
+                    "chunk_hash": hash(chunk_str) & 0xFFFFFFFF,
+                    "data": chunk_str
                 }
-                return [chunk]
+                chunks.append(chunk)
+                start = end
+
+            # Update total_chunks in each chunk
+            for i, chunk in enumerate(chunks):
+                chunk["total_chunks"] = len(chunks)
+
+            return chunks
+        except Exception as e:
+            logger.error(f"Error chunking data: {e}")
+            return []
 
             # Calculate number of chunks needed
             num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
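
With this change, chunk_data measures and slices the UTF-8 encoded payload rather than the character string, always returns a list of chunk records, and fills in total_chunks after the loop; the byte-accurate sizing presumably matters because the 2953-byte default matches the binary capacity of a version 40, error-correction-level-L QR code. Below is a minimal consumption sketch, not part of the commit: it assumes the module is importable as app, that EnhancedFileProcessor() takes no constructor arguments (the constructor is not shown in this diff), and that the payload serializes to ASCII so byte-boundary slicing never splits a multi-byte character.

# Illustrative sketch only; EnhancedFileProcessor() with no arguments and
# the `app` module name are assumptions not confirmed by this diff.
import json

from app import EnhancedFileProcessor

processor = EnhancedFileProcessor()
payload = {"records": [{"id": i, "value": "x" * 50} for i in range(200)]}

chunks = processor.chunk_data(payload, max_size=2953)
print(f"{len(chunks)} chunks, total_length={chunks[0]['total_length']} bytes")

# Reassemble by ordering on chunk_index and concatenating the data fields.
# This round-trips cleanly here because the payload serializes to ASCII;
# non-ASCII payloads could lose characters replaced at chunk boundaries.
ordered = sorted(chunks, key=lambda c: c["chunk_index"])
restored = json.loads("".join(c["data"] for c in ordered))
assert restored == payload
assert all(c["total_chunks"] == len(chunks) for c in chunks)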