Update app.py
app.py CHANGED
@@ -289,13 +289,13 @@ class EnhancedFileProcessor:
         return dataset
 
     def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
         try:
-            # Convert data to JSON
+            # Convert data to JSON bytes
            json_str = json.dumps(data, ensure_ascii=False)
-
-
-
+            json_bytes = json_str.encode('utf-8')
+            total_length = len(json_bytes)
+
+            # Calculate metadata overhead in bytes
             metadata_template = {
                 "chunk_index": 0,
                 "total_chunks": 1,
@@ -303,21 +303,37 @@ class EnhancedFileProcessor:
                 "chunk_hash": "",
                 "data": ""
             }
-
-
-
-
-
-
-
+            overhead_bytes = len(json.dumps(metadata_template).encode('utf-8')) + 20  # Add padding
+
+            effective_chunk_size = max_size - overhead_bytes
+
+            if effective_chunk_size <= 0:
+                raise ValueError("Max size is too small after accounting for metadata overhead")
+
+            chunks = []
+            start = 0
+            while start < total_length:
+                end = start + effective_chunk_size
+                # Ensure valid Unicode by decoding
+                chunk_str = json_bytes[start:end].decode('utf-8', errors='replace')
                 chunk = {
-                    "chunk_index":
-                    "total_chunks": 1,
+                    "chunk_index": len(chunks),
+                    "total_chunks": -1,  # To be set later
                     "total_length": total_length,
-                    "chunk_hash": hash(
-                    "data":
+                    "chunk_hash": hash(chunk_str) & 0xFFFFFFFF,
+                    "data": chunk_str
                 }
-
+                chunks.append(chunk)
+                start = end
+
+            # Update total_chunks in each chunk
+            for i, chunk in enumerate(chunks):
+                chunk["total_chunks"] = len(chunks)
+
+            return chunks
+        except Exception as e:
+            logger.error(f"Error chunking data: {e}")
+            return []
 
             # Calculate number of chunks needed
             num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
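Below is a minimal sketch, not part of this commit, of how chunks produced by the new chunk_data might be reassembled on the receiving side. The field names ("chunk_index", "data") come from the diff above; reassemble_chunks and the example payload are hypothetical and are shown only for illustration.

import json

def reassemble_chunks(chunks):
    # Hypothetical helper (not in app.py): order the chunks, join their decoded
    # payloads, and parse the result back into the original object.
    ordered = sorted(chunks, key=lambda c: c["chunk_index"])
    json_str = "".join(c["data"] for c in ordered)
    return json.loads(json_str)

# Example round trip, assuming an EnhancedFileProcessor instance as defined in app.py:
# processor = EnhancedFileProcessor()
# chunks = processor.chunk_data({"name": "example", "values": list(range(500))})
# assert reassemble_chunks(chunks) == {"name": "example", "values": list(range(500))}

One caveat of the byte-slicing approach in the diff: chunk_data cuts the UTF-8 byte string at fixed offsets and decodes each slice with errors='replace', so a multi-byte character that straddles a chunk boundary is replaced rather than preserved. A round trip like the one sketched above is therefore only guaranteed when the serialized payload is ASCII.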