Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -227,43 +227,43 @@ class FileProcessor:
|
|
227 |
|
228 |
return chunks
|
229 |
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
# Seek to the last 1MB
|
246 |
-
f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
|
247 |
-
content += f.read() # Last 1MB
|
248 |
-
else:
|
249 |
-
# Regular file processing
|
250 |
-
with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
|
251 |
-
content = f.read()
|
252 |
-
|
253 |
-
return [{
|
254 |
-
'source',
|
255 |
-
'filename': os.path.basename(file.name),
|
256 |
-
'file_size': file_stat.st_size,
|
257 |
-
'mime_type': mimetypes.guess_type(file.name)[0],
|
258 |
-
'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
259 |
-
'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
260 |
-
'content': content,
|
261 |
-
'timestamp': datetime.now().isoformat()
|
262 |
-
}]
|
263 |
-
except Exception as e:
|
264 |
-
logger.error(f"File processing error: {e}")
|
265 |
-
return []
|
266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
|
268 |
"""Clean and validate JSON data"""
|
269 |
try:
|
|
|
227 |
|
228 |
return chunks
|
229 |
|
230 |
+
def _process_single_file(self, file) -> List[Dict]:
    """Read one file from disk and return its content with filesystem metadata.

    Args:
        file: Object whose ``name`` attribute is the path of the file to
            read (it is passed to ``os.stat`` and ``open``).

    Returns:
        A one-element list holding a dict with the keys ``source``,
        ``filename``, ``file_size``, ``mime_type``, ``created``,
        ``modified``, ``content`` and ``timestamp``. Any exception raised
        while reading is logged and an empty list is returned instead.

    Files larger than 100MB are not read in full: only the first and last
    1MB are kept, joined by a truncation marker.
    """
    try:
        file_stat = os.stat(file.name)

        # For very large files, keep only an excerpt from each end.
        if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
            logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")

            excerpt_bytes = 1 * 1024 * 1024  # 1MB from each end

            def _decode(raw: bytes) -> str:
                # Mirror what text-mode reading did: ignore undecodable
                # bytes (including a multi-byte character cut in half at an
                # excerpt boundary) and normalise newlines to "\n".
                text = raw.decode('utf-8', errors='ignore')
                return text.replace('\r\n', '\n').replace('\r', '\n')

            # Binary mode is required here: seeking a text-mode stream to an
            # arbitrary byte offset is undefined behaviour, and a text-mode
            # read(n) counts characters rather than bytes.
            with open(file.name, 'rb') as f:
                head = f.read(excerpt_bytes)  # First 1MB
                # Seek to the last 1MB
                f.seek(max(0, file_stat.st_size - excerpt_bytes))
                tail = f.read(excerpt_bytes)  # Last 1MB

            content = (
                _decode(head)
                + "\n...[Content truncated due to large file size]...\n"
                + _decode(tail)
            )
        else:
            # Regular file processing
            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

        return [{
            'source': 'filename',  # Assuming 'source' should be a string value
            'filename': os.path.basename(file.name),
            'file_size': file_stat.st_size,
            'mime_type': mimetypes.guess_type(file.name)[0],
            # NOTE(review): st_ctime is the metadata-change time on Unix, not
            # the creation time -- confirm this is what 'created' should hold.
            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            'content': content,
            'timestamp': datetime.now().isoformat()
        }]
    except Exception as e:
        # Best-effort: a file that cannot be read is logged and skipped.
        logger.error(f"File processing error: {e}")
        return []
|
266 |
+
|
267 |
def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
|
268 |
"""Clean and validate JSON data"""
|
269 |
try:
|