mike23415 committed on
Commit 415ccf1 · verified · 1 Parent(s): 66de5aa

Update app.py

Files changed (1)
  1. app.py +1176 -446
app.py CHANGED
@@ -11,14 +11,28 @@ import threading
11
  import time
12
  import logging
13
  from scipy import stats
14
  import matplotlib
15
- matplotlib.use('Agg') # Use non-interactive backend
16
  import matplotlib.pyplot as plt
17
  import seaborn as sns
18
  import io
19
  import base64
20
  from apscheduler.schedulers.background import BackgroundScheduler
21
  import atexit
22
 
23
  # Configure logging
24
  logging.basicConfig(level=logging.INFO)
@@ -30,31 +44,495 @@ CORS(app)
30
  # Configuration
31
  UPLOAD_FOLDER = '/tmp/uploads'
32
  PROCESSED_FOLDER = '/tmp/processed'
33
- MAX_FILE_SIZE = 512 * 1024 * 1024 # 512MB
34
- ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv'}
35
- FILE_EXPIRY_HOURS = 1
 
36
 
37
  # Ensure directories exist
38
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
39
- os.makedirs(PROCESSED_FOLDER, exist_ok=True)
40
 
41
- # File storage to track sessions and files
42
  file_storage = {}
43
 
44
  def allowed_file(filename):
45
  return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
46
 
47
- def get_file_age(filepath):
48
- """Get file age in hours"""
49
- if os.path.exists(filepath):
50
- file_time = os.path.getmtime(filepath)
51
- return (time.time() - file_time) / 3600
52
- return float('inf')
53
-
54
  def cleanup_old_files():
55
- """Remove files older than FILE_EXPIRY_HOURS"""
56
  try:
57
- for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER]:
 
58
  for root, dirs, files in os.walk(folder):
59
  for file in files:
60
  filepath = os.path.join(root, file)
@@ -62,36 +540,40 @@ def cleanup_old_files():
62
  os.remove(filepath)
63
  logger.info(f"Cleaned up old file: {filepath}")
64
 
65
- # Clean up file_storage entries
66
  current_time = datetime.now()
67
- sessions_to_remove = []
68
- for session_id, files in file_storage.items():
69
- files_to_remove = []
70
- for file_id, file_info in files.items():
71
- file_time = datetime.fromisoformat(file_info['timestamp'])
72
- if (current_time - file_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
73
- files_to_remove.append(file_id)
74
-
75
- for file_id in files_to_remove:
76
- del files[file_id]
77
-
78
- if not files:
79
- sessions_to_remove.append(session_id)
80
-
81
- for session_id in sessions_to_remove:
82
- del file_storage[session_id]
83
84
  except Exception as e:
85
  logger.error(f"Error during cleanup: {str(e)}")
86
 
87
- # Setup scheduler for automatic cleanup
88
- scheduler = BackgroundScheduler()
89
- scheduler.add_job(func=cleanup_old_files, trigger="interval", minutes=15)
90
- scheduler.start()
91
- atexit.register(lambda: scheduler.shutdown())
 
92
 
93
  def load_data_file(filepath, filename):
94
- """Load data from various file formats"""
95
  try:
96
  file_ext = filename.rsplit('.', 1)[1].lower()
97
 
@@ -105,278 +587,29 @@ def load_data_file(filepath, filename):
105
  return pd.read_parquet(filepath)
106
  elif file_ext == 'tsv':
107
  return pd.read_csv(filepath, sep='\t')
 
 
108
  else:
109
  raise ValueError(f"Unsupported file format: {file_ext}")
110
  except Exception as e:
111
  raise Exception(f"Error loading file: {str(e)}")
112
 
113
- def perform_basic_statistics(df, columns=None):
114
- """Perform basic statistical analysis"""
115
- if columns:
116
- df = df[columns]
117
-
118
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
119
- categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
120
-
121
- result = {
122
- 'numeric_summary': {},
123
- 'categorical_summary': {},
124
- 'general_info': {
125
- 'total_rows': len(df),
126
- 'total_columns': len(df.columns),
127
- 'numeric_columns': len(numeric_cols),
128
- 'categorical_columns': len(categorical_cols),
129
- 'missing_values': df.isnull().sum().to_dict()
130
- }
131
- }
132
-
133
- # Numeric statistics
134
- if numeric_cols:
135
- numeric_stats = df[numeric_cols].describe()
136
- result['numeric_summary'] = numeric_stats.to_dict()
137
-
138
- # Categorical statistics
139
- if categorical_cols:
140
- for col in categorical_cols:
141
- result['categorical_summary'][col] = {
142
- 'unique_values': df[col].nunique(),
143
- 'top_values': df[col].value_counts().head(10).to_dict(),
144
- 'missing_count': df[col].isnull().sum()
145
- }
146
-
147
- return result
148
-
149
- def perform_groupby_analysis(df, group_column, target_column, operation='mean', filters=None):
150
- """Perform group by analysis"""
151
- # Apply filters if provided
152
- if filters:
153
- for f in filters:
154
- col, op, val = f['column'], f['operator'], f['value']
155
- if op == '>':
156
- df = df[df[col] > val]
157
- elif op == '<':
158
- df = df[df[col] < val]
159
- elif op == '==':
160
- df = df[df[col] == val]
161
- elif op == '!=':
162
- df = df[df[col] != val]
163
- elif op == '>=':
164
- df = df[df[col] >= val]
165
- elif op == '<=':
166
- df = df[df[col] <= val]
167
-
168
- # Perform groupby operation
169
- grouped = df.groupby(group_column)[target_column]
170
-
171
- if operation == 'mean':
172
- result = grouped.mean()
173
- elif operation == 'sum':
174
- result = grouped.sum()
175
- elif operation == 'count':
176
- result = grouped.count()
177
- elif operation == 'max':
178
- result = grouped.max()
179
- elif operation == 'min':
180
- result = grouped.min()
181
- elif operation == 'std':
182
- result = grouped.std()
183
- else:
184
- raise ValueError(f"Unsupported operation: {operation}")
185
-
186
- return {
187
- 'result': result.to_dict(),
188
- 'operation': operation,
189
- 'group_column': group_column,
190
- 'target_column': target_column,
191
- 'total_groups': len(result)
192
- }
193
-
194
- def perform_correlation_analysis(df, columns=None, method='pearson'):
195
- """Perform correlation analysis"""
196
- if columns:
197
- df = df[columns]
198
-
199
- # Only numeric columns
200
- numeric_df = df.select_dtypes(include=[np.number])
201
-
202
- if numeric_df.empty:
203
- raise ValueError("No numeric columns found for correlation analysis")
204
-
205
- correlation_matrix = numeric_df.corr(method=method)
206
-
207
- return {
208
- 'correlation_matrix': correlation_matrix.to_dict(),
209
- 'method': method,
210
- 'columns': numeric_df.columns.tolist()
211
- }
212
-
213
- def detect_outliers(df, columns=None, method='iqr'):
214
- """Detect outliers in numeric columns"""
215
- if columns:
216
- df = df[columns]
217
-
218
- numeric_df = df.select_dtypes(include=[np.number])
219
- outliers = {}
220
-
221
- for col in numeric_df.columns:
222
- if method == 'iqr':
223
- Q1 = numeric_df[col].quantile(0.25)
224
- Q3 = numeric_df[col].quantile(0.75)
225
- IQR = Q3 - Q1
226
- lower_bound = Q1 - 1.5 * IQR
227
- upper_bound = Q3 + 1.5 * IQR
228
-
229
- outlier_indices = numeric_df[(numeric_df[col] < lower_bound) |
230
- (numeric_df[col] > upper_bound)].index.tolist()
231
-
232
- elif method == 'zscore':
233
- z_scores = np.abs(stats.zscore(numeric_df[col].dropna()))
234
- outlier_indices = numeric_df[z_scores > 3].index.tolist()
235
-
236
- outliers[col] = {
237
- 'count': len(outlier_indices),
238
- 'indices': outlier_indices[:100], # Limit to first 100
239
- 'percentage': (len(outlier_indices) / len(numeric_df)) * 100
240
- }
241
-
242
- return outliers
243
 
244
- def generate_visualization(df, chart_type, x_column, y_column=None, group_column=None):
245
- """Generate visualization and return base64 encoded image"""
246
- plt.figure(figsize=(10, 6))
247
-
248
- try:
249
- if chart_type == 'histogram':
250
- plt.hist(df[x_column], bins=30, alpha=0.7)
251
- plt.xlabel(x_column)
252
- plt.ylabel('Frequency')
253
- plt.title(f'Histogram of {x_column}')
254
-
255
- elif chart_type == 'scatter':
256
- if not y_column:
257
- raise ValueError("Y column required for scatter plot")
258
- plt.scatter(df[x_column], df[y_column], alpha=0.6)
259
- plt.xlabel(x_column)
260
- plt.ylabel(y_column)
261
- plt.title(f'{x_column} vs {y_column}')
262
-
263
- elif chart_type == 'bar':
264
- if group_column:
265
- grouped = df.groupby(group_column)[x_column].mean() if pd.api.types.is_numeric_dtype(df[x_column]) else df[group_column].value_counts()
266
- else:
267
- grouped = df[x_column].value_counts().head(20)
268
-
269
- grouped.plot(kind='bar')
270
- plt.xlabel(group_column or x_column)
271
- plt.ylabel('Count' if not pd.api.types.is_numeric_dtype(df[x_column]) else f'Mean {x_column}')
272
- plt.title(f'Bar Chart')
273
- plt.xticks(rotation=45)
274
-
275
- elif chart_type == 'line':
276
- if y_column:
277
- plt.plot(df[x_column], df[y_column])
278
- plt.xlabel(x_column)
279
- plt.ylabel(y_column)
280
- else:
281
- df[x_column].plot()
282
- plt.ylabel(x_column)
283
- plt.title('Line Chart')
284
-
285
- elif chart_type == 'box':
286
- if group_column:
287
- df.boxplot(column=x_column, by=group_column)
288
- else:
289
- df.boxplot(column=x_column)
290
- plt.title('Box Plot')
291
-
292
- plt.tight_layout()
293
-
294
- # Convert plot to base64 string
295
- img_buffer = io.BytesIO()
296
- plt.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
297
- img_buffer.seek(0)
298
- img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
299
- plt.close()
300
-
301
- return img_base64
302
-
303
- except Exception as e:
304
- plt.close()
305
- raise Exception(f"Error generating visualization: {str(e)}")
306
-
307
- def parse_natural_language_query(query, df_columns):
308
- """Simple natural language query parser"""
309
- query_lower = query.lower()
310
-
311
- # Define operation keywords
312
- operations = {
313
- 'average': 'mean', 'mean': 'mean', 'avg': 'mean',
314
- 'sum': 'sum', 'total': 'sum',
315
- 'count': 'count', 'number': 'count',
316
- 'max': 'max', 'maximum': 'max', 'highest': 'max',
317
- 'min': 'min', 'minimum': 'min', 'lowest': 'min'
318
- }
319
-
320
- # Find operation
321
- operation = 'mean' # default
322
- for keyword, op in operations.items():
323
- if keyword in query_lower:
324
- operation = op
325
- break
326
-
327
- # Find columns mentioned in query
328
- mentioned_columns = [col for col in df_columns if col.lower() in query_lower]
329
-
330
- # Simple parsing patterns
331
- if 'by' in query_lower and len(mentioned_columns) >= 2:
332
- # Group by analysis
333
- target_col = mentioned_columns[0]
334
- group_col = mentioned_columns[-1]
335
-
336
- return {
337
- 'analysisType': 'groupby',
338
- 'parameters': {
339
- 'groupByColumn': group_col,
340
- 'targetColumn': target_col,
341
- 'operation': operation
342
- }
343
- }
344
- elif 'correlation' in query_lower:
345
- return {
346
- 'analysisType': 'correlation',
347
- 'parameters': {
348
- 'columns': mentioned_columns if mentioned_columns else None
349
- }
350
- }
351
- elif any(word in query_lower for word in ['chart', 'plot', 'graph', 'visualize']):
352
- chart_type = 'bar' # default
353
- if 'scatter' in query_lower:
354
- chart_type = 'scatter'
355
- elif 'line' in query_lower:
356
- chart_type = 'line'
357
- elif 'histogram' in query_lower:
358
- chart_type = 'histogram'
359
-
360
- return {
361
- 'analysisType': 'visualization',
362
- 'parameters': {
363
- 'chartType': chart_type,
364
- 'xColumn': mentioned_columns[0] if mentioned_columns else None,
365
- 'yColumn': mentioned_columns[1] if len(mentioned_columns) > 1 else None
366
- }
367
- }
368
- else:
369
- # Default to basic statistics
370
- return {
371
- 'analysisType': 'statistics',
372
- 'parameters': {
373
- 'columns': mentioned_columns if mentioned_columns else None
374
- }
375
- }
376
 
377
  @app.route('/api/health', methods=['GET'])
378
  def health_check():
379
- return jsonify({'status': 'healthy', 'timestamp': datetime.now().isoformat()})
380
 
381
  @app.route('/api/upload', methods=['POST'])
382
  def upload_file():
@@ -397,9 +630,9 @@ def upload_file():
397
  return jsonify({'error': 'File type not supported'}), 400
398
 
399
  # Check file size
400
- file.seek(0, 2) # Seek to end
401
  file_size = file.tell()
402
- file.seek(0) # Reset to beginning
403
 
404
  if file_size > MAX_FILE_SIZE:
405
  return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
@@ -416,7 +649,7 @@ def upload_file():
416
  filepath = os.path.join(session_dir, f"{file_id}_{filename}")
417
  file.save(filepath)
418
 
419
- # Store file info
420
  if session_id not in file_storage:
421
  file_storage[session_id] = {}
422
 
@@ -424,13 +657,16 @@ def upload_file():
424
  'filename': filename,
425
  'filepath': filepath,
426
  'size': file_size,
427
- 'timestamp': datetime.now().isoformat()
 
 
428
  }
429
 
430
  return jsonify({
431
  'fileId': file_id,
432
  'filename': filename,
433
  'size': file_size,
 
434
  'message': 'File uploaded successfully'
435
  })
436
 
@@ -438,8 +674,9 @@ def upload_file():
438
  logger.error(f"Upload error: {str(e)}")
439
  return jsonify({'error': str(e)}), 500
440
 
441
- @app.route('/api/preview/<file_id>', methods=['GET'])
442
- def preview_file(file_id):
 
443
  try:
444
  session_id = request.args.get('sessionId')
445
  if not session_id or session_id not in file_storage:
@@ -449,33 +686,70 @@ def preview_file(file_id):
449
  return jsonify({'error': 'File not found'}), 404
450
 
451
  file_info = file_storage[session_id][file_id]
452
 
453
- # Load data and get preview
454
  df = load_data_file(file_info['filepath'], file_info['filename'])
455
 
456
- preview_data = {
457
- 'columns': df.columns.tolist(),
458
- 'dtypes': df.dtypes.astype(str).to_dict(),
459
- 'shape': df.shape,
460
- 'head': df.head(5).to_dict('records'),
461
- 'missing_values': df.isnull().sum().to_dict()
462
- }
463
 
464
- return jsonify(preview_data)
465
 
466
  except Exception as e:
467
- logger.error(f"Preview error: {str(e)}")
468
  return jsonify({'error': str(e)}), 500
469
 
470
- @app.route('/api/analyze', methods=['POST'])
471
- def analyze_data():
 
472
  try:
473
  data = request.get_json()
474
  session_id = data.get('sessionId')
475
  file_id = data.get('fileId')
476
- analysis_type = data.get('analysisType')
477
- parameters = data.get('parameters', {})
478
- natural_query = data.get('naturalQuery')
479
 
480
  if not all([session_id, file_id]):
481
  return jsonify({'error': 'Session ID and File ID required'}), 400
@@ -486,181 +760,637 @@ def analyze_data():
486
  file_info = file_storage[session_id][file_id]
487
  df = load_data_file(file_info['filepath'], file_info['filename'])
488
 
489
- # Handle natural language query
490
- if natural_query and not analysis_type:
491
- parsed_query = parse_natural_language_query(natural_query, df.columns.tolist())
492
- analysis_type = parsed_query['analysisType']
493
- parameters = parsed_query['parameters']
494
 
495
- result = {}
496
 
497
- if analysis_type == 'statistics':
498
- result = perform_basic_statistics(df, parameters.get('columns'))
499
-
500
- elif analysis_type == 'groupby':
501
- result = perform_groupby_analysis(
502
- df,
503
- parameters.get('groupByColumn'),
504
- parameters.get('targetColumn'),
505
- parameters.get('operation', 'mean'),
506
- parameters.get('filters')
507
- )
508
-
509
- elif analysis_type == 'correlation':
510
- result = perform_correlation_analysis(
511
- df,
512
- parameters.get('columns'),
513
- parameters.get('method', 'pearson')
514
- )
515
-
516
- elif analysis_type == 'outliers':
517
- result = detect_outliers(
518
- df,
519
- parameters.get('columns'),
520
- parameters.get('method', 'iqr')
521
- )
522
-
523
- elif analysis_type == 'visualization':
524
- chart_base64 = generate_visualization(
525
- df,
526
- parameters.get('chartType', 'bar'),
527
- parameters.get('xColumn'),
528
- parameters.get('yColumn'),
529
- parameters.get('groupColumn')
530
- )
531
- result = {
532
- 'chart': chart_base64,
533
- 'chartType': parameters.get('chartType', 'bar')
534
- }
535
-
536
- else:
537
- return jsonify({'error': 'Invalid analysis type'}), 400
538
 
539
- # Save result to processed folder
540
  result_id = str(uuid.uuid4())
541
  result_dir = os.path.join(PROCESSED_FOLDER, session_id)
542
  os.makedirs(result_dir, exist_ok=True)
543
 
544
- result_filepath = os.path.join(result_dir, f"{result_id}_result.json")
545
  with open(result_filepath, 'w') as f:
546
- json.dump(result, f, indent=2, default=str)
547
 
548
  return jsonify({
549
  'resultId': result_id,
550
- 'result': result,
551
- 'analysisType': analysis_type,
552
  'timestamp': datetime.now().isoformat()
553
  })
554
 
555
  except Exception as e:
556
- logger.error(f"Analysis error: {str(e)}")
557
  return jsonify({'error': str(e)}), 500
558
 
559
- @app.route('/api/files/<session_id>', methods=['GET'])
560
- def list_files(session_id):
 
561
  try:
562
  if session_id not in file_storage:
563
- return jsonify({'files': []})
564
-
565
- files = []
566
- for file_id, file_info in file_storage[session_id].items():
567
- # Check if file still exists
568
- if os.path.exists(file_info['filepath']):
569
- files.append({
570
- 'fileId': file_id,
571
- 'filename': file_info['filename'],
572
- 'size': file_info['size'],
573
- 'timestamp': file_info['timestamp']
574
- })
575
 
576
- return jsonify({'files': files})
577
 
578
  except Exception as e:
579
- logger.error(f"List files error: {str(e)}")
580
  return jsonify({'error': str(e)}), 500
581
 
582
- @app.route('/api/file/<file_id>', methods=['DELETE'])
583
- def delete_file(file_id):
 
584
  try:
585
- session_id = request.args.get('sessionId')
586
- if not session_id or session_id not in file_storage:
587
- return jsonify({'error': 'Invalid session'}), 400
 
 
588
 
589
- if file_id not in file_storage[session_id]:
590
  return jsonify({'error': 'File not found'}), 404
591
 
592
  file_info = file_storage[session_id][file_id]
 
593
 
594
- # Remove file from filesystem
595
- if os.path.exists(file_info['filepath']):
596
- os.remove(file_info['filepath'])
597
 
598
- # Remove from storage
599
- del file_storage[session_id][file_id]
600
 
601
- return jsonify({'message': 'File deleted successfully'})
602
 
603
  except Exception as e:
604
- logger.error(f"Delete error: {str(e)}")
605
  return jsonify({'error': str(e)}), 500
606
 
607
- @app.route('/api/download/<result_id>', methods=['GET'])
608
- def download_result(result_id):
 
609
  try:
610
- session_id = request.args.get('sessionId')
611
- format_type = request.args.get('format', 'json')
 
612
 
613
  if not session_id:
614
  return jsonify({'error': 'Session ID required'}), 400
615
 
616
- result_filepath = os.path.join(PROCESSED_FOLDER, session_id, f"{result_id}_result.json")
617
 
618
- if not os.path.exists(result_filepath):
619
- return jsonify({'error': 'Result not found'}), 404
620
 
621
- if format_type == 'json':
622
- return send_file(result_filepath, as_attachment=True,
623
- download_name=f"analysis_result_{result_id}.json")
624
- else:
625
- return jsonify({'error': 'Format not supported'}), 400
626
-
627
  except Exception as e:
628
- logger.error(f"Download error: {str(e)}")
629
  return jsonify({'error': str(e)}), 500
630
 
631
  @app.route('/', methods=['GET'])
632
  def home():
633
  return jsonify({
634
- 'message': 'Data Analytics API is running!',
635
- 'version': '1.0.0',
636
  'endpoints': {
637
- 'health': '/api/health',
638
- 'upload': '/api/upload',
639
- 'preview': '/api/preview/<file_id>',
640
- 'analyze': '/api/analyze',
641
- 'files': '/api/files/<session_id>',
642
- 'delete': '/api/file/<file_id>',
643
- 'download': '/api/download/<result_id>'
644
  },
645
  'timestamp': datetime.now().isoformat()
646
  })
647
 
648
- @app.errorhandler(404)
649
- def not_found(error):
650
- return jsonify({
651
- 'error': 'Endpoint not found',
652
- 'message': 'Please check the API documentation',
653
- 'available_endpoints': [
654
- '/',
655
- '/api/health',
656
- '/api/upload',
657
- '/api/preview/<file_id>',
658
- '/api/analyze',
659
- '/api/files/<session_id>',
660
- '/api/file/<file_id>',
661
- '/api/download/<result_id>'
662
- ]
663
- }), 404
664
-
665
  if __name__ == '__main__':
666
- app.run(host='0.0.0.0', port=7860, debug=True)
 
11
  import time
12
  import logging
13
  from scipy import stats
14
+ from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
15
+ from sklearn.model_selection import train_test_split, cross_val_score
16
+ from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
17
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
18
+ from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
19
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
20
+ from sklearn.decomposition import PCA
21
+ from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
22
+ from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
23
  import matplotlib
24
+ matplotlib.use('Agg')
25
  import matplotlib.pyplot as plt
26
  import seaborn as sns
27
+ import plotly.graph_objects as go
28
+ import plotly.express as px
29
+ from plotly.utils import PlotlyJSONEncoder
30
  import io
31
  import base64
32
  from apscheduler.schedulers.background import BackgroundScheduler
33
  import atexit
34
+ import warnings
35
+ warnings.filterwarnings('ignore')
36
 
37
  # Configure logging
38
  logging.basicConfig(level=logging.INFO)
 
44
  # Configuration
45
  UPLOAD_FOLDER = '/tmp/uploads'
46
  PROCESSED_FOLDER = '/tmp/processed'
47
+ MODELS_FOLDER = '/tmp/models'
48
+ MAX_FILE_SIZE = 1024 * 1024 * 1024 # 1GB for enterprise
49
+ ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv', 'feather'}
50
+ FILE_EXPIRY_HOURS = 24 # Extended for enterprise use
51
 
52
  # Ensure directories exist
53
+ for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER, MODELS_FOLDER]:
54
+ os.makedirs(folder, exist_ok=True)
55
 
56
+ # Enhanced file storage with metadata
57
  file_storage = {}
58
+ model_storage = {}
59
+ analysis_history = {}
60
+
61
+ class EnterpriseAnalytics:
62
+ """Enterprise-grade analytics engine"""
63
+
64
+ def __init__(self):
65
+ self.scaler = StandardScaler()
66
+ self.models = {}
67
+
68
+ def advanced_data_profiling(self, df):
69
+ """Comprehensive data profiling like enterprise tools"""
70
+ profile = {
71
+ 'dataset_overview': {
72
+ 'rows': len(df),
73
+ 'columns': len(df.columns),
74
+ 'memory_usage': df.memory_usage(deep=True).sum(),
75
+ 'duplicate_rows': df.duplicated().sum()
76
+ },
77
+ 'column_analysis': {},
78
+ 'data_quality': {},
79
+ 'relationships': {},
80
+ 'recommendations': []
81
+ }
82
+
83
+ for col in df.columns:
84
+ col_data = df[col]
85
+ col_profile = {
86
+ 'dtype': str(col_data.dtype),
87
+ 'missing_count': col_data.isnull().sum(),
88
+ 'missing_percentage': (col_data.isnull().sum() / len(df)) * 100,
89
+ 'unique_values': col_data.nunique(),
90
+ 'cardinality': col_data.nunique() / len(df) if len(df) > 0 else 0
91
+ }
92
+
93
+ if pd.api.types.is_numeric_dtype(col_data):
94
+ col_profile.update({
95
+ 'statistics': {
96
+ 'mean': col_data.mean(),
97
+ 'median': col_data.median(),
98
+ 'std': col_data.std(),
99
+ 'min': col_data.min(),
100
+ 'max': col_data.max(),
101
+ 'q25': col_data.quantile(0.25),
102
+ 'q75': col_data.quantile(0.75),
103
+ 'skewness': stats.skew(col_data.dropna()),
104
+ 'kurtosis': stats.kurtosis(col_data.dropna())
105
+ },
106
+ 'distribution': 'normal' if abs(stats.skew(col_data.dropna())) < 0.5 else 'skewed'
107
+ })
108
+ else:
109
+ col_profile.update({
110
+ 'top_categories': col_data.value_counts().head(10).to_dict(),
111
+ 'category_distribution': 'uniform' if col_data.value_counts().std() < col_data.value_counts().mean() * 0.5 else 'imbalanced'
112
+ })
113
+
114
+ profile['column_analysis'][col] = col_profile
115
+
116
+ # Data quality assessment
117
+ profile['data_quality'] = {
118
+ 'completeness_score': (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,
119
+ 'uniqueness_score': (df.nunique().sum() / (len(df) * len(df.columns))) * 100,
120
+ 'consistency_score': self._calculate_consistency_score(df)
121
+ }
122
+
123
+ # Generate recommendations
124
+ profile['recommendations'] = self._generate_recommendations(df, profile)
125
+
126
+ return profile
127
+
128
+ def _calculate_consistency_score(self, df):
129
+ """Calculate data consistency score"""
130
+ score = 100
131
+ for col in df.select_dtypes(include=['object']):
132
+ # Check for inconsistent formatting
133
+ values = df[col].dropna().astype(str)
134
+ if len(values) > 0:
135
+ # Check for mixed case
136
+ if len(set([v.lower() for v in values])) != len(set(values)):
137
+ score -= 5
138
+ # Check for leading/trailing spaces
139
+ if any(v != v.strip() for v in values):
140
+ score -= 5
141
+ return max(0, score)
142
+
143
+ def _generate_recommendations(self, df, profile):
144
+ """Generate actionable recommendations"""
145
+ recommendations = []
146
+
147
+ # High missing value columns
148
+ for col, analysis in profile['column_analysis'].items():
149
+ if analysis['missing_percentage'] > 20:
150
+ recommendations.append({
151
+ 'type': 'data_quality',
152
+ 'priority': 'high',
153
+ 'message': f"Column '{col}' has {analysis['missing_percentage']:.1f}% missing values. Consider imputation or removal.",
154
+ 'action': 'handle_missing_values'
155
+ })
156
+
157
+ # High cardinality categorical columns
158
+ for col, analysis in profile['column_analysis'].items():
159
+ if analysis.get('cardinality', 0) > 0.8 and df[col].dtype == 'object':
160
+ recommendations.append({
161
+ 'type': 'feature_engineering',
162
+ 'priority': 'medium',
163
+ 'message': f"Column '{col}' has high cardinality. Consider feature encoding or dimensionality reduction.",
164
+ 'action': 'encode_categorical'
165
+ })
166
+
167
+ # Skewed distributions
168
+ for col, analysis in profile['column_analysis'].items():
169
+ if 'statistics' in analysis and abs(analysis['statistics']['skewness']) > 2:
170
+ recommendations.append({
171
+ 'type': 'data_transformation',
172
+ 'priority': 'medium',
173
+ 'message': f"Column '{col}' is highly skewed. Consider log transformation or scaling.",
174
+ 'action': 'transform_distribution'
175
+ })
176
+
177
+ return recommendations
178
+
179
+ def advanced_feature_engineering(self, df, target_column=None):
180
+ """Enterprise-level feature engineering"""
181
+ engineered_features = {}
182
+
183
+ # Numeric feature engineering
184
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
185
+ for col in numeric_cols:
186
+ if col != target_column:
187
+ # Polynomial features
188
+ engineered_features[f'{col}_squared'] = df[col] ** 2
189
+ engineered_features[f'{col}_log'] = np.log1p(df[col].abs())
190
+
191
+ # Binning
192
+ engineered_features[f'{col}_binned'] = pd.cut(df[col], bins=5, labels=False)
193
+
194
+ # Rolling statistics (if data has time component)
195
+ if len(df) > 10:
196
+ engineered_features[f'{col}_rolling_mean'] = df[col].rolling(window=min(5, len(df)//2)).mean()
197
+
198
+ # Categorical feature engineering
199
+ categorical_cols = df.select_dtypes(include=['object']).columns
200
+ for col in categorical_cols:
201
+ if col != target_column:
202
+ # Frequency encoding
203
+ freq_map = df[col].value_counts().to_dict()
204
+ engineered_features[f'{col}_frequency'] = df[col].map(freq_map)
205
+
206
+ # Target encoding (if target is provided)
207
+ if target_column and target_column in df.columns:
208
+ target_mean = df.groupby(col)[target_column].mean()
209
+ engineered_features[f'{col}_target_encoded'] = df[col].map(target_mean)
210
+
211
+ # Interaction features
212
+ if len(numeric_cols) >= 2:
213
+ col_pairs = [(numeric_cols[i], numeric_cols[j])
214
+ for i in range(len(numeric_cols))
215
+ for j in range(i+1, min(i+3, len(numeric_cols)))] # Limit combinations
216
+
217
+ for col1, col2 in col_pairs:
218
+ if col1 != target_column and col2 != target_column:
219
+ engineered_features[f'{col1}_{col2}_interaction'] = df[col1] * df[col2]
220
+ engineered_features[f'{col1}_{col2}_ratio'] = df[col1] / (df[col2] + 1e-8)
221
+
222
+ return pd.DataFrame(engineered_features, index=df.index)
223
+
224
+ def automated_ml_pipeline(self, df, target_column, problem_type='auto'):
225
+ """Enterprise AutoML pipeline"""
226
+ results = {
227
+ 'preprocessing': {},
228
+ 'feature_selection': {},
229
+ 'models': {},
230
+ 'best_model': {},
231
+ 'predictions': {},
232
+ 'feature_importance': {}
233
+ }
234
+
235
+ # Determine problem type
236
+ if problem_type == 'auto':
237
+ if df[target_column].dtype in ['object', 'category'] or df[target_column].nunique() < 10:
238
+ problem_type = 'classification'
239
+ else:
240
+ problem_type = 'regression'
241
+
242
+ # Preprocessing
243
+ feature_cols = [col for col in df.columns if col != target_column]
244
+ X = df[feature_cols].copy()
245
+ y = df[target_column].copy()
246
+
247
+ # Handle missing values
248
+ X_numeric = X.select_dtypes(include=[np.number])
249
+ X_categorical = X.select_dtypes(include=['object'])
250
+
251
+ if not X_numeric.empty:
252
+ X_numeric = X_numeric.fillna(X_numeric.median())
253
+ if not X_categorical.empty:
254
+ X_categorical = X_categorical.fillna(X_categorical.mode().iloc[0] if not X_categorical.mode().empty else 'Unknown')
255
+
256
+ # Encode categorical variables
257
+ if not X_categorical.empty:
258
+ le = LabelEncoder()
259
+ for col in X_categorical.columns:
260
+ X_categorical[col] = le.fit_transform(X_categorical[col].astype(str))
261
+
262
+ X_processed = pd.concat([X_numeric, X_categorical], axis=1)
263
+
264
+ # Handle target variable for classification
265
+ if problem_type == 'classification' and y.dtype == 'object':
266
+ le_target = LabelEncoder()
267
+ y = le_target.fit_transform(y)
268
+
269
+ # Feature selection
270
+ if len(X_processed.columns) > 10:
271
+ selector = SelectKBest(f_regression, k=min(10, len(X_processed.columns)))
272
+ X_selected = selector.fit_transform(X_processed, y)
273
+ selected_features = X_processed.columns[selector.get_support()].tolist()
274
+ X_processed = pd.DataFrame(X_selected, columns=selected_features)
275
+ results['feature_selection']['selected_features'] = selected_features
276
+
277
+ # Split data
278
+ X_train, X_test, y_train, y_test = train_test_split(
279
+ X_processed, y, test_size=0.2, random_state=42
280
+ )
281
+
282
+ # Scale features
283
+ scaler = StandardScaler()
284
+ X_train_scaled = scaler.fit_transform(X_train)
285
+ X_test_scaled = scaler.transform(X_test)
286
+
287
+ # Model selection based on problem type
288
+ if problem_type == 'regression':
289
+ models = {
290
+ 'Linear Regression': LinearRegression(),
291
+ 'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
292
+ 'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
293
+ 'Ridge Regression': Ridge()
294
+ }
295
+ else:
296
+ models = {
297
+ 'Logistic Regression': LogisticRegression(random_state=42),
298
+ 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
299
+ 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
300
+ }
301
+
302
+ # Train and evaluate models
303
+ best_score = -np.inf if problem_type == 'regression' else 0
304
+ best_model_name = None
305
+
306
+ for name, model in models.items():
307
+ try:
308
+ # Cross-validation
309
+ if problem_type == 'regression':
310
+ scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
311
+ score = scores.mean()
312
+ else:
313
+ scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
314
+ score = scores.mean()
315
+
316
+ # Train final model
317
+ model.fit(X_train_scaled, y_train)
318
+ y_pred = model.predict(X_test_scaled)
319
+
320
+ if problem_type == 'regression':
321
+ test_score = r2_score(y_test, y_pred)
322
+ mse = mean_squared_error(y_test, y_pred)
323
+ results['models'][name] = {
324
+ 'cv_score': score,
325
+ 'test_r2': test_score,
326
+ 'test_mse': mse,
327
+ 'predictions': y_pred.tolist()
328
+ }
329
+ else:
330
+ test_score = model.score(X_test_scaled, y_test)
331
+ results['models'][name] = {
332
+ 'cv_score': score,
333
+ 'test_accuracy': test_score,
334
+ 'predictions': y_pred.tolist()
335
+ }
336
+
337
+ # Track best model
338
+ if score > best_score:
339
+ best_score = score
340
+ best_model_name = name
341
+
342
+ # Feature importance
343
+ if hasattr(model, 'feature_importances_'):
344
+ importance = dict(zip(X_processed.columns, model.feature_importances_))
345
+ results['feature_importance'] = dict(sorted(importance.items(), key=lambda x: x[1], reverse=True))
346
+
347
+ except Exception as e:
348
+ logger.error(f"Error training {name}: {str(e)}")
349
+ continue
350
+
351
+ results['best_model'] = {
352
+ 'name': best_model_name,
353
+ 'score': best_score,
354
+ 'problem_type': problem_type
355
+ }
356
+
357
+ results['preprocessing'] = {
358
+ 'numeric_features': X_numeric.columns.tolist() if not X_numeric.empty else [],
359
+ 'categorical_features': X_categorical.columns.tolist() if not X_categorical.empty else [],
360
+ 'scaling_applied': True,
361
+ 'missing_values_handled': True
362
+ }
363
+
364
+ return results
365
+
366
+ def advanced_clustering_analysis(self, df, n_clusters=None):
367
+ """Enterprise clustering with multiple algorithms"""
368
+ # Prepare data
369
+ numeric_df = df.select_dtypes(include=[np.number])
370
+ if numeric_df.empty:
371
+ raise ValueError("No numeric columns for clustering")
372
+
373
+ # Handle missing values
374
+ numeric_df = numeric_df.fillna(numeric_df.median())
375
+
376
+ # Scale data
377
+ scaler = StandardScaler()
378
+ X_scaled = scaler.fit_transform(numeric_df)
379
+
380
+ results = {
381
+ 'algorithms': {},
382
+ 'optimal_clusters': {},
383
+ 'silhouette_scores': {},
384
+ 'recommendations': []
385
+ }
386
+
387
+ # Determine optimal number of clusters if not provided
388
+ if n_clusters is None:
389
+ # Elbow method for K-means
390
+ inertias = []
391
+ k_range = range(2, min(11, len(numeric_df) // 2))
392
+
393
+ for k in k_range:
394
+ kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
395
+ kmeans.fit(X_scaled)
396
+ inertias.append(kmeans.inertia_)
397
+
398
+ # Find elbow point (simplified)
399
+ if len(inertias) > 2:
400
+ diffs = np.diff(inertias)
401
+ second_diffs = np.diff(diffs)
402
+ n_clusters = k_range[np.argmax(second_diffs) + 1] if len(second_diffs) > 0 else 3
403
+ else:
404
+ n_clusters = 3
405
+
406
+ # Apply multiple clustering algorithms
407
+ algorithms = {
408
+ 'K-Means': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
409
+ 'Hierarchical': AgglomerativeClustering(n_clusters=n_clusters),
410
+ 'DBSCAN': DBSCAN(eps=0.5, min_samples=5)
411
+ }
412
+
413
+ for name, algo in algorithms.items():
414
+ try:
415
+ if name == 'DBSCAN':
416
+ labels = algo.fit_predict(X_scaled)
417
+ n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
418
+ else:
419
+ labels = algo.fit_predict(X_scaled)
420
+ n_clusters_found = n_clusters
421
+
422
+ # Calculate silhouette score
423
+ if len(set(labels)) > 1:
424
+ from sklearn.metrics import silhouette_score
425
+ sil_score = silhouette_score(X_scaled, labels)
426
+ else:
427
+ sil_score = 0
428
+
429
+ results['algorithms'][name] = {
430
+ 'labels': labels.tolist(),
431
+ 'n_clusters': n_clusters_found,
432
+ 'silhouette_score': sil_score
433
+ }
434
+
435
+ results['silhouette_scores'][name] = sil_score
436
+
437
+ except Exception as e:
438
+ logger.error(f"Error in {name} clustering: {str(e)}")
439
+ continue
440
+
441
+ # PCA for visualization
442
+ if len(numeric_df.columns) > 2:
443
+ pca = PCA(n_components=2)
444
+ X_pca = pca.fit_transform(X_scaled)
445
+ results['pca_components'] = X_pca.tolist()
446
+ results['pca_explained_variance'] = pca.explained_variance_ratio_.tolist()
447
+
448
+ # Generate recommendations
449
+ best_algo = max(results['silhouette_scores'].items(), key=lambda x: x[1])[0]
450
+ results['recommendations'].append({
451
+ 'type': 'clustering',
452
+ 'message': f"Best clustering algorithm: {best_algo} with silhouette score: {results['silhouette_scores'][best_algo]:.3f}",
453
+ 'optimal_clusters': results['algorithms'][best_algo]['n_clusters']
454
+ })
455
+
456
+ return results
457
+
458
+ def time_series_analysis(self, df, date_column, value_column):
459
+ """Advanced time series analysis"""
460
+ # Convert date column
461
+ df[date_column] = pd.to_datetime(df[date_column])
462
+ df = df.sort_values(date_column)
463
+
464
+ # Set date as index
465
+ ts_df = df.set_index(date_column)[value_column]
466
+
467
+ results = {
468
+ 'trend_analysis': {},
469
+ 'seasonality': {},
470
+ 'forecasting': {},
471
+ 'anomalies': {},
472
+ 'statistics': {}
473
+ }
474
+
475
+ # Basic statistics
476
+ results['statistics'] = {
477
+ 'mean': ts_df.mean(),
478
+ 'std': ts_df.std(),
479
+ 'min': ts_df.min(),
480
+ 'max': ts_df.max(),
481
+ 'trend': 'increasing' if ts_df.iloc[-1] > ts_df.iloc[0] else 'decreasing'
482
+ }
483
+
484
+ # Trend analysis using linear regression
485
+ X = np.arange(len(ts_df)).reshape(-1, 1)
486
+ y = ts_df.values
487
+
488
+ lr = LinearRegression()
489
+ lr.fit(X, y)
490
+ trend_slope = lr.coef_[0]
491
+
492
+ results['trend_analysis'] = {
493
+ 'slope': trend_slope,
494
+ 'direction': 'increasing' if trend_slope > 0 else 'decreasing',
495
+ 'strength': abs(trend_slope)
496
+ }
497
+
498
+ # Simple anomaly detection using IQR
499
+ Q1 = ts_df.quantile(0.25)
500
+ Q3 = ts_df.quantile(0.75)
501
+ IQR = Q3 - Q1
502
+
503
+ anomalies = ts_df[(ts_df < Q1 - 1.5 * IQR) | (ts_df > Q3 + 1.5 * IQR)]
504
+
505
+ results['anomalies'] = {
506
+ 'count': len(anomalies),
507
+ 'dates': anomalies.index.strftime('%Y-%m-%d').tolist(),
508
+ 'values': anomalies.values.tolist()
509
+ }
510
+
511
+ # Simple forecasting (moving average)
512
+ window = min(7, len(ts_df) // 4)
513
+ if window > 0:
514
+ forecast_periods = min(10, len(ts_df) // 4)
515
+ last_values = ts_df.tail(window).mean()
516
+
517
+ results['forecasting'] = {
518
+ 'method': 'moving_average',
519
+ 'forecast_periods': forecast_periods,
520
+ 'forecast_values': [last_values] * forecast_periods
521
+ }
522
+
523
+ return results
524
+
525
+ # Initialize analytics engine
526
+ analytics_engine = EnterpriseAnalytics()
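The class above can also be exercised outside the Flask routes. A minimal sketch, assuming this file is importable as `app` (the module name is an assumption) and using an invented two-column DataFrame:

import numpy as np
import pandas as pd
from app import EnterpriseAnalytics  # assumed module name; adjust to wherever this file lives

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'revenue': rng.normal(100, 15, 200),                     # invented numeric column
    'region': rng.choice(['north', 'south', 'east'], 200),   # invented categorical column
})

engine = EnterpriseAnalytics()
profile = engine.advanced_data_profiling(demo)
print(profile['data_quality']['completeness_score'])   # share of non-null cells, in percent
print(profile['recommendations'])                      # empty here: the toy data has no quality issues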
527
 
528
  def allowed_file(filename):
529
  return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
530
 
531
  def cleanup_old_files():
532
+ """Enhanced cleanup with model cleanup"""
533
  try:
534
+ # Existing cleanup logic...
535
+ for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER, MODELS_FOLDER]:
536
  for root, dirs, files in os.walk(folder):
537
  for file in files:
538
  filepath = os.path.join(root, file)
 
540
  os.remove(filepath)
541
  logger.info(f"Cleaned up old file: {filepath}")
542
 
543
+ # Clean up storage entries
544
  current_time = datetime.now()
545
+ for storage in [file_storage, model_storage, analysis_history]:
546
+ sessions_to_remove = []
547
+ for session_id, session_data in storage.items():
548
+ if isinstance(session_data, dict):
549
+ items_to_remove = []
550
+ for item_id, item_info in session_data.items():
551
+ if 'timestamp' in item_info:
552
+ item_time = datetime.fromisoformat(item_info['timestamp'])
553
+ if (current_time - item_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
554
+ items_to_remove.append(item_id)
555
+
556
+ for item_id in items_to_remove:
557
+ del session_data[item_id]
558
+
559
+ if not session_data:
560
+ sessions_to_remove.append(session_id)
561
 
562
+ for session_id in sessions_to_remove:
563
+ del storage[session_id]
564
+
565
  except Exception as e:
566
  logger.error(f"Error during cleanup: {str(e)}")
567
 
568
+ def get_file_age(filepath):
569
+ """Get file age in hours"""
570
+ if os.path.exists(filepath):
571
+ file_time = os.path.getmtime(filepath)
572
+ return (time.time() - file_time) / 3600
573
+ return float('inf')
574
 
575
  def load_data_file(filepath, filename):
576
+ """Enhanced data loading with more formats"""
577
  try:
578
  file_ext = filename.rsplit('.', 1)[1].lower()
579
 
 
587
  return pd.read_parquet(filepath)
588
  elif file_ext == 'tsv':
589
  return pd.read_csv(filepath, sep='\t')
590
+ elif file_ext == 'feather':
591
+ return pd.read_feather(filepath)
592
  else:
593
  raise ValueError(f"Unsupported file format: {file_ext}")
594
  except Exception as e:
595
  raise Exception(f"Error loading file: {str(e)}")
596
 
597
+ # Setup enhanced scheduler
598
+ scheduler = BackgroundScheduler()
599
+ scheduler.add_job(func=cleanup_old_files, trigger="interval", hours=1)
600
+ scheduler.start()
601
+ atexit.register(lambda: scheduler.shutdown())
 
 
602
 
603
+ # API Endpoints
 
604
 
605
  @app.route('/api/health', methods=['GET'])
606
  def health_check():
607
+ return jsonify({
608
+ 'status': 'healthy',
609
+ 'version': '2.0.0-enterprise',
610
+ 'features': ['advanced_profiling', 'automl', 'clustering', 'time_series'],
611
+ 'timestamp': datetime.now().isoformat()
612
+ })
613
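A quick client-side check of the enriched health payload; the base URL assumes the host and port the previous version ran on (7860):

import requests

BASE = "http://localhost:7860"   # assumed host/port

health = requests.get(f"{BASE}/api/health", timeout=5).json()
print(health["status"], health["version"], health["features"])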
 
614
  @app.route('/api/upload', methods=['POST'])
615
  def upload_file():
 
630
  return jsonify({'error': 'File type not supported'}), 400
631
 
632
  # Check file size
633
+ file.seek(0, 2)
634
  file_size = file.tell()
635
+ file.seek(0)
636
 
637
  if file_size > MAX_FILE_SIZE:
638
  return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
 
649
  filepath = os.path.join(session_dir, f"{file_id}_{filename}")
650
  file.save(filepath)
651
 
652
+ # Enhanced file metadata
653
  if session_id not in file_storage:
654
  file_storage[session_id] = {}
655
 
 
657
  'filename': filename,
658
  'filepath': filepath,
659
  'size': file_size,
660
+ 'timestamp': datetime.now().isoformat(),
661
+ 'format': filename.rsplit('.', 1)[1].lower(),
662
+ 'status': 'uploaded'
663
  }
664
 
665
  return jsonify({
666
  'fileId': file_id,
667
  'filename': filename,
668
  'size': file_size,
669
+ 'format': filename.rsplit('.', 1)[1].lower(),
670
  'message': 'File uploaded successfully'
671
  })
672
 
 
674
  logger.error(f"Upload error: {str(e)}")
675
  return jsonify({'error': str(e)}), 500
676
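A rough client sketch that exercises the upload endpoint together with the new profiling route. The multipart field names, the client-generated session id, and the sales.csv path are assumptions, since the handler's request parsing is not fully shown in this diff:

import uuid
import requests

BASE = "http://localhost:7860"             # assumed host/port
session_id = str(uuid.uuid4())             # client-side session id, an assumption

# Upload a data file (field names and file path are placeholders)
with open("sales.csv", "rb") as fh:
    uploaded = requests.post(f"{BASE}/api/upload",
                             files={"file": ("sales.csv", fh)},
                             data={"sessionId": session_id}).json()

# Profile the uploaded file with the new endpoint
profile = requests.get(f"{BASE}/api/profile/{uploaded['fileId']}",
                       params={"sessionId": session_id}).json()
print(profile["dataset_overview"])
print(profile["data_quality"])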
 
677
+ @app.route('/api/profile/<file_id>', methods=['GET'])
678
+ def profile_data(file_id):
679
+ """Advanced data profiling endpoint"""
680
  try:
681
  session_id = request.args.get('sessionId')
682
  if not session_id or session_id not in file_storage:
 
686
  return jsonify({'error': 'File not found'}), 404
687
 
688
  file_info = file_storage[session_id][file_id]
689
+ df = load_data_file(file_info['filepath'], file_info['filename'])
690
+
691
+ # Perform advanced profiling
692
+ profile = analytics_engine.advanced_data_profiling(df)
693
+
694
+ return jsonify(profile)
695
+
696
+ except Exception as e:
697
+ logger.error(f"Profiling error: {str(e)}")
698
+ return jsonify({'error': str(e)}), 500
699
+
700
+ @app.route('/api/automl', methods=['POST'])
701
+ def run_automl():
702
+ """Automated ML pipeline endpoint"""
703
+ try:
704
+ data = request.get_json()
705
+ session_id = data.get('sessionId')
706
+ file_id = data.get('fileId')
707
+ target_column = data.get('targetColumn')
708
+ problem_type = data.get('problemType', 'auto')
709
 
710
+ if not all([session_id, file_id, target_column]):
711
+ return jsonify({'error': 'Session ID, File ID, and target column required'}), 400
712
+
713
+ if session_id not in file_storage or file_id not in file_storage[session_id]:
714
+ return jsonify({'error': 'File not found'}), 404
715
+
716
+ file_info = file_storage[session_id][file_id]
717
  df = load_data_file(file_info['filepath'], file_info['filename'])
718
 
719
+ if target_column not in df.columns:
720
+ return jsonify({'error': f'Target column {target_column} not found'}), 400
721
 
722
+ # Run AutoML pipeline
723
+ results = analytics_engine.automated_ml_pipeline(df, target_column, problem_type)
724
+
725
+ # Save results
726
+ result_id = str(uuid.uuid4())
727
+ result_dir = os.path.join(PROCESSED_FOLDER, session_id)
728
+ os.makedirs(result_dir, exist_ok=True)
729
+
730
+ result_filepath = os.path.join(result_dir, f"{result_id}_automl.json")
731
+ with open(result_filepath, 'w') as f:
732
+ json.dump(results, f, indent=2, default=str)
733
+
734
+ return jsonify({
735
+ 'resultId': result_id,
736
+ 'results': results,
737
+ 'analysisType': 'automl',
738
+ 'timestamp': datetime.now().isoformat()
739
+ })
740
 
741
  except Exception as e:
742
+ logger.error(f"AutoML error: {str(e)}")
743
  return jsonify({'error': str(e)}), 500
744
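A hedged client sketch for the AutoML route; the session and file ids are placeholders returned by /api/upload, and revenue is an invented target column:

import requests

BASE = "http://localhost:7860"             # assumed host/port
payload = {
    "sessionId": "my-session",             # placeholder: id used when uploading
    "fileId": "my-file-id",                # placeholder: returned by /api/upload
    "targetColumn": "revenue",             # illustrative column name
    "problemType": "auto",                 # let the server infer regression vs. classification
}
automl = requests.post(f"{BASE}/api/automl", json=payload).json()
print(automl["results"]["best_model"])             # winning model name, CV score, problem type
print(automl["results"]["feature_importance"])     # populated when a tree-based model was trained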
 
745
+ @app.route('/api/clustering', methods=['POST'])
746
+ def run_clustering():
747
+ """Advanced clustering analysis endpoint"""
748
  try:
749
  data = request.get_json()
750
  session_id = data.get('sessionId')
751
  file_id = data.get('fileId')
752
+ n_clusters = data.get('nClusters')
 
 
753
 
754
  if not all([session_id, file_id]):
755
  return jsonify({'error': 'Session ID and File ID required'}), 400
 
760
  file_info = file_storage[session_id][file_id]
761
  df = load_data_file(file_info['filepath'], file_info['filename'])
762
 
763
+ # Run clustering analysis
764
+ results = analytics_engine.advanced_clustering_analysis(df, n_clusters)
765
 
766
+ # Save results
767
+ result_id = str(uuid.uuid4())
768
+ result_dir = os.path.join(PROCESSED_FOLDER, session_id)
769
+ os.makedirs(result_dir, exist_ok=True)
770
 
771
+ result_filepath = os.path.join(result_dir, f"{result_id}_clustering.json")
772
+ with open(result_filepath, 'w') as f:
773
+ json.dump(results, f, indent=2, default=str)
774
+
775
+ return jsonify({
776
+ 'resultId': result_id,
777
+ 'results': results,
778
+ 'analysisType': 'clustering',
779
+ 'timestamp': datetime.now().isoformat()
780
+ })
781
 
782
+ except Exception as e:
783
+ logger.error(f"Clustering error: {str(e)}")
784
+ return jsonify({'error': str(e)}), 500
785
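On the client side, the best algorithm can be picked the same way the handler builds its recommendation, by taking the highest silhouette score. A sketch with placeholder ids and an assumed base URL:

import requests

BASE = "http://localhost:7860"             # assumed host/port
resp = requests.post(f"{BASE}/api/clustering",
                     json={"sessionId": "my-session",      # placeholder ids
                           "fileId": "my-file-id"}).json() # omit nClusters to use the elbow heuristic

scores = resp["results"]["silhouette_scores"]
best = max(scores, key=scores.get)                    # same rule the server uses for its recommendation
labels = resp["results"]["algorithms"][best]["labels"]
print(best, round(scores[best], 3), "->", len(set(labels)), "clusters")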
+
786
+ @app.route('/api/timeseries', methods=['POST'])
787
+ def run_timeseries():
788
+ """Time series analysis endpoint"""
789
+ try:
790
+ data = request.get_json()
791
+ session_id = data.get('sessionId')
792
+ file_id = data.get('fileId')
793
+ date_column = data.get('dateColumn')
794
+ value_column = data.get('valueColumn')
795
+
796
+ if not all([session_id, file_id, date_column, value_column]):
797
+ return jsonify({'error': 'Session ID, File ID, date column, and value column required'}), 400
798
+
799
+ if session_id not in file_storage or file_id not in file_storage[session_id]:
800
+ return jsonify({'error': 'File not found'}), 404
801
+
802
+ file_info = file_storage[session_id][file_id]
803
+ df = load_data_file(file_info['filepath'], file_info['filename'])
804
+
805
+ if date_column not in df.columns or value_column not in df.columns:
806
+ return jsonify({'error': 'Date or value column not found'}), 400
807
+
808
+ # Run time series analysis
809
+ results = analytics_engine.time_series_analysis(df, date_column, value_column)
810
+
811
+ # Save results
812
  result_id = str(uuid.uuid4())
813
  result_dir = os.path.join(PROCESSED_FOLDER, session_id)
814
  os.makedirs(result_dir, exist_ok=True)
815
 
816
+ result_filepath = os.path.join(result_dir, f"{result_id}_timeseries.json")
817
  with open(result_filepath, 'w') as f:
818
+ json.dump(results, f, indent=2, default=str)
819
 
820
  return jsonify({
821
  'resultId': result_id,
822
+ 'results': results,
823
+ 'analysisType': 'timeseries',
824
  'timestamp': datetime.now().isoformat()
825
  })
826
 
827
  except Exception as e:
828
+ logger.error(f"Time series error: {str(e)}")
829
  return jsonify({'error': str(e)}), 500
830
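The trend component in time_series_analysis is an ordinary least-squares fit of the series against its integer index. The same idea in isolation, on an invented toy series:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Toy daily series with a mild upward drift (dates and values are invented)
dates = pd.date_range("2024-01-01", periods=60, freq="D")
values = np.linspace(10.0, 16.0, 60) + np.random.default_rng(0).normal(0, 0.5, 60)
ts = pd.Series(values, index=dates)

X = np.arange(len(ts)).reshape(-1, 1)     # integer time index as the single feature
slope = LinearRegression().fit(X, ts.values).coef_[0]
print("direction:", "increasing" if slope > 0 else "decreasing", "| slope per step:", round(slope, 4))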
 
831
+ @app.route('/api/feature-engineering', methods=['POST'])
832
+ def run_feature_engineering():
833
+ """Feature engineering endpoint"""
834
  try:
835
+ data = request.get_json()
836
+ session_id = data.get('sessionId')
837
+ file_id = data.get('fileId')
838
+ target_column = data.get('targetColumn')
839
+
840
+ if not all([session_id, file_id]):
841
+ return jsonify({'error': 'Session ID and File ID required'}), 400
842
+
843
+ if session_id not in file_storage or file_id not in file_storage[session_id]:
844
+ return jsonify({'error': 'File not found'}), 404
845
+
846
+ file_info = file_storage[session_id][file_id]
847
+ df = load_data_file(file_info['filepath'], file_info['filename'])
848
+
849
+ # Generate engineered features
850
+ engineered_df = analytics_engine.advanced_feature_engineering(df, target_column)
851
+
852
+ # Save engineered dataset
853
+ engineered_file_id = str(uuid.uuid4())
854
+ engineered_filepath = os.path.join(
855
+ PROCESSED_FOLDER, session_id, f"{engineered_file_id}_engineered.csv"
856
+ )
857
+ os.makedirs(os.path.dirname(engineered_filepath), exist_ok=True)
858
+
859
+ # Combine original and engineered features
860
+ combined_df = pd.concat([df, engineered_df], axis=1)
861
+ combined_df.to_csv(engineered_filepath, index=False)
862
+
863
+ # Store engineered file info
864
  if session_id not in file_storage:
865
+ file_storage[session_id] = {}
866
 
867
+ file_storage[session_id][engineered_file_id] = {
868
+ 'filename': f"{file_info['filename'].split('.')[0]}_engineered.csv",
869
+ 'filepath': engineered_filepath,
870
+ 'size': os.path.getsize(engineered_filepath),
871
+ 'timestamp': datetime.now().isoformat(),
872
+ 'format': 'csv',
873
+ 'status': 'engineered',
874
+ 'parent_file': file_id
875
+ }
876
+
877
+ return jsonify({
878
+ 'engineeredFileId': engineered_file_id,
879
+ 'originalFeatures': len(df.columns),
880
+ 'engineeredFeatures': len(engineered_df.columns),
881
+ 'totalFeatures': len(combined_df.columns),
882
+ 'featureNames': engineered_df.columns.tolist(),
883
+ 'message': 'Feature engineering completed successfully'
884
+ })
885
 
886
  except Exception as e:
887
+ logger.error(f"Feature engineering error: {str(e)}")
888
  return jsonify({'error': str(e)}), 500
889
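A toy illustration of two of the transforms advanced_feature_engineering produces, frequency encoding and interaction/ratio features, on an invented frame:

import pandas as pd

df = pd.DataFrame({
    "city":  ["A", "A", "B", "C"],          # invented categorical column
    "price": [10.0, 12.0, 8.0, 15.0],       # invented numeric columns
    "qty":   [3, 1, 4, 2],
})

freq_map = df["city"].value_counts().to_dict()           # {'A': 2, 'B': 1, 'C': 1}
df["city_frequency"] = df["city"].map(freq_map)          # frequency encoding
df["price_qty_interaction"] = df["price"] * df["qty"]    # interaction term
df["price_qty_ratio"] = df["price"] / (df["qty"] + 1e-8) # ratio with the same epsilon guard
print(df)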
 
890
+ @app.route('/api/advanced-visualization', methods=['POST'])
891
+ def create_advanced_visualization():
892
+ """Advanced visualization endpoint with Plotly"""
893
  try:
894
+ data = request.get_json()
895
+ session_id = data.get('sessionId')
896
+ file_id = data.get('fileId')
897
+ chart_type = data.get('chartType')
898
+ parameters = data.get('parameters', {})
899
 
900
+ if not all([session_id, file_id, chart_type]):
901
+ return jsonify({'error': 'Session ID, File ID, and chart type required'}), 400
902
+
903
+ if session_id not in file_storage or file_id not in file_storage[session_id]:
904
  return jsonify({'error': 'File not found'}), 404
905
 
906
  file_info = file_storage[session_id][file_id]
907
+ df = load_data_file(file_info['filepath'], file_info['filename'])
908
 
909
+ # Create advanced visualizations using Plotly
910
+ if chart_type == 'correlation_heatmap':
911
+ numeric_df = df.select_dtypes(include=[np.number])
912
+ corr_matrix = numeric_df.corr()
913
+
914
+ fig = px.imshow(corr_matrix,
915
+ title='Correlation Heatmap',
916
+ color_continuous_scale='RdBu_r',
917
+ aspect='auto')
918
+
919
+ elif chart_type == 'distribution_plots':
920
+ column = parameters.get('column')
921
+ if not column or column not in df.columns:
922
+ return jsonify({'error': 'Column not specified or not found'}), 400
923
+
924
+ fig = px.histogram(df, x=column,
925
+ title=f'Distribution of {column}',
926
+ marginal='box')
927
+
928
+ elif chart_type == 'scatter_matrix':
929
+ columns = parameters.get('columns', df.select_dtypes(include=[np.number]).columns[:4])
930
+ fig = px.scatter_matrix(df[columns],
931
+ title='Scatter Matrix',
932
+ dimensions=columns)
933
+
934
+ elif chart_type == 'parallel_coordinates':
935
+ columns = parameters.get('columns', df.select_dtypes(include=[np.number]).columns[:5])
936
+ fig = px.parallel_coordinates(df,
937
+ dimensions=columns,
938
+ title='Parallel Coordinates Plot')
939
+
940
+ elif chart_type == 'box_plots':
941
+ columns = parameters.get('columns', df.select_dtypes(include=[np.number]).columns[:5])
942
+ fig = px.box(df[columns],
943
+ title='Box Plots Comparison')
944
+
945
+ elif chart_type == '3d_scatter':
946
+ x_col = parameters.get('x_column')
947
+ y_col = parameters.get('y_column')
948
+ z_col = parameters.get('z_column')
949
+
950
+ if not all([x_col, y_col, z_col]):
951
+ return jsonify({'error': '3D scatter requires x, y, and z columns'}), 400
952
+
953
+ fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col,
954
+ title=f'3D Scatter: {x_col} vs {y_col} vs {z_col}')
955
+
956
+ else:
957
+ return jsonify({'error': 'Unsupported chart type'}), 400
958
 
959
+ # Convert to JSON
960
+ chart_json = json.dumps(fig, cls=PlotlyJSONEncoder)
961
 
962
+ return jsonify({
963
+ 'chart': chart_json,
964
+ 'chartType': chart_type,
965
+ 'timestamp': datetime.now().isoformat()
966
+ })
967
 
968
  except Exception as e:
969
+ logger.error(f"Visualization error: {str(e)}")
970
  return jsonify({'error': str(e)}), 500
971
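Because the handler serializes the figure with PlotlyJSONEncoder, a client can rebuild it with plotly.io.from_json. A sketch with placeholder ids and an assumed base URL:

import requests
import plotly.io as pio

BASE = "http://localhost:7860"             # assumed host/port
resp = requests.post(f"{BASE}/api/advanced-visualization",
                     json={"sessionId": "my-session",     # placeholder ids
                           "fileId": "my-file-id",
                           "chartType": "correlation_heatmap"}).json()

fig = pio.from_json(resp["chart"])         # rebuild the Plotly figure from its JSON string
fig.show()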
 
972
+@app.route('/api/data-quality', methods=['POST'])
+def assess_data_quality():
+    """Data quality assessment endpoint"""
    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+
+        if not all([session_id, file_id]):
+            return jsonify({'error': 'Session ID and File ID required'}), 400
+
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        quality_report = {
+            'overall_score': 0,
+            'dimensions': {
+                'completeness': {},
+                'consistency': {},
+                'validity': {},
+                'uniqueness': {},
+                'accuracy': {}
+            },
+            'issues': [],
+            'recommendations': []
+        }
+
+        # Completeness assessment
+        total_cells = len(df) * len(df.columns)
+        missing_cells = int(df.isnull().sum().sum())
+        completeness_score = ((total_cells - missing_cells) / total_cells) * 100 if total_cells else 0.0
+
+        quality_report['dimensions']['completeness'] = {
+            'score': completeness_score,
+            'missing_values': df.isnull().sum().to_dict(),
+            'missing_percentage': (df.isnull().sum() / len(df) * 100).to_dict()
+        }
+
+        # Consistency assessment
+        consistency_issues = []
+        for col in df.select_dtypes(include=['object']):
+            # Check for inconsistent formatting
+            values = df[col].dropna().astype(str)
+            if len(values) > 0:
+                # Mixed case issues
+                lowercase_values = set(v.lower() for v in values)
+                if len(lowercase_values) != len(set(values)):
+                    consistency_issues.append(f"Column '{col}' has mixed case values")
+
+                # Leading/trailing spaces
+                if any(v != v.strip() for v in values):
+                    consistency_issues.append(f"Column '{col}' has leading/trailing spaces")
+
+        consistency_score = max(0, 100 - len(consistency_issues) * 10)
+        quality_report['dimensions']['consistency'] = {
+            'score': consistency_score,
+            'issues': consistency_issues
+        }
+
+        # Validity assessment (basic data type validation)
+        validity_issues = []
+        for col in df.columns:
+            if df[col].dtype == 'object':
+                # Check for potential numeric columns stored as strings
+                try:
+                    pd.to_numeric(df[col].dropna(), errors='raise')
+                    validity_issues.append(f"Column '{col}' appears to be numeric but stored as text")
+                except (ValueError, TypeError):
+                    pass
+
+        validity_score = max(0, 100 - len(validity_issues) * 15)
+        quality_report['dimensions']['validity'] = {
+            'score': validity_score,
+            'issues': validity_issues
+        }
+
+        # Uniqueness assessment
+        uniqueness_scores = {}
+        for col in df.columns:
+            unique_ratio = df[col].nunique() / len(df) if len(df) > 0 else 0
+            uniqueness_scores[col] = unique_ratio * 100
+
+        avg_uniqueness = float(np.mean(list(uniqueness_scores.values()))) if uniqueness_scores else 0.0
+        quality_report['dimensions']['uniqueness'] = {
+            'score': avg_uniqueness,
+            'column_scores': uniqueness_scores,
+            'duplicate_rows': int(df.duplicated().sum())
+        }
+
+        # Overall score calculation
+        dimension_scores = [
+            completeness_score,
+            consistency_score,
+            validity_score,
+            avg_uniqueness
+        ]
+        quality_report['overall_score'] = float(np.mean(dimension_scores))
+
+        # Generate recommendations
+        if completeness_score < 80:
+            quality_report['recommendations'].append({
+                'type': 'completeness',
+                'priority': 'high',
+                'message': 'Consider imputing missing values or removing incomplete records'
+            })
+
+        if consistency_score < 70:
+            quality_report['recommendations'].append({
+                'type': 'consistency',
+                'priority': 'medium',
+                'message': 'Standardize text formatting and remove extra spaces'
+            })
+
+        if validity_score < 80:
+            quality_report['recommendations'].append({
+                'type': 'validity',
+                'priority': 'medium',
+                'message': 'Review data types and convert where appropriate'
+            })
+
+        return jsonify(quality_report)
+
+    except Exception as e:
+        logger.error(f"Data quality error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+
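+# Illustrative request for the data-quality endpoint above; the key names are the
+# ones read by assess_data_quality. The response carries per-dimension scores,
+# detected issues, and prioritized recommendations.
+#
+#   POST /api/data-quality
+#   {"sessionId": "<session-uuid>", "fileId": "<file-uuid>"}
+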
+@app.route('/api/statistical-tests', methods=['POST'])
+def run_statistical_tests():
+    """Statistical hypothesis testing endpoint"""
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        test_type = data.get('testType')
+        parameters = data.get('parameters', {})
+
+        if not all([session_id, file_id, test_type]):
+            return jsonify({'error': 'Session ID, File ID, and test type required'}), 400
+
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        results = {'test_type': test_type, 'results': {}}
+
+        if test_type == 'normality':
+            column = parameters.get('column')
+            if not column or column not in df.columns:
+                return jsonify({'error': 'Column not specified or not found'}), 400
+
+            data_col = df[column].dropna()
+            if len(data_col) < 3:
+                return jsonify({'error': 'At least 3 non-null values required for normality tests'}), 400
+
+            # Shapiro-Wilk test (sampled to at most 5000 values for performance)
+            shapiro_stat, shapiro_p = stats.shapiro(data_col.sample(min(5000, len(data_col))))
+
+            # Anderson-Darling test
+            anderson_result = stats.anderson(data_col)
+
+            results['results'] = {
+                'shapiro_wilk': {
+                    'statistic': float(shapiro_stat),
+                    'p_value': float(shapiro_p),
+                    'is_normal': bool(shapiro_p > 0.05)
+                },
+                'anderson_darling': {
+                    'statistic': float(anderson_result.statistic),
+                    'critical_values': anderson_result.critical_values.tolist(),
+                    'significance_levels': anderson_result.significance_level.tolist()
+                }
+            }
+
+        elif test_type == 'correlation_significance':
+            col1 = parameters.get('column1')
+            col2 = parameters.get('column2')
+
+            if not all([col1, col2]) or col1 not in df.columns or col2 not in df.columns:
+                return jsonify({'error': 'Both columns must be specified and exist'}), 400
+
+            # Drop rows where either column is missing so the value pairs stay aligned
+            paired = df[[col1, col2]].dropna()
+
+            # Pearson correlation
+            pearson_corr, pearson_p = stats.pearsonr(paired[col1], paired[col2])
+
+            # Spearman correlation
+            spearman_corr, spearman_p = stats.spearmanr(paired[col1], paired[col2])
+
+            results['results'] = {
+                'pearson': {
+                    'correlation': float(pearson_corr),
+                    'p_value': float(pearson_p),
+                    'significant': bool(pearson_p < 0.05)
+                },
+                'spearman': {
+                    'correlation': float(spearman_corr),
+                    'p_value': float(spearman_p),
+                    'significant': bool(spearman_p < 0.05)
+                }
+            }
+
+        elif test_type == 'group_comparison':
+            group_col = parameters.get('groupColumn')
+            value_col = parameters.get('valueColumn')
+
+            if not all([group_col, value_col]):
+                return jsonify({'error': 'Group and value columns required'}), 400
+
+            groups = [group.dropna() for name, group in df.groupby(group_col)[value_col]]
+            groups = [group for group in groups if len(group) > 1]
+
+            if len(groups) == 2:
+                # Two-sample t-test
+                t_stat, t_p = stats.ttest_ind(groups[0], groups[1])
+
+                # Mann-Whitney U test
+                u_stat, u_p = stats.mannwhitneyu(groups[0], groups[1])
+
+                results['results'] = {
+                    'two_sample_ttest': {
+                        'statistic': float(t_stat),
+                        'p_value': float(t_p),
+                        'significant': bool(t_p < 0.05)
+                    },
+                    'mann_whitney_u': {
+                        'statistic': float(u_stat),
+                        'p_value': float(u_p),
+                        'significant': bool(u_p < 0.05)
+                    }
+                }
+
+            elif len(groups) > 2:
+                # ANOVA
+                f_stat, f_p = stats.f_oneway(*groups)
+
+                # Kruskal-Wallis test
+                h_stat, h_p = stats.kruskal(*groups)
+
+                results['results'] = {
+                    'anova': {
+                        'statistic': float(f_stat),
+                        'p_value': float(f_p),
+                        'significant': bool(f_p < 0.05)
+                    },
+                    'kruskal_wallis': {
+                        'statistic': float(h_stat),
+                        'p_value': float(h_p),
+                        'significant': bool(h_p < 0.05)
+                    }
+                }
+
+            else:
+                return jsonify({'error': 'Need at least two groups with more than one value each'}), 400
+
+        else:
+            return jsonify({'error': 'Unsupported test type'}), 400
+
+        return jsonify(results)
+
+    except Exception as e:
+        logger.error(f"Statistical test error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+
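+# Illustrative request payloads for the statistical-tests endpoint above (test and
+# parameter names are taken from the handler; column names are placeholders):
+#
+#   {"sessionId": "...", "fileId": "...", "testType": "normality",
+#    "parameters": {"column": "<numeric column>"}}
+#
+#   {"sessionId": "...", "fileId": "...", "testType": "correlation_significance",
+#    "parameters": {"column1": "<numeric column>", "column2": "<numeric column>"}}
+#
+#   {"sessionId": "...", "fileId": "...", "testType": "group_comparison",
+#    "parameters": {"groupColumn": "<categorical column>", "valueColumn": "<numeric column>"}}
+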
+@app.route('/api/analysis-history/<session_id>', methods=['GET'])
+def get_analysis_history(session_id):
+    """Get analysis history for a session"""
+    try:
+        if session_id not in analysis_history:
+            return jsonify({'history': []})
+
+        return jsonify({'history': list(analysis_history[session_id].values())})
+
+    except Exception as e:
+        logger.error(f"History error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+
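+# Illustrative usage of the history endpoint above:
+#
+#   GET /api/analysis-history/<session-uuid>
+#   -> {"history": [ ...previously recorded analysis entries... ]}
+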
+@app.route('/api/export-report', methods=['POST'])
+def export_analysis_report():
+    """Export comprehensive analysis report"""
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        analyses = data.get('analyses', [])  # List of analysis result IDs

        if not session_id:
            return jsonify({'error': 'Session ID required'}), 400

+        # Compile report
+        report = {
+            'session_id': session_id,
+            'generated_at': datetime.now().isoformat(),
+            'analyses': [],
+            'summary': {
+                'total_analyses': len(analyses),
+                'data_files_processed': len(file_storage.get(session_id, {})),
+                'recommendations': []
+            }
+        }

+        # Load each analysis result
+        for analysis_id in analyses:
+            try:
+                result_files = [
+                    f for f in os.listdir(os.path.join(PROCESSED_FOLDER, session_id))
+                    if f.startswith(analysis_id)
+                ]
+
+                if result_files:
+                    filepath = os.path.join(PROCESSED_FOLDER, session_id, result_files[0])
+                    with open(filepath, 'r') as f:
+                        analysis_data = json.load(f)
+                    report['analyses'].append({
+                        'id': analysis_id,
+                        'type': result_files[0].split('_')[1].split('.')[0],
+                        'data': analysis_data
+                    })
+
+            except Exception as e:
+                logger.error(f"Error loading analysis {analysis_id}: {str(e)}")
+                continue
+
+        # Generate summary recommendations
+        if report['analyses']:
+            report['summary']['recommendations'] = [
+                "Review data quality scores and address high-priority issues",
+                "Consider feature engineering for improved model performance",
+                "Validate statistical assumptions before drawing conclusions",
+                "Monitor model performance with cross-validation results"
+            ]
+
+        # Save report
+        report_id = str(uuid.uuid4())
+        report_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(report_dir, exist_ok=True)
+
+        report_filepath = os.path.join(report_dir, f"{report_id}_report.json")
+        with open(report_filepath, 'w') as f:
+            json.dump(report, f, indent=2, default=str)
+
+        return jsonify({
+            'reportId': report_id,
+            'message': 'Report generated successfully',
+            'downloadUrl': f'/api/download/{report_id}?sessionId={session_id}&format=json'
+        })
+
+    except Exception as e:
+        logger.error(f"Report export error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+
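+# Illustrative request for the report-export endpoint above; 'analyses' holds the
+# IDs of previously saved analysis results for this session.
+#
+#   POST /api/export-report
+#   {"sessionId": "<session-uuid>", "analyses": ["<analysis-id-1>", "<analysis-id-2>"]}
+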
+# Update existing endpoints with enhanced functionality
+
+@app.route('/api/preview/<file_id>', methods=['GET'])
+def preview_file(file_id):
+    try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400
+
+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        # Enhanced preview with data insights
+        preview_data = {
+            'basic_info': {
+                'columns': df.columns.tolist(),
+                'dtypes': df.dtypes.astype(str).to_dict(),
+                'shape': df.shape,
+                'memory_usage': int(df.memory_usage(deep=True).sum())
+            },
+            'sample_data': {
+                'head': df.head(5).to_dict('records'),
+                'tail': df.tail(5).to_dict('records')
+            },
+            'data_quality': {
+                'missing_values': df.isnull().sum().to_dict(),
+                'duplicate_rows': int(df.duplicated().sum()),
+                'unique_values': df.nunique().to_dict()
+            },
+            'quick_stats': {}
+        }
+
+        # Quick statistics for numeric columns
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        if len(numeric_cols) > 0:
+            preview_data['quick_stats']['numeric'] = df[numeric_cols].describe().to_dict()
+
+        # Quick statistics for categorical columns
+        categorical_cols = df.select_dtypes(include=['object']).columns
+        if len(categorical_cols) > 0:
+            preview_data['quick_stats']['categorical'] = {}
+            for col in categorical_cols[:5]:  # Limit to first 5 categorical columns
+                preview_data['quick_stats']['categorical'][col] = {
+                    'top_values': df[col].value_counts().head(5).to_dict()
+                }
+
+        return jsonify(preview_data)
+
    except Exception as e:
+        logger.error(f"Preview error: {str(e)}")
        return jsonify({'error': str(e)}), 500

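+# Illustrative usage of the enhanced preview endpoint above:
+#
+#   GET /api/preview/<file-id>?sessionId=<session-uuid>
+#   -> {"basic_info": {...}, "sample_data": {...}, "data_quality": {...}, "quick_stats": {...}}
+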
@app.route('/', methods=['GET'])
def home():
    return jsonify({
+        'message': 'Enterprise Data Analytics Platform',
+        'version': '2.0.0-enterprise',
+        'features': {
+            'core': ['data_profiling', 'quality_assessment', 'statistical_tests'],
+            'machine_learning': ['automl', 'clustering', 'feature_engineering'],
+            'time_series': ['trend_analysis', 'forecasting', 'anomaly_detection'],
+            'visualization': ['advanced_charts', 'interactive_plots', 'correlation_heatmaps'],
+            'enterprise': ['report_generation', 'analysis_history', 'data_governance']
+        },
        'endpoints': {
+            'data_management': ['/api/upload', '/api/preview/<file_id>', '/api/profile/<file_id>'],
+            'analytics': ['/api/automl', '/api/clustering', '/api/timeseries'],
+            'quality': ['/api/data-quality', '/api/statistical-tests'],
+            'visualization': ['/api/advanced-visualization'],
+            'enterprise': ['/api/export-report', '/api/analysis-history/<session_id>']
        },
        'timestamp': datetime.now().isoformat()
    })

if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)  # Production ready