Spaces:

mike23415
/

Data-analytics

Sleeping

App Files Files Community

mike23415 committed on Jun 22

Commit

415ccf1

verified ·

1 Parent(s): 66de5aa

Update app.py

Browse files

Files changed (1) hide show

app.py +1176 -446

app.py CHANGED Viewed

@@ -11,14 +11,28 @@ import threading
 import time
 import logging
 from scipy import stats
 import matplotlib
-matplotlib.use('Agg')  # Use non-interactive backend
 import matplotlib.pyplot as plt
 import seaborn as sns
 import io
 import base64
 from apscheduler.schedulers.background import BackgroundScheduler
 import atexit
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -30,31 +44,495 @@ CORS(app)
 # Configuration
 UPLOAD_FOLDER = '/tmp/uploads'
 PROCESSED_FOLDER = '/tmp/processed'
-MAX_FILE_SIZE = 512 * 1024 * 1024  # 512MB
-ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv'}
-FILE_EXPIRY_HOURS = 1
 # Ensure directories exist
-os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-os.makedirs(PROCESSED_FOLDER, exist_ok=True)
-# File storage to track sessions and files
 file_storage = {}
 def allowed_file(filename):
     """Return True when *filename* ends in an extension listed in ALLOWED_EXTENSIONS.

     The extension is the text after the last dot, compared in lower case.
     Names without a dot are rejected.
     """
     if '.' not in filename:
         return False
     extension = filename.rsplit('.', 1)[1].lower()
     return extension in ALLOWED_EXTENSIONS
-def get_file_age(filepath):
-    """Get file age in hours"""
-    if os.path.exists(filepath):
-        file_time = os.path.getmtime(filepath)
-        return (time.time() - file_time) / 3600
-    return float('inf')
 def cleanup_old_files():
-    """Remove files older than FILE_EXPIRY_HOURS"""
     try:
-        for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER]:
             for root, dirs, files in os.walk(folder):
                 for file in files:
                     filepath = os.path.join(root, file)
@@ -62,36 +540,40 @@ def cleanup_old_files():
                         os.remove(filepath)
                         logger.info(f"Cleaned up old file: {filepath}")
-        # Clean up file_storage entries
         current_time = datetime.now()
-        sessions_to_remove = []
-        for session_id, files in file_storage.items():
-            files_to_remove = []
-            for file_id, file_info in files.items():
-                file_time = datetime.fromisoformat(file_info['timestamp'])
-                if (current_time - file_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
-                    files_to_remove.append(file_id)
-            for file_id in files_to_remove:
-                del files[file_id]
-            if not files:
-                sessions_to_remove.append(session_id)
-        for session_id in sessions_to_remove:
-            del file_storage[session_id]
     except Exception as e:
         logger.error(f"Error during cleanup: {str(e)}")
-# Setup scheduler for automatic cleanup
-scheduler = BackgroundScheduler()
-scheduler.add_job(func=cleanup_old_files, trigger="interval", minutes=15)
-scheduler.start()
-atexit.register(lambda: scheduler.shutdown())
 def load_data_file(filepath, filename):
-    """Load data from various file formats"""
     try:
         file_ext = filename.rsplit('.', 1)[1].lower()
@@ -105,278 +587,29 @@ def load_data_file(filepath, filename):
             return pd.read_parquet(filepath)
         elif file_ext == 'tsv':
             return pd.read_csv(filepath, sep='\t')
         else:
             raise ValueError(f"Unsupported file format: {file_ext}")
     except Exception as e:
         raise Exception(f"Error loading file: {str(e)}")
-def perform_basic_statistics(df, columns=None):
-    """Perform basic statistical analysis"""
-    if columns:
-        df = df[columns]
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
-    result = {
-        'numeric_summary': {},
-         'categorical_summary': {},
-        'general_info': {
-            'total_rows': len(df),
-            'total_columns': len(df.columns),
-            'numeric_columns': len(numeric_cols),
-            'categorical_columns': len(categorical_cols),
-            'missing_values': df.isnull().sum().to_dict()
-        }
-    }
-    # Numeric statistics
-    if numeric_cols:
-        numeric_stats = df[numeric_cols].describe()
-        result['numeric_summary'] = numeric_stats.to_dict()
-    # Categorical statistics
-    if categorical_cols:
-        for col in categorical_cols:
-            result['categorical_summary'][col] = {
-                'unique_values': df[col].nunique(),
-                'top_values': df[col].value_counts().head(10).to_dict(),
-                'missing_count': df[col].isnull().sum()
-            }
-    return result
-def perform_groupby_analysis(df, group_column, target_column, operation='mean', filters=None):
-    """Perform group by analysis"""
-    # Apply filters if provided
-    if filters:
-        for f in filters:
-            col, op, val = f['column'], f['operator'], f['value']
-            if op == '>':
-                df = df[df[col] > val]
-            elif op == '<':
-                df = df[df[col] < val]
-            elif op == '==':
-                df = df[df[col] == val]
-            elif op == '!=':
-                df = df[df[col] != val]
-            elif op == '>=':
-                df = df[df[col] >= val]
-            elif op == '<=':
-                df = df[df[col] <= val]
-    # Perform groupby operation
-    grouped = df.groupby(group_column)[target_column]
-    if operation == 'mean':
-        result = grouped.mean()
-    elif operation == 'sum':
-        result = grouped.sum()
-    elif operation == 'count':
-        result = grouped.count()
-    elif operation == 'max':
-        result = grouped.max()
-    elif operation == 'min':
-        result = grouped.min()
-    elif operation == 'std':
-        result = grouped.std()
-    else:
-        raise ValueError(f"Unsupported operation: {operation}")
-    return {
-        'result': result.to_dict(),
-        'operation': operation,
-        'group_column': group_column,
-        'target_column': target_column,
-        'total_groups': len(result)
-    }
-def perform_correlation_analysis(df, columns=None, method='pearson'):
-    """Perform correlation analysis"""
-    if columns:
-        df = df[columns]
-    # Only numeric columns
-    numeric_df = df.select_dtypes(include=[np.number])
-    if numeric_df.empty:
-        raise ValueError("No numeric columns found for correlation analysis")
-    correlation_matrix = numeric_df.corr(method=method)
-    return {
-        'correlation_matrix': correlation_matrix.to_dict(),
-        'method': method,
-        'columns': numeric_df.columns.tolist()
-    }
-def detect_outliers(df, columns=None, method='iqr'):
-    """Detect outliers in numeric columns"""
-    if columns:
-        df = df[columns]
-    numeric_df = df.select_dtypes(include=[np.number])
-    outliers = {}
-    for col in numeric_df.columns:
-        if method == 'iqr':
-            Q1 = numeric_df[col].quantile(0.25)
-            Q3 = numeric_df[col].quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-            outlier_indices = numeric_df[(numeric_df[col] < lower_bound) |
-                                       (numeric_df[col] > upper_bound)].index.tolist()
-        elif method == 'zscore':
-            z_scores = np.abs(stats.zscore(numeric_df[col].dropna()))
-            outlier_indices = numeric_df[z_scores > 3].index.tolist()
-        outliers[col] = {
-            'count': len(outlier_indices),
-            'indices': outlier_indices[:100],  # Limit to first 100
-            'percentage': (len(outlier_indices) / len(numeric_df)) * 100
-        }
-    return outliers
-def generate_visualization(df, chart_type, x_column, y_column=None, group_column=None):
-    """Render a chart of *df* with matplotlib and return it as a base64 PNG string.
-
-    Args:
-        df: pandas DataFrame holding the data to plot.
-        chart_type: 'histogram', 'scatter', 'bar', 'line' or 'box'. Any other
-            value draws nothing and the empty figure is still encoded.
-        x_column: column plotted on the x axis (or the plotted column itself).
-        y_column: second column; required for 'scatter', optional for 'line'.
-        group_column: optional grouping column used by 'bar' and 'box'.
-
-    Returns:
-        str: the PNG image, base64-encoded.
-
-    Raises:
-        Exception: any failure (including the missing-y-column ValueError
-            for scatter plots) is re-raised as a generic Exception whose
-            message starts with "Error generating visualization: ".
-    """
-    plt.figure(figsize=(10, 6))
-    try:
-        if chart_type == 'histogram':
-            plt.hist(df[x_column], bins=30, alpha=0.7)
-            plt.xlabel(x_column)
-            plt.ylabel('Frequency')
-            plt.title(f'Histogram of {x_column}')
-        elif chart_type == 'scatter':
-            if not y_column:
-                raise ValueError("Y column required for scatter plot")
-            plt.scatter(df[x_column], df[y_column], alpha=0.6)
-            plt.xlabel(x_column)
-            plt.ylabel(y_column)
-            plt.title(f'{x_column} vs {y_column}')
-        elif chart_type == 'bar':
-            if group_column:
-                # Numeric x: mean of x per group; otherwise: row count per group.
-                grouped = df.groupby(group_column)[x_column].mean() if pd.api.types.is_numeric_dtype(df[x_column]) else df[group_column].value_counts()
-            else:
-                # No grouping: the 20 most frequent values of x.
-                grouped = df[x_column].value_counts().head(20)
-            grouped.plot(kind='bar')
-            plt.xlabel(group_column or x_column)
-            # NOTE(review): for a numeric x without group_column the bars are
-            # value counts, yet this label reads "Mean <x>" - confirm intent.
-            plt.ylabel('Count' if not pd.api.types.is_numeric_dtype(df[x_column]) else f'Mean {x_column}')
-            plt.title(f'Bar Chart')
-            plt.xticks(rotation=45)
-        elif chart_type == 'line':
-            if y_column:
-                plt.plot(df[x_column], df[y_column])
-                plt.xlabel(x_column)
-                plt.ylabel(y_column)
-            else:
-                # Single column: plotted against the DataFrame index.
-                df[x_column].plot()
-                plt.ylabel(x_column)
-            plt.title('Line Chart')
-        elif chart_type == 'box':
-            if group_column:
-                df.boxplot(column=x_column, by=group_column)
-            else:
-                df.boxplot(column=x_column)
-            plt.title('Box Plot')
-        plt.tight_layout()
-        # Convert plot to base64 string
-        img_buffer = io.BytesIO()
-        plt.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
-        img_buffer.seek(0)
-        img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
-        plt.close()
-        return img_base64
-    except Exception as e:
-        # Close the current figure before propagating the error.
-        plt.close()
-        raise Exception(f"Error generating visualization: {str(e)}")
-def parse_natural_language_query(query, df_columns):
-    """Simple natural language query parser"""
-    query_lower = query.lower()
-    # Define operation keywords
-    operations = {
-        'average': 'mean', 'mean': 'mean', 'avg': 'mean',
-        'sum': 'sum', 'total': 'sum',
-        'count': 'count', 'number': 'count',
-        'max': 'max', 'maximum': 'max', 'highest': 'max',
-        'min': 'min', 'minimum': 'min', 'lowest': 'min'
-    }
-    # Find operation
-    operation = 'mean'  # default
-    for keyword, op in operations.items():
-        if keyword in query_lower:
-            operation = op
-            break
-    # Find columns mentioned in query
-    mentioned_columns = [col for col in df_columns if col.lower() in query_lower]
-    # Simple parsing patterns
-    if 'by' in query_lower and len(mentioned_columns) >= 2:
-        # Group by analysis
-        target_col = mentioned_columns[0]
-        group_col = mentioned_columns[-1]
-        return {
-            'analysisType': 'groupby',
-            'parameters': {
-                'groupByColumn': group_col,
-                'targetColumn': target_col,
-                'operation': operation
-            }
-        }
-    elif 'correlation' in query_lower:
-        return {
-            'analysisType': 'correlation',
-            'parameters': {
-                'columns': mentioned_columns if mentioned_columns else None
-            }
-        }
-    elif any(word in query_lower for word in ['chart', 'plot', 'graph', 'visualize']):
-        chart_type = 'bar'  # default
-        if 'scatter' in query_lower:
-            chart_type = 'scatter'
-        elif 'line' in query_lower:
-            chart_type = 'line'
-        elif 'histogram' in query_lower:
-            chart_type = 'histogram'
-        return {
-            'analysisType': 'visualization',
-            'parameters': {
-                'chartType': chart_type,
-                'xColumn': mentioned_columns[0] if mentioned_columns else None,
-                'yColumn': mentioned_columns[1] if len(mentioned_columns) > 1 else None
-            }
-        }
-    else:
-        # Default to basic statistics
-        return {
-            'analysisType': 'statistics',
-            'parameters': {
-                'columns': mentioned_columns if mentioned_columns else None
-            }
-        }
 @app.route('/api/health', methods=['GET'])
 def health_check():
-    return jsonify({'status': 'healthy', 'timestamp': datetime.now().isoformat()})
 @app.route('/api/upload', methods=['POST'])
 def upload_file():
@@ -397,9 +630,9 @@ def upload_file():
             return jsonify({'error': 'File type not supported'}), 400
         # Check file size
-        file.seek(0, 2)  # Seek to end
         file_size = file.tell()
-        file.seek(0)  # Reset to beginning
         if file_size > MAX_FILE_SIZE:
             return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
@@ -416,7 +649,7 @@ def upload_file():
         filepath = os.path.join(session_dir, f"{file_id}_{filename}")
         file.save(filepath)
-        # Store file info
         if session_id not in file_storage:
             file_storage[session_id] = {}
@@ -424,13 +657,16 @@ def upload_file():
             'filename': filename,
             'filepath': filepath,
             'size': file_size,
-            'timestamp': datetime.now().isoformat()
         }
         return jsonify({
             'fileId': file_id,
             'filename': filename,
             'size': file_size,
             'message': 'File uploaded successfully'
         })
@@ -438,8 +674,9 @@ def upload_file():
         logger.error(f"Upload error: {str(e)}")
         return jsonify({'error': str(e)}), 500
-@app.route('/api/preview/<file_id>', methods=['GET'])
-def preview_file(file_id):
     try:
         session_id = request.args.get('sessionId')
         if not session_id or session_id not in file_storage:
@@ -449,33 +686,70 @@ def preview_file(file_id):
             return jsonify({'error': 'File not found'}), 404
         file_info = file_storage[session_id][file_id]
-        # Load data and get preview
         df = load_data_file(file_info['filepath'], file_info['filename'])
-        preview_data = {
-            'columns': df.columns.tolist(),
-            'dtypes': df.dtypes.astype(str).to_dict(),
-            'shape': df.shape,
-            'head': df.head(5).to_dict('records'),
-            'missing_values': df.isnull().sum().to_dict()
-        }
-        return jsonify(preview_data)
     except Exception as e:
-        logger.error(f"Preview error: {str(e)}")
         return jsonify({'error': str(e)}), 500
-@app.route('/api/analyze', methods=['POST'])
-def analyze_data():
     try:
         data = request.get_json()
         session_id = data.get('sessionId')
         file_id = data.get('fileId')
-        analysis_type = data.get('analysisType')
-        parameters = data.get('parameters', {})
-        natural_query = data.get('naturalQuery')
         if not all([session_id, file_id]):
             return jsonify({'error': 'Session ID and File ID required'}), 400
@@ -486,181 +760,637 @@ def analyze_data():
         file_info = file_storage[session_id][file_id]
         df = load_data_file(file_info['filepath'], file_info['filename'])
-        # Handle natural language query
-        if natural_query and not analysis_type:
-            parsed_query = parse_natural_language_query(natural_query, df.columns.tolist())
-            analysis_type = parsed_query['analysisType']
-            parameters = parsed_query['parameters']
-        result = {}
-        if analysis_type == 'statistics':
-            result = perform_basic_statistics(df, parameters.get('columns'))
-        elif analysis_type == 'groupby':
-            result = perform_groupby_analysis(
-                df,
-                parameters.get('groupByColumn'),
-                parameters.get('targetColumn'),
-                parameters.get('operation', 'mean'),
-                parameters.get('filters')
-            )
-        elif analysis_type == 'correlation':
-            result = perform_correlation_analysis(
-                df,
-                parameters.get('columns'),
-                parameters.get('method', 'pearson')
-            )
-        elif analysis_type == 'outliers':
-            result = detect_outliers(
-                df,
-                parameters.get('columns'),
-                parameters.get('method', 'iqr')
-            )
-        elif analysis_type == 'visualization':
-            chart_base64 = generate_visualization(
-                df,
-                parameters.get('chartType', 'bar'),
-                parameters.get('xColumn'),
-                parameters.get('yColumn'),
-                parameters.get('groupColumn')
-            )
-            result = {
-                'chart': chart_base64,
-                'chartType': parameters.get('chartType', 'bar')
-            }
-        else:
-            return jsonify({'error': 'Invalid analysis type'}), 400
-        # Save result to processed folder
         result_id = str(uuid.uuid4())
         result_dir = os.path.join(PROCESSED_FOLDER, session_id)
         os.makedirs(result_dir, exist_ok=True)
-        result_filepath = os.path.join(result_dir, f"{result_id}_result.json")
         with open(result_filepath, 'w') as f:
-            json.dump(result, f, indent=2, default=str)
         return jsonify({
             'resultId': result_id,
-            'result': result,
-            'analysisType': analysis_type,
             'timestamp': datetime.now().isoformat()
         })
     except Exception as e:
-        logger.error(f"Analysis error: {str(e)}")
         return jsonify({'error': str(e)}), 500
-@app.route('/api/files/<session_id>', methods=['GET'])
-def list_files(session_id):
     try:
         if session_id not in file_storage:
-            return jsonify({'files': []})
-        files = []
-        for file_id, file_info in file_storage[session_id].items():
-            # Check if file still exists
-            if os.path.exists(file_info['filepath']):
-                files.append({
-                    'fileId': file_id,
-                    'filename': file_info['filename'],
-                    'size': file_info['size'],
-                    'timestamp': file_info['timestamp']
-                })
-        return jsonify({'files': files})
     except Exception as e:
-        logger.error(f"List files error: {str(e)}")
         return jsonify({'error': str(e)}), 500
-@app.route('/api/file/<file_id>', methods=['DELETE'])
-def delete_file(file_id):
     try:
-        session_id = request.args.get('sessionId')
-        if not session_id or session_id not in file_storage:
-            return jsonify({'error': 'Invalid session'}), 400
-        if file_id not in file_storage[session_id]:
             return jsonify({'error': 'File not found'}), 404
         file_info = file_storage[session_id][file_id]
-        # Remove file from filesystem
-        if os.path.exists(file_info['filepath']):
-            os.remove(file_info['filepath'])
-        # Remove from storage
-        del file_storage[session_id][file_id]
-        return jsonify({'message': 'File deleted successfully'})
     except Exception as e:
-        logger.error(f"Delete error: {str(e)}")
         return jsonify({'error': str(e)}), 500
-@app.route('/api/download/<result_id>', methods=['GET'])
-def download_result(result_id):
     try:
-        session_id = request.args.get('sessionId')
-        format_type = request.args.get('format', 'json')
         if not session_id:
             return jsonify({'error': 'Session ID required'}), 400
-        result_filepath = os.path.join(PROCESSED_FOLDER, session_id, f"{result_id}_result.json")
-        if not os.path.exists(result_filepath):
-            return jsonify({'error': 'Result not found'}), 404
-        if format_type == 'json':
-            return send_file(result_filepath, as_attachment=True,
-                           download_name=f"analysis_result_{result_id}.json")
-        else:
-            return jsonify({'error': 'Format not supported'}), 400
     except Exception as e:
-        logger.error(f"Download error: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/', methods=['GET'])
 def home():
     return jsonify({
-        'message': 'Data Analytics API is running!',
-        'version': '1.0.0',
         'endpoints': {
-            'health': '/api/health',
-            'upload': '/api/upload',
-            'preview': '/api/preview/<file_id>',
-            'analyze': '/api/analyze',
-            'files': '/api/files/<session_id>',
-            'delete': '/api/file/<file_id>',
-            'download': '/api/download/<result_id>'
         },
         'timestamp': datetime.now().isoformat()
     })
-@app.errorhandler(404)
-def not_found(error):
-    return jsonify({
-        'error': 'Endpoint not found',
-        'message': 'Please check the API documentation',
-        'available_endpoints': [
-            '/',
-            '/api/health',
-            '/api/upload',
-            '/api/preview/<file_id>',
-            '/api/analyze',
-            '/api/files/<session_id>',
-            '/api/file/<file_id>',
-            '/api/download/<result_id>'
-        ]
-    }), 404
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860, debug=True)

 import time
 import logging
 from scipy import stats
+from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
+from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.decomposition import PCA
+from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
+from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
 import matplotlib
+matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import seaborn as sns
+import plotly.graph_objects as go
+import plotly.express as px
+from plotly.utils import PlotlyJSONEncoder
 import io
 import base64
 from apscheduler.schedulers.background import BackgroundScheduler
 import atexit
+import warnings
+warnings.filterwarnings('ignore')
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 # Configuration
 UPLOAD_FOLDER = '/tmp/uploads'
 PROCESSED_FOLDER = '/tmp/processed'
+MODELS_FOLDER = '/tmp/models'
+# Upload size ceiling in bytes; upload_file() rejects larger files.
+MAX_FILE_SIZE = 1024 * 1024 * 1024  # 1GB for enterprise
+# Lower-case file extensions accepted by allowed_file().
+ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv', 'feather'}
+FILE_EXPIRY_HOURS = 24  # Extended for enterprise use
 # Ensure directories exist
+for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER, MODELS_FOLDER]:
+    os.makedirs(folder, exist_ok=True)
+# Enhanced file storage with metadata
+# file_storage maps session id -> {file id -> file info dict}; it lives in
+# process memory only.
 file_storage = {}
+# NOTE(review): presumably keyed like file_storage; usage is not visible
+# in this part of the file - confirm against the code that fills them.
+model_storage = {}
+analysis_history = {}
+class EnterpriseAnalytics:
+    """Enterprise-grade analytics engine"""
+    def __init__(self):
+        self.scaler = StandardScaler()
+        self.models = {}
+    def advanced_data_profiling(self, df):
+        """Comprehensive data profiling like enterprise tools"""
+        profile = {
+            'dataset_overview': {
+                'rows': len(df),
+                'columns': len(df.columns),
+                'memory_usage': df.memory_usage(deep=True).sum(),
+                'duplicate_rows': df.duplicated().sum()
+            },
+            'column_analysis': {},
+            'data_quality': {},
+            'relationships': {},
+            'recommendations': []
+        }
+        for col in df.columns:
+            col_data = df[col]
+            col_profile = {
+                'dtype': str(col_data.dtype),
+                'missing_count': col_data.isnull().sum(),
+                'missing_percentage': (col_data.isnull().sum() / len(df)) * 100,
+                'unique_values': col_data.nunique(),
+                'cardinality': col_data.nunique() / len(df) if len(df) > 0 else 0
+            }
+            if pd.api.types.is_numeric_dtype(col_data):
+                col_profile.update({
+                    'statistics': {
+                        'mean': col_data.mean(),
+                        'median': col_data.median(),
+                        'std': col_data.std(),
+                        'min': col_data.min(),
+                        'max': col_data.max(),
+                        'q25': col_data.quantile(0.25),
+                        'q75': col_data.quantile(0.75),
+                        'skewness': stats.skew(col_data.dropna()),
+                        'kurtosis': stats.kurtosis(col_data.dropna())
+                    },
+                    'distribution': 'normal' if abs(stats.skew(col_data.dropna())) < 0.5 else 'skewed'
+                })
+            else:
+                col_profile.update({
+                    'top_categories': col_data.value_counts().head(10).to_dict(),
+                    'category_distribution': 'uniform' if col_data.value_counts().std() < col_data.value_counts().mean() * 0.5 else 'imbalanced'
+                })
+            profile['column_analysis'][col] = col_profile
+        # Data quality assessment
+        profile['data_quality'] = {
+            'completeness_score': (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,
+            'uniqueness_score': (df.nunique().sum() / (len(df) * len(df.columns))) * 100,
+            'consistency_score': self._calculate_consistency_score(df)
+        }
+        # Generate recommendations
+        profile['recommendations'] = self._generate_recommendations(df, profile)
+        return profile
+    def _calculate_consistency_score(self, df):
+        """Calculate data consistency score"""
+        score = 100
+        for col in df.select_dtypes(include=['object']):
+            # Check for inconsistent formatting
+            values = df[col].dropna().astype(str)
+            if len(values) > 0:
+                # Check for mixed case
+                if len(set([v.lower() for v in values])) != len(set(values)):
+                    score -= 5
+                # Check for leading/trailing spaces
+                if any(v != v.strip() for v in values):
+                    score -= 5
+        return max(0, score)
+    def _generate_recommendations(self, df, profile):
+        """Generate actionable recommendations"""
+        recommendations = []
+        # High missing value columns
+        for col, analysis in profile['column_analysis'].items():
+            if analysis['missing_percentage'] > 20:
+                recommendations.append({
+                    'type': 'data_quality',
+                    'priority': 'high',
+                    'message': f"Column '{col}' has {analysis['missing_percentage']:.1f}% missing values. Consider imputation or removal.",
+                    'action': 'handle_missing_values'
+                })
+        # High cardinality categorical columns
+        for col, analysis in profile['column_analysis'].items():
+            if analysis.get('cardinality', 0) > 0.8 and df[col].dtype == 'object':
+                recommendations.append({
+                    'type': 'feature_engineering',
+                    'priority': 'medium',
+                    'message': f"Column '{col}' has high cardinality. Consider feature encoding or dimensionality reduction.",
+                    'action': 'encode_categorical'
+                })
+        # Skewed distributions
+        for col, analysis in profile['column_analysis'].items():
+            if 'statistics' in analysis and abs(analysis['statistics']['skewness']) > 2:
+                recommendations.append({
+                    'type': 'data_transformation',
+                    'priority': 'medium',
+                    'message': f"Column '{col}' is highly skewed. Consider log transformation or scaling.",
+                    'action': 'transform_distribution'
+                })
+        return recommendations
+    def advanced_feature_engineering(self, df, target_column=None):
+        """Enterprise-level feature engineering"""
+        engineered_features = {}
+        # Numeric feature engineering
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        for col in numeric_cols:
+            if col != target_column:
+                # Polynomial features
+                engineered_features[f'{col}_squared'] = df[col] ** 2
+                engineered_features[f'{col}_log'] = np.log1p(df[col].abs())
+                # Binning
+                engineered_features[f'{col}_binned'] = pd.cut(df[col], bins=5, labels=False)
+                # Rolling statistics (if data has time component)
+                if len(df) > 10:
+                    engineered_features[f'{col}_rolling_mean'] = df[col].rolling(window=min(5, len(df)//2)).mean()
+        # Categorical feature engineering
+        categorical_cols = df.select_dtypes(include=['object']).columns
+        for col in categorical_cols:
+            if col != target_column:
+                # Frequency encoding
+                freq_map = df[col].value_counts().to_dict()
+                engineered_features[f'{col}_frequency'] = df[col].map(freq_map)
+                # Target encoding (if target is provided)
+                if target_column and target_column in df.columns:
+                    target_mean = df.groupby(col)[target_column].mean()
+                    engineered_features[f'{col}_target_encoded'] = df[col].map(target_mean)
+        # Interaction features
+        if len(numeric_cols) >= 2:
+            col_pairs = [(numeric_cols[i], numeric_cols[j])
+                        for i in range(len(numeric_cols))
+                        for j in range(i+1, min(i+3, len(numeric_cols)))]  # Limit combinations
+            for col1, col2 in col_pairs:
+                if col1 != target_column and col2 != target_column:
+                    engineered_features[f'{col1}_{col2}_interaction'] = df[col1] * df[col2]
+                    engineered_features[f'{col1}_{col2}_ratio'] = df[col1] / (df[col2] + 1e-8)
+        return pd.DataFrame(engineered_features, index=df.index)
+    def automated_ml_pipeline(self, df, target_column, problem_type='auto'):
+        """Enterprise AutoML pipeline.
+
+        Infers the problem type (when ``problem_type`` is 'auto'), imputes and
+        label-encodes the features, keeps at most 10 features, splits the data
+        80/20, standardizes it, then cross-validates and fits every candidate
+        model.
+
+        Args:
+            df: DataFrame holding the feature columns and the target column.
+            target_column: Name of the column to predict.
+            problem_type: 'regression', 'classification' or 'auto'.
+
+        Returns:
+            dict with the keys 'preprocessing', 'feature_selection', 'models',
+            'best_model', 'predictions' and 'feature_importance'.
+            ``best_model['score']`` is None when no candidate could be trained.
+        """
+        # Local imports: the classification variants are needed here and may
+        # not be part of the module-level scikit-learn imports.
+        from sklearn.ensemble import GradientBoostingClassifier
+        from sklearn.feature_selection import f_classif
+        results = {
+            'preprocessing': {},
+            'feature_selection': {},
+            'models': {},
+            'best_model': {},
+            'predictions': {},
+            'feature_importance': {}
+        }
+        # Determine problem type: text-like or low-cardinality targets are
+        # treated as classification.
+        if problem_type == 'auto':
+            target = df[target_column]
+            if target.dtype in ['object', 'category'] or target.nunique() < 10:
+                problem_type = 'classification'
+            else:
+                problem_type = 'regression'
+        is_regression = problem_type == 'regression'
+        # Preprocessing
+        feature_cols = [col for col in df.columns if col != target_column]
+        X = df[feature_cols].copy()
+        y = df[target_column].copy()
+        X_numeric = X.select_dtypes(include=[np.number])
+        X_categorical = X.select_dtypes(include=['object'])
+        # Handle missing values: median for numeric, most frequent value for text
+        if not X_numeric.empty:
+            X_numeric = X_numeric.fillna(X_numeric.median())
+        if not X_categorical.empty:
+            modes = X_categorical.mode()
+            X_categorical = X_categorical.fillna(modes.iloc[0] if not modes.empty else 'Unknown')
+            # Encode categorical variables (one fresh encoder per column)
+            for col in X_categorical.columns:
+                X_categorical[col] = LabelEncoder().fit_transform(X_categorical[col].astype(str))
+        X_processed = pd.concat([X_numeric, X_categorical], axis=1)
+        # Handle target variable for classification
+        if not is_regression and y.dtype == 'object':
+            le_target = LabelEncoder()
+            y = le_target.fit_transform(y)
+        # Feature selection: the scoring function must match the problem type,
+        # f_regression is only meaningful for a continuous target.
+        if len(X_processed.columns) > 10:
+            score_func = f_regression if is_regression else f_classif
+            selector = SelectKBest(score_func, k=10)
+            X_selected = selector.fit_transform(X_processed, y)
+            selected_features = X_processed.columns[selector.get_support()].tolist()
+            X_processed = pd.DataFrame(X_selected, columns=selected_features)
+            results['feature_selection']['selected_features'] = selected_features
+        # Split data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X_processed, y, test_size=0.2, random_state=42
+        )
+        # Scale features (fit on the training split only)
+        scaler = StandardScaler()
+        X_train_scaled = scaler.fit_transform(X_train)
+        X_test_scaled = scaler.transform(X_test)
+        # Model selection based on problem type
+        if is_regression:
+            models = {
+                'Linear Regression': LinearRegression(),
+                'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
+                'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
+                'Ridge Regression': Ridge()
+            }
+            scoring = 'r2'
+        else:
+            models = {
+                'Logistic Regression': LogisticRegression(random_state=42),
+                'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
+                # A classifier is required here; a regressor cannot be scored on accuracy.
+                'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
+            }
+            scoring = 'accuracy'
+        # Train and evaluate models
+        best_score = -np.inf
+        best_model_name = None
+        for name, model in models.items():
+            try:
+                # Cross-validation on the training split
+                score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=scoring).mean()
+                # Train final model and evaluate on the held-out split
+                model.fit(X_train_scaled, y_train)
+                y_pred = model.predict(X_test_scaled)
+                if is_regression:
+                    results['models'][name] = {
+                        'cv_score': score,
+                        'test_r2': r2_score(y_test, y_pred),
+                        'test_mse': mean_squared_error(y_test, y_pred),
+                        'predictions': y_pred.tolist()
+                    }
+                else:
+                    results['models'][name] = {
+                        'cv_score': score,
+                        'test_accuracy': model.score(X_test_scaled, y_test),
+                        'predictions': y_pred.tolist()
+                    }
+                # Track best model by cross-validation score
+                if score > best_score:
+                    best_score = score
+                    best_model_name = name
+                    # Feature importance (only tree-based models expose it)
+                    if hasattr(model, 'feature_importances_'):
+                        importance = dict(zip(X_processed.columns, model.feature_importances_))
+                        results['feature_importance'] = dict(sorted(importance.items(), key=lambda x: x[1], reverse=True))
+            except Exception as e:
+                # One failing candidate must not abort the whole comparison
+                logger.error(f"Error training {name}: {str(e)}")
+                continue
+        results['best_model'] = {
+            'name': best_model_name,
+            # -inf is not valid JSON; report None when nothing could be trained
+            'score': best_score if best_model_name is not None else None,
+            'problem_type': problem_type
+        }
+        results['preprocessing'] = {
+            'numeric_features': X_numeric.columns.tolist() if not X_numeric.empty else [],
+            'categorical_features': X_categorical.columns.tolist() if not X_categorical.empty else [],
+            'scaling_applied': True,
+            'missing_values_handled': True
+        }
+        return results
+    def advanced_clustering_analysis(self, df, n_clusters=None):
+        """Enterprise clustering with multiple algorithms.
+
+        Runs K-Means, agglomerative clustering and DBSCAN on the standardized
+        numeric columns of ``df`` and compares them by silhouette score.
+
+        Args:
+            df: Input DataFrame; only numeric columns are used.
+            n_clusters: Number of clusters for K-Means/Hierarchical. When None
+                it is estimated with a simplified elbow heuristic.
+
+        Returns:
+            dict with 'algorithms', 'optimal_clusters', 'silhouette_scores' and
+            'recommendations', plus 'pca_components' and
+            'pca_explained_variance' when there are more than two numeric
+            columns.
+
+        Raises:
+            ValueError: If ``df`` has no numeric columns.
+        """
+        from sklearn.metrics import silhouette_score
+        # Prepare data
+        numeric_df = df.select_dtypes(include=[np.number])
+        if numeric_df.empty:
+            raise ValueError("No numeric columns for clustering")
+        # Handle missing values
+        numeric_df = numeric_df.fillna(numeric_df.median())
+        # Scale data
+        X_scaled = StandardScaler().fit_transform(numeric_df)
+        results = {
+            'algorithms': {},
+            'optimal_clusters': {},
+            'silhouette_scores': {},
+            'recommendations': []
+        }
+        # Determine optimal number of clusters if not provided
+        if n_clusters is None:
+            # Elbow method for K-means
+            k_range = range(2, min(11, len(numeric_df) // 2))
+            inertias = [
+                KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
+                for k in k_range
+            ]
+            # Find elbow point (simplified): largest second difference of the inertia
+            if len(inertias) > 2:
+                second_diffs = np.diff(inertias, n=2)
+                n_clusters = k_range[int(np.argmax(second_diffs)) + 1]
+            else:
+                n_clusters = 3
+        # Apply multiple clustering algorithms
+        algorithms = {
+            'K-Means': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
+            'Hierarchical': AgglomerativeClustering(n_clusters=n_clusters),
+            'DBSCAN': DBSCAN(eps=0.5, min_samples=5)
+        }
+        for name, algo in algorithms.items():
+            try:
+                labels = algo.fit_predict(X_scaled)
+                if name == 'DBSCAN':
+                    # DBSCAN marks noise with -1, which is not a cluster
+                    n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
+                else:
+                    n_clusters_found = n_clusters
+                # Silhouette score is undefined for a single label
+                if len(set(labels)) > 1:
+                    sil_score = float(silhouette_score(X_scaled, labels))
+                else:
+                    sil_score = 0.0
+                results['algorithms'][name] = {
+                    'labels': labels.tolist(),
+                    'n_clusters': n_clusters_found,
+                    'silhouette_score': sil_score
+                }
+                results['silhouette_scores'][name] = sil_score
+            except Exception as e:
+                logger.error(f"Error in {name} clustering: {str(e)}")
+                continue
+        # PCA for visualization
+        if len(numeric_df.columns) > 2:
+            pca = PCA(n_components=2)
+            X_pca = pca.fit_transform(X_scaled)
+            results['pca_components'] = X_pca.tolist()
+            results['pca_explained_variance'] = pca.explained_variance_ratio_.tolist()
+        # Generate recommendations; every algorithm may have failed, in which
+        # case there is nothing to rank and max() would raise on an empty dict.
+        if results['silhouette_scores']:
+            best_algo = max(results['silhouette_scores'].items(), key=lambda x: x[1])[0]
+            results['recommendations'].append({
+                'type': 'clustering',
+                'message': f"Best clustering algorithm: {best_algo} with silhouette score: {results['silhouette_scores'][best_algo]:.3f}",
+                'optimal_clusters': results['algorithms'][best_algo]['n_clusters']
+            })
+        else:
+            results['recommendations'].append({
+                'type': 'clustering',
+                'message': 'No clustering algorithm completed successfully',
+                'optimal_clusters': None
+            })
+        return results
+    def time_series_analysis(self, df, date_column, value_column):
+        """Advanced time series analysis.
+
+        Computes summary statistics, a linear trend, IQR-based anomalies and a
+        moving-average forecast for ``value_column`` ordered by ``date_column``.
+        The caller's DataFrame is left untouched; rows with a missing date or
+        value are ignored.
+
+        Args:
+            df: Input DataFrame.
+            date_column: Column parseable by ``pd.to_datetime``.
+            value_column: Numeric column to analyse.
+
+        Returns:
+            dict with 'trend_analysis', 'seasonality', 'forecasting',
+            'anomalies' and 'statistics'.
+
+        Raises:
+            ValueError: If no row has both a valid date and a value.
+        """
+        # Work on a copy so the date conversion does not leak into the caller's frame
+        frame = df[[date_column, value_column]].copy()
+        frame[date_column] = pd.to_datetime(frame[date_column])
+        # Missing values would make the regression below fail
+        frame = frame.dropna(subset=[date_column, value_column]).sort_values(date_column)
+        if frame.empty:
+            raise ValueError("No valid observations for time series analysis")
+        # Set date as index
+        ts_df = frame.set_index(date_column)[value_column]
+        results = {
+            'trend_analysis': {},
+            'seasonality': {},
+            'forecasting': {},
+            'anomalies': {},
+            'statistics': {}
+        }
+        # Basic statistics; cast to plain floats so integer columns do not
+        # leak numpy integer scalars into the JSON response
+        results['statistics'] = {
+            'mean': float(ts_df.mean()),
+            'std': float(ts_df.std()),
+            'min': float(ts_df.min()),
+            'max': float(ts_df.max()),
+            'trend': 'increasing' if ts_df.iloc[-1] > ts_df.iloc[0] else 'decreasing'
+        }
+        # Trend analysis using linear regression over the observation index
+        X = np.arange(len(ts_df)).reshape(-1, 1)
+        y = ts_df.values
+        lr = LinearRegression()
+        lr.fit(X, y)
+        trend_slope = float(lr.coef_[0])
+        results['trend_analysis'] = {
+            'slope': trend_slope,
+            'direction': 'increasing' if trend_slope > 0 else 'decreasing',
+            'strength': abs(trend_slope)
+        }
+        # Simple anomaly detection using IQR (1.5 * IQR fences)
+        Q1 = ts_df.quantile(0.25)
+        Q3 = ts_df.quantile(0.75)
+        IQR = Q3 - Q1
+        anomalies = ts_df[(ts_df < Q1 - 1.5 * IQR) | (ts_df > Q3 + 1.5 * IQR)]
+        results['anomalies'] = {
+            'count': len(anomalies),
+            'dates': anomalies.index.strftime('%Y-%m-%d').tolist(),
+            'values': anomalies.values.tolist()
+        }
+        # Simple forecasting (moving average); skipped for fewer than 4 observations
+        window = min(7, len(ts_df) // 4)
+        if window > 0:
+            forecast_periods = min(10, len(ts_df) // 4)
+            last_values = float(ts_df.tail(window).mean())
+            results['forecasting'] = {
+                'method': 'moving_average',
+                'forecast_periods': forecast_periods,
+                'forecast_values': [last_values] * forecast_periods
+            }
+        return results
+# Shared EnterpriseAnalytics instance used by the API endpoints below
+analytics_engine = EnterpriseAnalytics()
 def allowed_file(filename):
+    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS."""
+    if '.' not in filename:
+        return False
+    extension = filename.rsplit('.', 1)[1].lower()
+    return extension in ALLOWED_EXTENSIONS
 def cleanup_old_files():
+    """Enhanced cleanup with model cleanup"""
     try:
+        # Existing cleanup logic...
+        for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER, MODELS_FOLDER]:
             for root, dirs, files in os.walk(folder):
                 for file in files:
                     filepath = os.path.join(root, file)
                         os.remove(filepath)
                         logger.info(f"Cleaned up old file: {filepath}")
+        # Clean up storage entries
         current_time = datetime.now()
+        for storage in [file_storage, model_storage, analysis_history]:
+            sessions_to_remove = []
+            for session_id, session_data in storage.items():
+                if isinstance(session_data, dict):
+                    items_to_remove = []
+                    for item_id, item_info in session_data.items():
+                        if 'timestamp' in item_info:
+                            item_time = datetime.fromisoformat(item_info['timestamp'])
+                            if (current_time - item_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
+                                items_to_remove.append(item_id)
+                    for item_id in items_to_remove:
+                        del session_data[item_id]
+                    if not session_data:
+                        sessions_to_remove.append(session_id)
+            for session_id in sessions_to_remove:
+                del storage[session_id]
     except Exception as e:
         logger.error(f"Error during cleanup: {str(e)}")
+def get_file_age(filepath):
+    """Return the hours elapsed since ``filepath`` was last modified.
+
+    A path that does not exist is reported as infinitely old.
+    """
+    if not os.path.exists(filepath):
+        return float('inf')
+    seconds_since_modified = time.time() - os.path.getmtime(filepath)
+    return seconds_since_modified / 3600
 def load_data_file(filepath, filename):
+    """Enhanced data loading with more formats"""
     try:
         file_ext = filename.rsplit('.', 1)[1].lower()
             return pd.read_parquet(filepath)
         elif file_ext == 'tsv':
             return pd.read_csv(filepath, sep='\t')
+        elif file_ext == 'feather':
+            return pd.read_feather(filepath)
         else:
             raise ValueError(f"Unsupported file format: {file_ext}")
     except Exception as e:
         raise Exception(f"Error loading file: {str(e)}")
+# Background job: run cleanup_old_files once per hour
+scheduler = BackgroundScheduler()
+scheduler.add_job(cleanup_old_files, 'interval', hours=1)
+scheduler.start()
+# Stop the scheduler when the interpreter exits
+atexit.register(scheduler.shutdown)
+# API Endpoints
 @app.route('/api/health', methods=['GET'])
 def health_check():
+    """Report service status, version, feature list and the current server time."""
+    payload = {
+        'status': 'healthy',
+        'version': '2.0.0-enterprise',
+        'features': ['advanced_profiling', 'automl', 'clustering', 'time_series'],
+        'timestamp': datetime.now().isoformat(),
+    }
+    return jsonify(payload)
 @app.route('/api/upload', methods=['POST'])
 def upload_file():
             return jsonify({'error': 'File type not supported'}), 400
         # Check file size
+        file.seek(0, 2)
         file_size = file.tell()
+        file.seek(0)
         if file_size > MAX_FILE_SIZE:
             return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
         filepath = os.path.join(session_dir, f"{file_id}_{filename}")
         file.save(filepath)
+        # Enhanced file metadata
         if session_id not in file_storage:
             file_storage[session_id] = {}
             'filename': filename,
             'filepath': filepath,
             'size': file_size,
+            'timestamp': datetime.now().isoformat(),
+            'format': filename.rsplit('.', 1)[1].lower(),
+            'status': 'uploaded'
         }
         return jsonify({
             'fileId': file_id,
             'filename': filename,
             'size': file_size,
+            'format': filename.rsplit('.', 1)[1].lower(),
             'message': 'File uploaded successfully'
         })
         logger.error(f"Upload error: {str(e)}")
         return jsonify({'error': str(e)}), 500
+@app.route('/api/profile/<file_id>', methods=['GET'])
+def profile_data(file_id):
+    """Advanced data profiling endpoint.
+
+    Expects the ``sessionId`` query parameter. Responds with the profile
+    produced by ``analytics_engine.advanced_data_profiling``, 404 when the
+    session or file is unknown, or 500 with the error message on failure.
+    """
     try:
         session_id = request.args.get('sessionId')
         if not session_id or session_id not in file_storage:
             return jsonify({'error': 'File not found'}), 404
+        # An unknown file id must be a 404 like in the other endpoints,
+        # not a KeyError turned into a 500 by the handler below.
+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
         file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        # Perform advanced profiling
+        profile = analytics_engine.advanced_data_profiling(df)
+        return jsonify(profile)
+    except Exception as e:
+        logger.error(f"Profiling error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/automl', methods=['POST'])
+def run_automl():
+    """Run the AutoML pipeline on an uploaded file and store the results as JSON."""
+    try:
+        payload = request.get_json()
+        session_id = payload.get('sessionId')
+        file_id = payload.get('fileId')
+        target_column = payload.get('targetColumn')
+        problem_type = payload.get('problemType', 'auto')
+        # All three identifiers are mandatory
+        if not (session_id and file_id and target_column):
+            return jsonify({'error': 'Session ID, File ID, and target column required'}), 400
+        if file_id not in file_storage.get(session_id, {}):
+            return jsonify({'error': 'File not found'}), 404
+        stored = file_storage[session_id][file_id]
+        df = load_data_file(stored['filepath'], stored['filename'])
+        if target_column not in df.columns:
+            return jsonify({'error': f'Target column {target_column} not found'}), 400
+        # Run AutoML pipeline
+        results = analytics_engine.automated_ml_pipeline(df, target_column, problem_type)
+        # Persist the results under the session's processed folder
+        result_id = str(uuid.uuid4())
+        output_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f"{result_id}_automl.json")
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2, default=str)
+        response = {
+            'resultId': result_id,
+            'results': results,
+            'analysisType': 'automl',
+            'timestamp': datetime.now().isoformat()
+        }
+        return jsonify(response)
+    except Exception as e:
+        logger.error(f"AutoML error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/clustering', methods=['POST'])
+def run_clustering():
+    """Advanced clustering analysis endpoint.
+
+    JSON body: ``sessionId``, ``fileId`` and optional ``nClusters`` (integer
+    >= 2). Responds with the clustering results, 400 on invalid input, 404
+    when the file is unknown, or 500 with the error message on failure.
+    """
     try:
         data = request.get_json()
         session_id = data.get('sessionId')
         file_id = data.get('fileId')
+        n_clusters = data.get('nClusters')
         if not all([session_id, file_id]):
             return jsonify({'error': 'Session ID and File ID required'}), 400
+        # Same lookup guard as the other endpoints: unknown ids are a 404, not a KeyError
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+        # Reject cluster counts the algorithms cannot use (bool is excluded on purpose)
+        if n_clusters is not None and (
+            isinstance(n_clusters, bool) or not isinstance(n_clusters, int) or n_clusters < 2
+        ):
+            return jsonify({'error': 'nClusters must be an integer greater than 1'}), 400
         file_info = file_storage[session_id][file_id]
         df = load_data_file(file_info['filepath'], file_info['filename'])
+        # Run clustering analysis
+        results = analytics_engine.advanced_clustering_analysis(df, n_clusters)
+        # Save results
+        result_id = str(uuid.uuid4())
+        result_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(result_dir, exist_ok=True)
+        result_filepath = os.path.join(result_dir, f"{result_id}_clustering.json")
+        with open(result_filepath, 'w') as f:
+            json.dump(results, f, indent=2, default=str)
+        return jsonify({
+            'resultId': result_id,
+            'results': results,
+            'analysisType': 'clustering',
+            'timestamp': datetime.now().isoformat()
+        })
+    except Exception as e:
+        logger.error(f"Clustering error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/timeseries', methods=['POST'])
+def run_timeseries():
+    """Run the time series analysis on an uploaded file and store the results as JSON."""
+    try:
+        payload = request.get_json()
+        session_id = payload.get('sessionId')
+        file_id = payload.get('fileId')
+        date_column = payload.get('dateColumn')
+        value_column = payload.get('valueColumn')
+        # Every field is mandatory
+        if not (session_id and file_id and date_column and value_column):
+            return jsonify({'error': 'Session ID, File ID, date column, and value column required'}), 400
+        if file_id not in file_storage.get(session_id, {}):
+            return jsonify({'error': 'File not found'}), 404
+        stored = file_storage[session_id][file_id]
+        df = load_data_file(stored['filepath'], stored['filename'])
+        if any(column not in df.columns for column in (date_column, value_column)):
+            return jsonify({'error': 'Date or value column not found'}), 400
+        # Run time series analysis
+        results = analytics_engine.time_series_analysis(df, date_column, value_column)
+        # Persist the results under the session's processed folder
+        result_id = str(uuid.uuid4())
+        output_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f"{result_id}_timeseries.json")
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2, default=str)
+        response = {
+            'resultId': result_id,
+            'results': results,
+            'analysisType': 'timeseries',
+            'timestamp': datetime.now().isoformat()
+        }
+        return jsonify(response)
+    except Exception as e:
+        logger.error(f"Time series error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/feature-engineering', methods=['POST'])
+def run_feature_engineering():
+    """Feature engineering endpoint.
+
+    JSON body: ``sessionId``, ``fileId`` and optional ``targetColumn``.
+    Writes the original plus engineered columns to a new CSV, registers it
+    in ``file_storage`` under a new file id and returns a summary.
+    """
     try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        target_column = data.get('targetColumn')
+        if not all([session_id, file_id]):
+            return jsonify({'error': 'Session ID and File ID required'}), 400
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        # Generate engineered features
+        engineered_df = analytics_engine.advanced_feature_engineering(df, target_column)
+        # Save engineered dataset
+        engineered_file_id = str(uuid.uuid4())
+        engineered_filepath = os.path.join(
+            PROCESSED_FOLDER, session_id, f"{engineered_file_id}_engineered.csv"
+        )
+        os.makedirs(os.path.dirname(engineered_filepath), exist_ok=True)
+        # Combine original and engineered features
+        combined_df = pd.concat([df, engineered_df], axis=1)
+        combined_df.to_csv(engineered_filepath, index=False)
+        # Strip only the final extension so names such as "sales.2024.csv"
+        # keep their full stem ("sales.2024") instead of being cut to "sales".
+        base_name = file_info['filename'].rsplit('.', 1)[0]
+        # Store engineered file info (the session entry is known to exist here)
+        file_storage[session_id][engineered_file_id] = {
+            'filename': f"{base_name}_engineered.csv",
+            'filepath': engineered_filepath,
+            'size': os.path.getsize(engineered_filepath),
+            'timestamp': datetime.now().isoformat(),
+            'format': 'csv',
+            'status': 'engineered',
+            'parent_file': file_id
+        }
+        return jsonify({
+            'engineeredFileId': engineered_file_id,
+            'originalFeatures': len(df.columns),
+            'engineeredFeatures': len(engineered_df.columns),
+            'totalFeatures': len(combined_df.columns),
+            'featureNames': engineered_df.columns.tolist(),
+            'message': 'Feature engineering completed successfully'
+        })
     except Exception as e:
+        logger.error(f"Feature engineering error: {str(e)}")
         return jsonify({'error': str(e)}), 500
+@app.route('/api/advanced-visualization', methods=['POST'])
+def create_advanced_visualization():
+    """Advanced visualization endpoint with Plotly.
+
+    JSON body: ``sessionId``, ``fileId``, ``chartType`` and optional
+    ``parameters``. Supported chart types: correlation_heatmap,
+    distribution_plots, scatter_matrix, parallel_coordinates, box_plots and
+    3d_scatter. Column names supplied by the client are validated and
+    rejected with a 400 when they do not exist in the dataset.
+    """
     try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        chart_type = data.get('chartType')
+        parameters = data.get('parameters', {})
+        if not all([session_id, file_id, chart_type]):
+            return jsonify({'error': 'Session ID, File ID, and chart type required'}), 400
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
             return jsonify({'error': 'File not found'}), 404
         file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        # Chart types that take a list of columns -> how many numeric columns to use by default
+        multi_column_defaults = {'scatter_matrix': 4, 'parallel_coordinates': 5, 'box_plots': 5}
+        # Create advanced visualizations using Plotly
+        if chart_type == 'correlation_heatmap':
+            numeric_df = df.select_dtypes(include=[np.number])
+            corr_matrix = numeric_df.corr()
+            fig = px.imshow(corr_matrix,
+                          title='Correlation Heatmap',
+                          color_continuous_scale='RdBu_r',
+                          aspect='auto')
+        elif chart_type == 'distribution_plots':
+            column = parameters.get('column')
+            if not column or column not in df.columns:
+                return jsonify({'error': 'Column not specified or not found'}), 400
+            fig = px.histogram(df, x=column,
+                             title=f'Distribution of {column}',
+                             marginal='box')
+        elif chart_type in multi_column_defaults:
+            columns = parameters.get('columns')
+            if columns is None:
+                # Fall back to the first numeric columns, as a plain list
+                numeric_names = df.select_dtypes(include=[np.number]).columns
+                columns = numeric_names[:multi_column_defaults[chart_type]].tolist()
+            # Reject unknown columns up front instead of failing with a KeyError (500)
+            if not isinstance(columns, list) or any(col not in df.columns for col in columns):
+                return jsonify({'error': 'Columns must be a list of existing column names'}), 400
+            if chart_type == 'scatter_matrix':
+                fig = px.scatter_matrix(df[columns],
+                                      title='Scatter Matrix',
+                                      dimensions=columns)
+            elif chart_type == 'parallel_coordinates':
+                fig = px.parallel_coordinates(df,
+                                            dimensions=columns,
+                                            title='Parallel Coordinates Plot')
+            else:
+                fig = px.box(df[columns],
+                            title='Box Plots Comparison')
+        elif chart_type == '3d_scatter':
+            x_col = parameters.get('x_column')
+            y_col = parameters.get('y_column')
+            z_col = parameters.get('z_column')
+            if not all([x_col, y_col, z_col]):
+                return jsonify({'error': '3D scatter requires x, y, and z columns'}), 400
+            if any(col not in df.columns for col in (x_col, y_col, z_col)):
+                return jsonify({'error': 'Column not specified or not found'}), 400
+            fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col,
+                              title=f'3D Scatter: {x_col} vs {y_col} vs {z_col}')
+        else:
+            return jsonify({'error': 'Unsupported chart type'}), 400
+        # Convert to JSON (the figure is returned as a JSON string inside the response)
+        chart_json = json.dumps(fig, cls=PlotlyJSONEncoder)
+        return jsonify({
+            'chart': chart_json,
+            'chartType': chart_type,
+            'timestamp': datetime.now().isoformat()
+        })
     except Exception as e:
+        logger.error(f"Visualization error: {str(e)}")
         return jsonify({'error': str(e)}), 500
+@app.route('/api/data-quality', methods=['POST'])
+def assess_data_quality():
+    """Data quality assessment endpoint.
+
+    JSON body: ``sessionId`` and ``fileId``. Scores the dataset on
+    completeness, consistency, validity and uniqueness (each 0-100), averages
+    them into ``overall_score`` and attaches recommendations for weak
+    dimensions. All numeric values are plain Python numbers so the report
+    can be serialized by ``jsonify``.
+    """
     try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        if not all([session_id, file_id]):
+            return jsonify({'error': 'Session ID and File ID required'}), 400
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        quality_report = {
+            'overall_score': 0,
+            'dimensions': {
+                'completeness': {},
+                'consistency': {},
+                'validity': {},
+                'uniqueness': {},
+                'accuracy': {}
+            },
+            'issues': [],
+            'recommendations': []
+        }
+        row_count = len(df)
+        # Completeness assessment
+        missing_per_column = df.isnull().sum()
+        total_cells = row_count * len(df.columns)
+        missing_cells = int(missing_per_column.sum())
+        # An empty dataset has no cells to score; report 0 rather than dividing by zero
+        if total_cells:
+            completeness_score = ((total_cells - missing_cells) / total_cells) * 100
+        else:
+            completeness_score = 0.0
+        if row_count:
+            missing_percentage = (missing_per_column / row_count * 100).to_dict()
+        else:
+            missing_percentage = {col: 0.0 for col in df.columns}
+        quality_report['dimensions']['completeness'] = {
+            'score': completeness_score,
+            'missing_values': missing_per_column.to_dict(),
+            'missing_percentage': missing_percentage
+        }
+        # Consistency assessment
+        consistency_issues = []
+        for col in df.select_dtypes(include=['object']):
+            # Check for inconsistent formatting
+            values = df[col].dropna().astype(str)
+            if len(values) > 0:
+                # Mixed case issues: values that only differ by letter case
+                lowercase_values = {v.lower() for v in values}
+                if len(lowercase_values) != len(set(values)):
+                    consistency_issues.append(f"Column '{col}' has mixed case values")
+                # Leading/trailing spaces
+                if any(v != v.strip() for v in values):
+                    consistency_issues.append(f"Column '{col}' has leading/trailing spaces")
+        consistency_score = max(0, 100 - len(consistency_issues) * 10)
+        quality_report['dimensions']['consistency'] = {
+            'score': consistency_score,
+            'issues': consistency_issues
+        }
+        # Validity assessment (basic data type validation)
+        validity_issues = []
+        for col in df.columns:
+            if df[col].dtype != 'object':
+                continue
+            non_null = df[col].dropna()
+            # An all-null column converts trivially and would be a false positive
+            if non_null.empty:
+                continue
+            # Check for potential numeric columns stored as strings
+            try:
+                pd.to_numeric(non_null, errors='raise')
+            except (ValueError, TypeError):
+                continue
+            validity_issues.append(f"Column '{col}' appears to be numeric but stored as text")
+        validity_score = max(0, 100 - len(validity_issues) * 15)
+        quality_report['dimensions']['validity'] = {
+            'score': validity_score,
+            'issues': validity_issues
+        }
+        # Uniqueness assessment
+        uniqueness_scores = {}
+        for col in df.columns:
+            unique_ratio = df[col].nunique() / row_count if row_count > 0 else 0
+            uniqueness_scores[col] = unique_ratio * 100
+        # np.mean of an empty list is NaN; a dataset without columns scores 0
+        avg_uniqueness = float(np.mean(list(uniqueness_scores.values()))) if uniqueness_scores else 0.0
+        quality_report['dimensions']['uniqueness'] = {
+            'score': avg_uniqueness,
+            'column_scores': uniqueness_scores,
+            # numpy integers are not JSON serializable by jsonify
+            'duplicate_rows': int(df.duplicated().sum())
+        }
+        # Overall score calculation
+        dimension_scores = [
+            completeness_score,
+            consistency_score,
+            validity_score,
+            avg_uniqueness
+        ]
+        quality_report['overall_score'] = float(np.mean(dimension_scores))
+        # Generate recommendations
+        if completeness_score < 80:
+            quality_report['recommendations'].append({
+                'type': 'completeness',
+                'priority': 'high',
+                'message': 'Consider imputing missing values or removing incomplete records'
+            })
+        if consistency_score < 70:
+            quality_report['recommendations'].append({
+                'type': 'consistency',
+                'priority': 'medium',
+                'message': 'Standardize text formatting and remove extra spaces'
+            })
+        if validity_score < 80:
+            quality_report['recommendations'].append({
+                'type': 'validity',
+                'priority': 'medium',
+                'message': 'Review data types and convert where appropriate'
+            })
+        return jsonify(quality_report)
+    except Exception as e:
+        logger.error(f"Data quality error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/statistical-tests', methods=['POST'])
+def run_statistical_tests():
+    """Statistical hypothesis testing endpoint"""
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        test_type = data.get('testType')
+        parameters = data.get('parameters', {})
+        if not all([session_id, file_id, test_type]):
+            return jsonify({'error': 'Session ID, File ID, and test type required'}), 400
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        results = {'test_type': test_type, 'results': {}}
+        if test_type == 'normality':
+            column = parameters.get('column')
+            if not column or column not in df.columns:
+                return jsonify({'error': 'Column not specified or not found'}), 400
+            data_col = df[column].dropna()
+            # Shapiro-Wilk test
+            shapiro_stat, shapiro_p = stats.shapiro(data_col.sample(min(5000, len(data_col))))
+            # Anderson-Darling test
+            anderson_result = stats.anderson(data_col)
+            results['results'] = {
+                'shapiro_wilk': {
+                    'statistic': shapiro_stat,
+                    'p_value': shapiro_p,
+                    'is_normal': shapiro_p > 0.05
+                },
+                'anderson_darling': {
+                    'statistic': anderson_result.statistic,
+                    'critical_values': anderson_result.critical_values.tolist(),
+                    'significance_levels': anderson_result.significance_level.tolist()
+                }
+            }
+        elif test_type == 'correlation_significance':
+            col1 = parameters.get('column1')
+            col2 = parameters.get('column2')
+            if not all([col1, col2]) or col1 not in df.columns or col2 not in df.columns:
+                return jsonify({'error': 'Both columns must be specified and exist'}), 400
+            # Pearson correlation
+            pearson_corr, pearson_p = stats.pearsonr(df[col1].dropna(), df[col2].dropna())
+            # Spearman correlation
+            spearman_corr, spearman_p = stats.spearmanr(df[col1].dropna(), df[col2].dropna())
+            results['results'] = {
+                'pearson': {
+                    'correlation': pearson_corr,
+                    'p_value': pearson_p,
+                    'significant': pearson_p < 0.05
+                },
+                'spearman': {
+                    'correlation': spearman_corr,
+                    'p_value': spearman_p,
+                    'significant': spearman_p < 0.05
+                }
+            }
+        elif test_type == 'group_comparison':
+            group_col = parameters.get('groupColumn')
+            value_col = parameters.get('valueColumn')
+            if not all([group_col, value_col]):
+                return jsonify({'error': 'Group and value columns required'}), 400
+            groups = [group for name, group in df.groupby(group_col)[value_col] if len(group) > 1]
+            if len(groups) == 2:
+                # Two-sample t-test
+                t_stat, t_p = stats.ttest_ind(groups[0], groups[1])
+                # Mann-Whitney U test
+                u_stat, u_p = stats.mannwhitneyu(groups[0], groups[1])
+                results['results'] = {
+                    'two_sample_ttest': {
+                        'statistic': t_stat,
+                        'p_value': t_p,
+                        'significant': t_p < 0.05
+                    },
+                    'mann_whitney_u': {
+                        'statistic': u_stat,
+                        'p_value': u_p,
+                        'significant': u_p < 0.05
+                    }
+                }
+            elif len(groups) > 2:
+                # ANOVA
+                f_stat, f_p = stats.f_oneway(*groups)
+                # Kruskal-Wallis test
+                h_stat, h_p = stats.kruskal(*groups)
+                results['results'] = {
+                    'anova': {
+                        'statistic': f_stat,
+                        'p_value': f_p,
+                        'significant': f_p < 0.05
+                    },
+                    'kruskal_wallis': {
+                        'statistic': h_stat,
+                        'p_value': h_p,
+                        'significant': h_p < 0.05
+                    }
+                }
+        else:
+            return jsonify({'error': 'Unsupported test type'}), 400
+        return jsonify(results)
+    except Exception as e:
+        logger.error(f"Statistical test error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/analysis-history/<session_id>', methods=['GET'])
+def get_analysis_history(session_id):
+    """Get analysis history for a session"""
+    try:
+        if session_id not in analysis_history:
+            return jsonify({'history': []})
+        return jsonify({'history': list(analysis_history[session_id].values())})
+    except Exception as e:
+        logger.error(f"History error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/api/export-report', methods=['POST'])
+def export_analysis_report():
+    """Export comprehensive analysis report"""
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        analyses = data.get('analyses', [])  # List of analysis result IDs
         if not session_id:
             return jsonify({'error': 'Session ID required'}), 400
+        # Compile report
+        report = {
+            'session_id': session_id,
+            'generated_at': datetime.now().isoformat(),
+            'analyses': [],
+            'summary': {
+                'total_analyses': len(analyses),
+                'data_files_processed': len(file_storage.get(session_id, {})),
+                'recommendations': []
+            }
+        }
+        # Load each analysis result
+        for analysis_id in analyses:
+            try:
+                result_files = [
+                    f for f in os.listdir(os.path.join(PROCESSED_FOLDER, session_id))
+                    if f.startswith(analysis_id)
+                ]
+                if result_files:
+                    filepath = os.path.join(PROCESSED_FOLDER, session_id, result_files[0])
+                    with open(filepath, 'r') as f:
+                        analysis_data = json.load(f)
+                        report['analyses'].append({
+                            'id': analysis_id,
+                            'type': result_files[0].split('_')[1].split('.')[0],
+                            'data': analysis_data
+                        })
+            except Exception as e:
+                logger.error(f"Error loading analysis {analysis_id}: {str(e)}")
+                continue
+        # Generate summary recommendations
+        if report['analyses']:
+            report['summary']['recommendations'] = [
+                "Review data quality scores and address high-priority issues",
+                "Consider feature engineering for improved model performance",
+                "Validate statistical assumptions before drawing conclusions",
+                "Monitor model performance with cross-validation results"
+            ]
+        # Save report
+        report_id = str(uuid.uuid4())
+        report_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(report_dir, exist_ok=True)
+        report_filepath = os.path.join(report_dir, f"{report_id}_report.json")
+        with open(report_filepath, 'w') as f:
+            json.dump(report, f, indent=2, default=str)
+        return jsonify({
+            'reportId': report_id,
+            'message': 'Report generated successfully',
+            'downloadUrl': f'/api/download/{report_id}?sessionId={session_id}&format=json'
+        })
+    except Exception as e:
+        logger.error(f"Report export error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+# Update existing endpoints with enhanced functionality
+@app.route('/api/preview/<file_id>', methods=['GET'])
+def preview_file(file_id):
+    try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400
+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+        # Enhanced preview with data insights
+        preview_data = {
+            'basic_info': {
+                'columns': df.columns.tolist(),
+                'dtypes': df.dtypes.astype(str).to_dict(),
+                'shape': df.shape,
+                'memory_usage': df.memory_usage(deep=True).sum()
+            },
+            'sample_data': {
+                'head': df.head(5).to_dict('records'),
+                'tail': df.tail(5).to_dict('records')
+            },
+            'data_quality': {
+                'missing_values': df.isnull().sum().to_dict(),
+                'duplicate_rows': df.duplicated().sum(),
+                'unique_values': df.nunique().to_dict()
+            },
+            'quick_stats': {}
+        }
+        # Quick statistics for numeric columns
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        if len(numeric_cols) > 0:
+            preview_data['quick_stats']['numeric'] = df[numeric_cols].describe().to_dict()
+        # Quick statistics for categorical columns
+        categorical_cols = df.select_dtypes(include=['object']).columns
+        if len(categorical_cols) > 0:
+            preview_data['quick_stats']['categorical'] = {}
+            for col in categorical_cols[:5]:  # Limit to first 5 categorical columns
+                preview_data['quick_stats']['categorical'][col] = {
+                    'top_values': df[col].value_counts().head(5).to_dict()
+                }
+        return jsonify(preview_data)
     except Exception as e:
+        logger.error(f"Preview error: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/', methods=['GET'])
 def home():
     return jsonify({
+        'message': 'Enterprise Data Analytics Platform',
+        'version': '2.0.0-enterprise',
+        'features': {
+            'core': ['data_profiling', 'quality_assessment', 'statistical_tests'],
+            'machine_learning': ['automl', 'clustering', 'feature_engineering'],
+            'time_series': ['trend_analysis', 'forecasting', 'anomaly_detection'],
+            'visualization': ['advanced_charts', 'interactive_plots', 'correlation_heatmaps'],
+            'enterprise': ['report_generation', 'analysis_history', 'data_governance']
+        },
         'endpoints': {
+            'data_management': ['/api/upload', '/api/preview/<file_id>', '/api/profile/<file_id>'],
+            'analytics': ['/api/automl', '/api/clustering', '/api/timeseries'],
+            'quality': ['/api/data-quality', '/api/statistical-tests'],
+            'visualization': ['/api/advanced-visualization'],
+            'enterprise': ['/api/export-report', '/api/analysis-history/<session_id>']
         },
         'timestamp': datetime.now().isoformat()
     })
 if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)  # Production ready