File size: 5,728 Bytes
d1943e0
12fa967
 
 
fe02df7
12fa967
a00699a
fe02df7
 
12fa967
d1943e0
 
 
 
 
7d40c30
 
 
fe02df7
 
 
d1943e0
 
12fa967
d1943e0
7d40c30
 
 
d1943e0
7d40c30
12fa967
fe02df7
 
 
12fa967
fcc261b
 
 
 
 
 
a00699a
fcc261b
 
fe02df7
 
 
 
7d40c30
fcc261b
fe02df7
 
 
 
 
 
 
 
 
 
 
7d40c30
fcc261b
a00699a
fcc261b
 
 
 
 
a00699a
fe02df7
fcc261b
7d40c30
a00699a
 
 
7d40c30
 
fe02df7
fcc261b
 
 
 
 
 
 
fe02df7
fcc261b
 
 
 
a00699a
fe02df7
 
 
 
 
 
 
 
 
 
 
 
 
 
a00699a
fcc261b
 
 
 
7d40c30
a00699a
 
 
fcc261b
7d40c30
fe02df7
7d40c30
 
 
fe02df7
 
7d40c30
 
 
fe02df7
 
 
7d40c30
fcc261b
7d40c30
fe02df7
fcc261b
a00699a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# ui/callbacks.py

# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  This module contains the core logic for all Gradio event handlers.
#               The main analysis pipeline now includes a strategic feature
#               engineering step before analysis.

import gradio as gr
import pandas as pd
import logging
from threading import Thread

import plotly.graph_objects as go
import plotly.express as px

# --- MODIFIED IMPORT ---
# Import both the analyzer class and the new feature engineering function
from core.analyzer import DataAnalyzer, engineer_features
from core.llm import GeminiNarrativeGenerator
from core.config import settings
from core.exceptions import DataProcessingError
from modules.clustering import perform_clustering


# --- Primary Analysis Chain ---

def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
    """
    Phase 1: Load the uploaded file, engineer features, and build the analyzer.

    Validates that a file was uploaded and that ``settings.GOOGLE_API_KEY`` is
    set, reads the file into a DataFrame (CSV when the path ends in ``.csv``,
    compared case-insensitively; otherwise it is handed to
    ``pd.read_excel``), samples it down to ``settings.MAX_UI_ROWS`` rows when
    it is larger, runs ``engineer_features`` on it, and wraps the result in a
    ``DataAnalyzer``.

    Args:
        file_obj: The uploaded file. Either an object exposing the on-disk
            path as ``.name`` or a plain path string.
        progress: Gradio progress tracker used to report pipeline stages.

    Returns:
        The ``DataAnalyzer`` built on the engineered DataFrame.

    Raises:
        gr.Error: If no file was uploaded, the API key is missing, or any
            step of loading / feature engineering / analyzer creation fails.
    """
    if file_obj is None:
        raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")

    progress(0, desc="Validating configuration...")
    if not settings.GOOGLE_API_KEY:
        logging.error("Analysis attempted without GOOGLE_API_KEY set.")
        raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured. Please add it as a secret.")

    try:
        progress(0.1, desc="Loading raw data...")
        # NOTE(review): depending on the File component's `type` setting,
        # Gradio may pass a path string instead of a tempfile-like object;
        # accept both so the handler does not depend on that configuration.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name

        # Compare the extension case-insensitively so "DATA.CSV" is not
        # mistakenly routed to the Excel reader.
        if file_path.lower().endswith('.csv'):
            df_raw = pd.read_csv(file_path)
        else:
            df_raw = pd.read_excel(file_path)

        if len(df_raw) > settings.MAX_UI_ROWS:
            # Fixed seed keeps the sampled subset reproducible across runs.
            df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
            logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")

        # --- INTEGRATION POINT ---
        # Apply the feature engineering function immediately after loading
        progress(0.5, desc="Applying strategic feature engineering...")
        df_engineered = engineer_features(df_raw)
        # -------------------------

        progress(0.8, desc="Instantiating analysis engine on engineered data...")
        # The analyzer works with the transformed dataset, not the raw upload.
        analyzer = DataAnalyzer(df_engineered)

        progress(1.0, desc="Analysis complete. Generating reports...")
        return analyzer
    except Exception as e:
        # Top-level UI boundary: log the full traceback, then surface a
        # user-facing error while keeping the original exception chained.
        logging.error(f"A critical error occurred during initial analysis: {e}", exc_info=True)
        raise gr.Error(f"Analysis Failed! An unexpected error occurred: {str(e)}") from e


def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
    """
    Phase 2: Slower, multi-stage report and visual generation.

    This is a generator that yields 14-element tuples of UI updates built
    from the analyzer's data. The AI narrative is produced on a background
    thread while the profiling tables and overview figures are computed, so
    the dashboard can be shown before the narrative is ready.

    Args:
        analyzer: The ``DataAnalyzer`` produced by the initial analysis. Any
            other value clears the UI.
        progress: Gradio progress tracker used to report pipeline stages.

    Yields:
        * If ``analyzer`` is not a ``DataAnalyzer``: a single tuple of 14
          ``None`` values, then the generator ends.
        * Otherwise: first a tuple whose leading element is a "generating"
          placeholder, then the same tuple with the leading element replaced
          by the AI narrative (or by a failure notice if the narrative
          generation raised).
    """
    if not isinstance(analyzer, DataAnalyzer):
        logging.warning("generate_reports_and_visuals called without a valid analyzer. Clearing UI.")
        yield (None,) * 14
        return

    progress(0, desc="Spawning AI report thread...")
    # Single-slot list used as a mutable cell the worker thread writes into.
    ai_report_queue = [""]

    def generate_ai_report_threaded(analyzer_instance):
        # An exception raised inside a thread never reaches the generator, so
        # without this handler a failure would silently leave an empty report.
        try:
            narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY)
            ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance)
        except Exception as exc:
            logging.error(f"AI report generation failed: {exc}", exc_info=True)
            ai_report_queue[0] = f"⚠️ AI report generation failed: {exc}"

    thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
    thread.start()

    progress(0.4, desc="Generating reports and visuals...")
    meta = analyzer.metadata
    missing_df, num_df, cat_df = analyzer.get_profiling_reports()
    fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

    # Default dropdown selections: first numeric column, and the second one
    # (when it exists) for the third dropdown.
    numeric_cols = meta['numeric_cols']
    first_numeric = numeric_cols[0] if numeric_cols else None
    second_numeric = numeric_cols[1] if len(numeric_cols) > 1 else None

    initial_updates = (
        gr.update(value="⏳ Generating AI report... Dashboard is ready."),
        gr.update(value=missing_df),
        gr.update(value=num_df),
        gr.update(value=cat_df),
        gr.update(value=fig_types),
        gr.update(value=fig_missing),
        gr.update(value=fig_corr),
        gr.update(choices=numeric_cols, value=first_numeric),
        gr.update(choices=numeric_cols, value=first_numeric),
        gr.update(choices=numeric_cols, value=second_numeric),
        gr.update(choices=meta['columns']),
        gr.update(visible=bool(meta['datetime_cols'])),
        gr.update(visible=bool(meta['text_cols'])),
        gr.update(visible=len(numeric_cols) > 1),
    )
    yield initial_updates

    # Block until the narrative (or its failure notice) is available.
    thread.join()
    progress(1.0, desc="AI Report complete!")

    # Re-emit the same updates with only the report slot replaced.
    final_updates_list = list(initial_updates)
    final_updates_list[0] = gr.update(value=ai_report_queue[0])
    yield tuple(final_updates_list)


# --- Interactive Explorer & Module Callbacks ---

def create_histogram(analyzer, col):
    """
    Build a histogram with a box-plot marginal for one column of the
    analyzer's DataFrame.

    Returns an empty figure when ``analyzer`` is not a ``DataAnalyzer`` or
    when no column is selected.
    """
    if isinstance(analyzer, DataAnalyzer) and col:
        chart_title = f"<b>Distribution of {col}</b>"
        return px.histogram(analyzer.df, x=col, title=chart_title, marginal="box")
    return go.Figure()

def create_scatterplot(analyzer, x_col, y_col, color_col):
    """
    Build a scatter plot of ``y_col`` against ``x_col`` from a random sample
    of at most 10,000 rows of the analyzer's DataFrame, optionally colored
    by ``color_col``.

    Returns an empty figure when ``analyzer`` is not a ``DataAnalyzer`` or
    when either axis column is missing.
    """
    if isinstance(analyzer, DataAnalyzer) and x_col and y_col:
        # Cap the number of plotted points at 10,000 rows.
        row_cap = min(len(analyzer.df), 10000)
        plotted_rows = analyzer.df.sample(n=row_cap)
        # An empty / falsy selection means "no color grouping".
        hue = color_col if color_col else None
        return px.scatter(plotted_rows, x=x_col, y=y_col, color=hue)
    return go.Figure()

def update_clustering(analyzer, k):
    """
    Run clustering with ``k`` clusters on the analyzer's numeric columns.

    Returns the three values produced by ``perform_clustering`` (bound here
    as cluster figure, elbow figure, and summary). When ``analyzer`` is not
    a ``DataAnalyzer``, returns three no-op ``gr.update()`` objects instead.
    """
    if isinstance(analyzer, DataAnalyzer):
        frame = analyzer.df
        numeric_columns = analyzer.metadata['numeric_cols']
        cluster_plot, elbow_plot, cluster_summary = perform_clustering(frame, numeric_columns, k)
        return cluster_plot, elbow_plot, cluster_summary
    return gr.update(), gr.update(), gr.update()