Update app.py
app.py (CHANGED)
@@ -26,12 +26,27 @@ import uuid # For generating unique report IDs
 # ------------------------------
 class GroqLLM:
     """Enhanced LLM interface with support for generating natural language summaries."""
-
+
+    def __init__(self, model_name: str = "llama-3.1-8B-Instant"):
+        """
+        Initialize the GroqLLM with a specified model.
+
+        Args:
+            model_name (str): The name of the language model to use.
+        """
         self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         self.model_name = model_name

     def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
-        """
+        """
+        Make the class callable as required by smolagents.
+
+        Args:
+            prompt (Union[str, dict, List[Dict]]): The input prompt for the language model.
+
+        Returns:
+            str: The generated response from the language model.
+        """
         try:
             # Handle different prompt formats
             if isinstance(prompt, (dict, list)):
@@ -63,18 +78,39 @@ class GroqLLM:
 # ------------------------------
 class DataAnalysisAgent(CodeAgent):
     """Extended CodeAgent with dataset awareness and predictive analytics capabilities."""
+
     def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
+        """
+        Initialize the DataAnalysisAgent with the provided dataset.
+
+        Args:
+            dataset (pd.DataFrame): The dataset to analyze.
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
         super().__init__(*args, **kwargs)
         self._dataset = dataset
         self.models = {}  # To store trained models

     @property
     def dataset(self) -> pd.DataFrame:
-        """Access the stored dataset
+        """Access the stored dataset.
+
+        Returns:
+            pd.DataFrame: The dataset stored in the agent.
+        """
         return self._dataset

     def run(self, prompt: str) -> str:
-        """
+        """
+        Override the run method to include dataset context and support predictive tasks.
+
+        Args:
+            prompt (str): The task prompt for analysis.
+
+        Returns:
+            str: The result of the analysis.
+        """
         dataset_info = f"""
         Dataset Shape: {self.dataset.shape}
         Columns: {', '.join(self.dataset.columns)}
@@ -96,7 +132,22 @@ class DataAnalysisAgent(CodeAgent):

 @tool
 def analyze_basic_stats(data: pd.DataFrame) -> str:
-    """
+    """
+    Calculate and visualize basic statistical measures for numerical columns.
+
+    This function computes fundamental statistical metrics including mean, median,
+    standard deviation, skewness, and counts of missing values for all numerical
+    columns in the provided DataFrame. It also generates a bar chart visualizing
+    the mean, median, and standard deviation for each numerical feature.
+
+    Args:
+        data (pd.DataFrame): A pandas DataFrame containing the dataset to analyze.
+            The DataFrame should contain at least one numerical column
+            for meaningful analysis.
+
+    Returns:
+        str: A markdown-formatted string containing the statistics and the generated plot.
+    """
     if data is None:
         data = tool.agent.dataset

@@ -134,7 +185,21 @@ def analyze_basic_stats(data: pd.DataFrame) -> str:

 @tool
 def generate_correlation_matrix(data: pd.DataFrame) -> str:
-    """
+    """
+    Generate an interactive correlation matrix using Plotly.
+
+    This function creates an interactive heatmap visualization showing the correlations between
+    all numerical columns in the dataset. Users can hover over cells to see correlation values
+    and interact with the plot (zoom, pan).
+
+    Args:
+        data (pd.DataFrame): A pandas DataFrame containing the dataset to analyze.
+            The DataFrame should contain at least two numerical columns
+            for correlation analysis.
+
+    Returns:
+        str: An HTML string representing the interactive correlation matrix plot.
+    """
     if data is None:
         data = tool.agent.dataset

@@ -156,7 +221,21 @@ def generate_correlation_matrix(data: pd.DataFrame) -> str:

 @tool
 def analyze_categorical_columns(data: pd.DataFrame) -> str:
-    """
+    """
+    Analyze categorical columns with visualizations.
+
+    This function examines categorical columns to identify unique values, top categories,
+    and missing value counts. It also generates bar charts for the top 5 categories in each
+    categorical feature.
+
+    Args:
+        data (pd.DataFrame): A pandas DataFrame containing the dataset to analyze.
+            The DataFrame should contain at least one categorical column
+            for meaningful analysis.
+
+    Returns:
+        str: A markdown-formatted string containing analysis results and embedded plots.
+    """
     if data is None:
         data = tool.agent.dataset

@@ -197,7 +276,20 @@ def analyze_categorical_columns(data: pd.DataFrame) -> str:

 @tool
 def suggest_features(data: pd.DataFrame) -> str:
-    """
+    """
+    Suggest potential feature engineering steps based on data characteristics.
+
+    This function analyzes the dataset's structure and statistical properties to
+    recommend possible feature engineering steps that could improve model performance.
+
+    Args:
+        data (pd.DataFrame): A pandas DataFrame containing the dataset to analyze.
+            The DataFrame can contain both numerical and categorical columns.
+
+    Returns:
+        str: A string containing suggestions for feature engineering based on
+            the characteristics of the input data.
+    """
     if data is None:
         data = tool.agent.dataset

@@ -231,7 +323,21 @@ def suggest_features(data: pd.DataFrame) -> str:

 @tool
 def predictive_analysis(data: pd.DataFrame, target: str) -> str:
-    """
+    """
+    Perform predictive analytics by training a classification model.
+
+    This function builds a classification model using Random Forest, evaluates its performance,
+    and provides detailed metrics and visualizations such as the confusion matrix and ROC curve.
+
+    Args:
+        data (pd.DataFrame): A pandas DataFrame containing the dataset to analyze.
+            The DataFrame should contain the target variable for prediction.
+        target (str): The name of the target variable column in the dataset.
+
+    Returns:
+        str: A markdown-formatted string containing the classification report, confusion matrix,
+            ROC curve, AUC score, and a unique Model ID.
+    """
     if data is None:
         data = tool.agent.dataset

@@ -326,7 +432,19 @@ def predictive_analysis(data: pd.DataFrame, target: str) -> str:
 # Report Exporting Function
 # ------------------------------
 def export_report(content: str, filename: str):
-    """
+    """
+    Export the given content as a PDF report.
+
+    This function converts markdown content into a PDF file using pdfkit and provides
+    a download button for users to obtain the report.
+
+    Args:
+        content (str): The markdown content to be included in the PDF report.
+        filename (str): The desired name for the exported PDF file.
+
+    Returns:
+        None
+    """
     # Save content to a temporary HTML file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as tmp_file:
         tmp_file.write(content.encode('utf-8'))
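The hunks above only add docstrings, so the behavior they describe is easiest to see in short usage sketches. The examples that follow are illustrative only; any name, parameter, or helper not visible in the diff is an assumption, not the actual code in app.py.

First, a minimal sketch of what a __call__ like GroqLLM's typically wraps: the Groq chat-completions endpoint, with the prompt normalized into a message list. The normalization branches are assumed from the isinstance(prompt, (dict, list)) check visible in the hunk.

import os
from typing import Dict, List, Union

from groq import Groq


def call_groq(prompt: Union[str, dict, List[Dict]], model_name: str = "llama-3.1-8B-Instant") -> str:
    """Illustrative sketch: send a prompt to Groq and return the generated text."""
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    # Normalize the prompt into the chat-completions message format (assumed handling).
    if isinstance(prompt, str):
        messages = [{"role": "user", "content": prompt}]
    elif isinstance(prompt, dict):
        messages = [prompt]
    else:  # already a list of message dicts
        messages = prompt

    response = client.chat.completions.create(model=model_name, messages=messages)
    return response.choices[0].message.content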
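The DataAnalysisAgent docstrings describe a dataset-aware agent. The real class subclasses smolagents' CodeAgent; the stripped-down sketch below has no smolagents dependency and only illustrates the stored-dataset property and the dataset context that run() prepends to each prompt.

import pandas as pd


class DatasetAwareSketch:
    """Illustrative stand-in for the dataset-handling part of DataAnalysisAgent."""

    def __init__(self, dataset: pd.DataFrame):
        self._dataset = dataset

    @property
    def dataset(self) -> pd.DataFrame:
        return self._dataset

    def build_prompt(self, prompt: str) -> str:
        # Mirror the dataset_info block shown in the hunk.
        dataset_info = (
            f"Dataset Shape: {self.dataset.shape}\n"
            f"Columns: {', '.join(self.dataset.columns)}"
        )
        return f"{dataset_info}\n\nTask: {prompt}"


df = pd.DataFrame({"age": [25, 32, 47], "city": ["Austin", "Oslo", "Pune"]})
print(DatasetAwareSketch(df).build_prompt("Summarize the data"))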
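analyze_basic_stats documents mean, median, standard deviation, skewness, and missing-value counts for numerical columns. A self-contained pandas sketch of those calculations (the bar chart and markdown formatting described in the docstring are omitted):

import pandas as pd


def basic_stats(data: pd.DataFrame) -> pd.DataFrame:
    """Compute the per-column metrics named in the docstring (plotting omitted)."""
    numeric = data.select_dtypes(include="number")
    return pd.DataFrame(
        {
            "mean": numeric.mean(),
            "median": numeric.median(),
            "std": numeric.std(),
            "skew": numeric.skew(),
            "missing": numeric.isna().sum(),
        }
    )


df = pd.DataFrame({"a": [1, 2, 3, None], "b": [10.0, 12.5, 9.8, 11.1]})
print(basic_stats(df))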
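generate_correlation_matrix returns an HTML string for an interactive Plotly heatmap. A sketch of that flow, assuming a recent Plotly (text_auto on px.imshow needs version 5.5 or later):

import pandas as pd
import plotly.express as px


def correlation_matrix_html(data: pd.DataFrame) -> str:
    """Build an interactive correlation heatmap and return it as an HTML fragment."""
    corr = data.select_dtypes(include="number").corr()
    fig = px.imshow(corr, text_auto=True, title="Correlation Matrix")
    return fig.to_html(full_html=False)


df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8], "z": [5, 3, 6, 1]})
print(correlation_matrix_html(df)[:200])  # HTML snippet suitable for embedding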
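analyze_categorical_columns reports unique values, top categories, and missing counts per categorical column. A minimal pandas sketch (the bar charts described in the docstring are omitted):

import pandas as pd


def categorical_summary(data: pd.DataFrame) -> str:
    """Summarize each categorical column: unique values, top 5 categories, missing counts."""
    lines = []
    for col in data.select_dtypes(include=["object", "category"]).columns:
        top = data[col].value_counts().head(5)
        lines.append(f"Column: {col}")
        lines.append(f"  Unique values: {data[col].nunique()}")
        lines.append(f"  Missing: {data[col].isna().sum()}")
        lines.append("  Top categories: " + ", ".join(f"{k} ({v})" for k, v in top.items()))
    return "\n".join(lines)


df = pd.DataFrame({"city": ["Oslo", "Oslo", "Pune", None], "tier": ["A", "B", "A", "A"]})
print(categorical_summary(df))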
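suggest_features describes rule-based recommendations derived from the dataset's structure and statistics. The heuristics below (skewness and cardinality thresholds) are assumptions chosen for illustration, not the rules used in app.py:

import pandas as pd


def suggest_features(data: pd.DataFrame) -> list:
    """Return heuristic feature-engineering suggestions (illustrative thresholds)."""
    suggestions = []
    numeric = data.select_dtypes(include="number")
    categorical = data.select_dtypes(include=["object", "category"])

    for col in numeric.columns:
        if abs(numeric[col].skew()) > 1:
            suggestions.append(f"'{col}' is skewed; consider a log or Box-Cox transform.")
    for col in categorical.columns:
        if data[col].nunique() > 10:
            suggestions.append(f"'{col}' has high cardinality; consider frequency or target encoding.")
    if data.isna().any().any():
        suggestions.append("Missing values present; consider imputation plus missing-indicator features.")
    return suggestions


df = pd.DataFrame({"income": [20, 22, 25, 400], "city": ["Oslo", "Pune", "Lima", "Oslo"]})
print("\n".join(suggest_features(df)))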
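predictive_analysis documents a Random Forest classifier evaluated with a classification report, confusion matrix, ROC/AUC, and a unique Model ID. A compact scikit-learn sketch of that pipeline; the get_dummies preprocessing and the omission of the ROC plot are simplifications, not the app's exact steps:

import uuid

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split


def train_classifier(data: pd.DataFrame, target: str) -> str:
    """Train a Random Forest on `target` and report the metrics named in the docstring."""
    X = pd.get_dummies(data.drop(columns=[target]))  # simple encoding for illustration
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    model_id = str(uuid.uuid4())  # unique Model ID, as the docstring mentions

    return "\n".join(
        [
            f"Model ID: {model_id}",
            classification_report(y_test, y_pred),
            f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}",
            f"AUC: {roc_auc_score(y_test, y_prob):.3f}",
        ]
    )


df = pd.DataFrame({"f1": range(20), "f2": [i % 5 for i in range(20)], "label": [i % 2 for i in range(20)]})
print(train_classifier(df, target="label"))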
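export_report converts the report to PDF via pdfkit and then offers it for download. The sketch below covers only the temp-HTML-to-PDF step; pdfkit requires the external wkhtmltopdf binary, and the download button (presumably a Streamlit widget in app.py) is omitted:

import tempfile

import pdfkit  # needs the wkhtmltopdf binary installed on the system


def export_report_sketch(content: str, filename: str) -> str:
    """Write `content` to a temporary HTML file and convert it to a PDF."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
        tmp_file.write(content.encode("utf-8"))
        tmp_path = tmp_file.name

    output_pdf = filename if filename.endswith(".pdf") else f"{filename}.pdf"
    pdfkit.from_file(tmp_path, output_pdf)  # wkhtmltopdf performs the HTML-to-PDF rendering
    return output_pdf


print(export_report_sketch("<h1>Analysis Report</h1><p>Summary goes here.</p>", "analysis_report"))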
|