Spaces:

mgbam
/

DataBiz

Sleeping

App Files Community

mgbam committed on Jan 28

Commit

f748c28

verified ·

1 Parent(s): 659fba8

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -94

app.py CHANGED Viewed

@@ -1,80 +1,19 @@
 import streamlit as st
 import numpy as np
 import pandas as pd
-from smolagents import CodeAgent, tool
 from typing import Union, List, Dict, Optional
 import matplotlib.pyplot as plt
 import seaborn as sns
 import os
-from groq import Groq
 import base64
 import io
-class GroqLLM:
-    """Compatible LLM interface for smolagents CodeAgent."""
-    def __init__(self, model_name="llama-3.1-8B-Instant"):
-        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-        self.model_name = model_name
-    def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
-        """Make the class callable as required by smolagents."""
-        try:
-            # Handle different prompt formats
-            if isinstance(prompt, (dict, list)):
-                prompt_str = str(prompt)
-            else:
-                prompt_str = str(prompt)
-            # Create a properly formatted message
-            completion = self.client.chat.completions.create(
-                model=self.model_name,
-                messages=[{
-                    "role": "user",
-                    "content": prompt_str
-                }],
-                temperature=0.7,
-                max_tokens=1024,
-                stream=False
-            )
-            # Ensure the response is properly formatted
-            if completion.choices and hasattr(completion.choices[0].message, 'content'):
-                return completion.choices[0].message.content
-            else:
-                return "Error: No valid response generated from the model."
-        except Exception as e:
-            error_msg = f"Error generating response: {str(e)}"
-            print(error_msg)
-            return error_msg
-class DataAnalysisAgent(CodeAgent):
-    """Extended CodeAgent with dataset awareness."""
-    def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._dataset = dataset
-    @property
-    def dataset(self) -> pd.DataFrame:
-        """Access the stored dataset."""
-        return self._dataset
-    def run(self, prompt: str) -> str:
-        """Override run method to include dataset context."""
-        dataset_info = f"""
-        Dataset Shape: {self.dataset.shape}
-        Columns: {', '.join(self.dataset.columns)}
-        Data Types: {self.dataset.dtypes.to_dict()}
-        """
-        enhanced_prompt = f"""
-        Analyze the following dataset:
-        {dataset_info}
-        Task: {prompt}
-        Use the provided tools to analyze this specific dataset and return detailed results.
-        """
-        return super().run(enhanced_prompt)
 @tool
 def analyze_basic_stats(data: pd.DataFrame) -> str:
@@ -87,9 +26,6 @@ def analyze_basic_stats(data: pd.DataFrame) -> str:
         str: A string containing formatted basic statistics for each numerical column,
             including mean, median, standard deviation, skewness, and missing value counts.
     """
-    if data is None:
-        data = tool.agent.dataset
     stats = {}
     numeric_cols = data.select_dtypes(include=[np.number]).columns
@@ -114,9 +50,6 @@ def generate_correlation_matrix(data: pd.DataFrame) -> str:
     Returns:
         str: A base64 encoded string representing the correlation matrix plot image.
     """
-    if data is None:
-        data = tool.agent.dataset
     numeric_data = data.select_dtypes(include=[np.number])
     plt.figure(figsize=(10, 8))
@@ -139,9 +72,6 @@ def analyze_categorical_columns(data: pd.DataFrame) -> str:
         str: A string containing formatted analysis results for each categorical column,
             including unique value counts, top categories, and missing value counts.
     """
-    if data is None:
-        data = tool.agent.dataset
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
     analysis = {}
@@ -165,9 +95,6 @@ def suggest_features(data: pd.DataFrame) -> str:
         str: A string containing suggestions for feature engineering based on
             the characteristics of the input data.
     """
-    if data is None:
-        data = tool.agent.dataset
     suggestions = []
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
@@ -204,13 +131,14 @@ def main():
                 data = pd.read_csv(uploaded_file)
                 st.session_state['data'] = data
-                # Initialize the agent with the dataset
-                st.session_state['agent'] = DataAnalysisAgent(
-                    dataset=data,
-                    tools=[analyze_basic_stats, generate_correlation_matrix,
-                           analyze_categorical_columns, suggest_features],
-                    model=GroqLLM(),
-                    additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
                 )
                 st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
@@ -227,16 +155,14 @@ def main():
             if analysis_type == "Basic Statistics":
                 with st.spinner('Analyzing basic statistics...'):
                     result = st.session_state['agent'].run(
-                        "Use the analyze_basic_stats tool to analyze this dataset and "
-                        "provide insights about the numerical distributions."
                     )
                     st.write(result)
             elif analysis_type == "Correlation Analysis":
                 with st.spinner('Generating correlation matrix...'):
                     result = st.session_state['agent'].run(
-                        "Use the generate_correlation_matrix tool to analyze correlations "
-                        "and explain any strong relationships found."
                     )
                     if isinstance(result, str) and result.startswith('data:image') or ',' in result:
                         st.image(f"data:image/png;base64,{result.split(',')[-1]}")
@@ -246,16 +172,14 @@ def main():
             elif analysis_type == "Categorical Analysis":
                 with st.spinner('Analyzing categorical columns...'):
                     result = st.session_state['agent'].run(
-                        "Use the analyze_categorical_columns tool to examine the "
-                        "categorical variables and explain the distributions."
                     )
                     st.write(result)
             elif analysis_type == "Feature Engineering":
                 with st.spinner('Generating feature suggestions...'):
                     result = st.session_state['agent'].run(
-                        "Use the suggest_features tool to recommend potential "
-                        "feature engineering steps for this dataset."
                     )
                     st.write(result)

 import streamlit as st
 import numpy as np
 import pandas as pd
+from langchain.tools import tool
+from langchain.agents import initialize_agent, AgentType
+from langchain.chat_models import ChatOpenAI
 from typing import Union, List, Dict, Optional
 import matplotlib.pyplot as plt
 import seaborn as sns
 import os
 import base64
 import io
+# Set up LangChain with OpenAI (or any other LLM)
+os.environ["OPENAI_API_KEY"] = "your-openai-api-key"  # Replace with your OpenAI API key
+llm = ChatOpenAI(model="gpt-4", temperature=0.7)
 @tool
 def analyze_basic_stats(data: pd.DataFrame) -> str:
         str: A string containing formatted basic statistics for each numerical column,
             including mean, median, standard deviation, skewness, and missing value counts.
     """
     stats = {}
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     Returns:
         str: A base64 encoded string representing the correlation matrix plot image.
     """
     numeric_data = data.select_dtypes(include=[np.number])
     plt.figure(figsize=(10, 8))
         str: A string containing formatted analysis results for each categorical column,
             including unique value counts, top categories, and missing value counts.
     """
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
     analysis = {}
         str: A string containing suggestions for feature engineering based on
             the characteristics of the input data.
     """
     suggestions = []
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
                 data = pd.read_csv(uploaded_file)
                 st.session_state['data'] = data
+                # Initialize the LangChain agent with the tools
+                tools = [analyze_basic_stats, generate_correlation_matrix,
+                         analyze_categorical_columns, suggest_features]
+                st.session_state['agent'] = initialize_agent(
+                    tools=tools,
+                    llm=llm,
+                    agent=AgentType.OPENAI_FUNCTIONS,
+                    verbose=True
                 )
                 st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
             if analysis_type == "Basic Statistics":
                 with st.spinner('Analyzing basic statistics...'):
                     result = st.session_state['agent'].run(
+                        f"Analyze the dataset and provide basic statistics: {st.session_state['data']}"
                     )
                     st.write(result)
             elif analysis_type == "Correlation Analysis":
                 with st.spinner('Generating correlation matrix...'):
                     result = st.session_state['agent'].run(
+                        f"Generate a correlation matrix for the dataset: {st.session_state['data']}"
                     )
                     if isinstance(result, str) and result.startswith('data:image') or ',' in result:
                         st.image(f"data:image/png;base64,{result.split(',')[-1]}")
             elif analysis_type == "Categorical Analysis":
                 with st.spinner('Analyzing categorical columns...'):
                     result = st.session_state['agent'].run(
+                        f"Analyze categorical columns in the dataset: {st.session_state['data']}"
                     )
                     st.write(result)
             elif analysis_type == "Feature Engineering":
                 with st.spinner('Generating feature suggestions...'):
                     result = st.session_state['agent'].run(
+                        f"Suggest feature engineering steps for the dataset: {st.session_state['data']}"
                     )
                     st.write(result)