Spaces:

datajoi
/

Dataset-Test-Workflow

Sleeping

App Files Files Community

Mustehson committed on Oct 21, 2024

Commit

6dda383

1 Parent(s): d14334a

Refactoring & Logs

Browse files

Files changed (3) hide show

app.py +20 -48
prompt.py +0 -123
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -6,10 +6,9 @@ import pandas as pd
 import pandera as pa
 from pandera import Column
 import ydata_profiling as pp
-from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
-from prompt import PROMPT_PANDERA, PANDERA_USER_INPUT_PROMPT
 from langsmith import traceable
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
@@ -18,29 +17,6 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
 TAB_LINES = 8
 # Load Token
 md_token = os.getenv('MD_TOKEN')
-INPUT_PROMPT = '''
-Here are the first few samples of data:
-<Sample Data>
-{data}
-</Sample Data>
-'''
-USER_INPUT = '''
-Here are the first few samples of data:
-<Sample Data>
-{data}
-</Sample Data>
-Here is the User Description:
-<User Description>
-{user_description}
-</User Description>
-'''
-print('Connecting to DB...')
 # Connect to DB
 conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
@@ -60,6 +36,12 @@ for model in models:
 llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
 # Get Databases
 def get_schemas():
     schemas = conn.execute("""
@@ -84,28 +66,20 @@ def get_data_df(schema):
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
-def chat_template(system_prompt, user_prompt, df):
-    messages = [
-        SystemMessage(content=system_prompt),
-        HumanMessage(content=user_prompt.format(data=df.head().to_json(orient='records'))),
-    ]
-    return messages
-def chat_template_user(system_prompt, user_prompt, user_description, df):
-    messages = [
-        SystemMessage(content=system_prompt),
-        HumanMessage(content=user_prompt.format(data=df.head(1).to_json(orient='records'), user_description=user_description)),
-    ]
-    return messages
-@traceable()
 def run_llm(messages):
   try:
     response = llm.invoke(messages)
-    print(response.content)
     tests = json.loads(response.content)
   except Exception as e:
       return e
@@ -199,11 +173,11 @@ def main(table):
     df = get_data_df(schema)
     df_statistics, df_alerts = statistics(df)
     describe_num, describe_cat  = describe(df)
-    messages = chat_template(system_prompt=PROMPT_PANDERA, user_prompt=INPUT_PROMPT, df=df)
     tests = run_llm(messages)
     print(tests)
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
         return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests, pd.DataFrame([])
@@ -219,11 +193,9 @@ def user_results(table, text_query):
     schema = get_table_schema(table)
     df = get_data_df(schema)
-    messages = chat_template_user(system_prompt=PANDERA_USER_INPUT_PROMPT,
-                                  user_prompt=USER_INPUT, user_description=text_query,
-                                  df=df)
-    print(messages)
     tests = run_llm(messages)
     print(f'Generated Tests from user input: {tests}')
     if isinstance(tests, Exception):

 import pandera as pa
 from pandera import Column
 import ydata_profiling as pp
 from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
 from langsmith import traceable
+from langchain import hub
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 TAB_LINES = 8
 # Load Token
 md_token = os.getenv('MD_TOKEN')
 # Connect to DB
 conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
 llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
+prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
+prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
 # Get Databases
 def get_schemas():
     schemas = conn.execute("""
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
+def format_prompt(df):
+  return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'))
+def format_user_prompt(df, user_description):
+  return prompt_user_input.format_prompt(data=df.head(2).to_json(orient='records'), user_description=user_description)
+def process_inputs(inputs) :
+    print(inputs)
+    return {'input_query': inputs['messages'].to_messages()[1]}
+@traceable(process_inputs=process_inputs)
 def run_llm(messages):
   try:
     response = llm.invoke(messages)
     tests = json.loads(response.content)
   except Exception as e:
       return e
     df = get_data_df(schema)
     df_statistics, df_alerts = statistics(df)
     describe_num, describe_cat  = describe(df)
+    messages = format_prompt(df=df)
     tests = run_llm(messages)
     print(tests)
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
         return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests, pd.DataFrame([])
     schema = get_table_schema(table)
     df = get_data_df(schema)
+    messages = format_user_prompt(df=df, user_description=text_query)
     tests = run_llm(messages)
     print(f'Generated Tests from user input: {tests}')
     if isinstance(tests, Exception):

prompt.py DELETED Viewed

@@ -1,123 +0,0 @@
-PROMPT_PANDERA = """
-You are a data quality engineer. Your role is to create deterministic rules to validate the quality of a dataset using **Pandera**.
-You will be provided with the first few rows of data below that represents the dataset for which you need to create validation rules. Please note that this is only a sample of the data, and there may be additional rows and categorical columns that are not fully represented in the sample. Keep in mind that the sample may not cover all possible values, but the validation rules must handle all data in the dataset.
-Follow this process:
-1. **Observe the sample data.**
-2. **For each column**, create a validation rule using Pandera syntax.
-    Here are the valid Pandera check class methods. DO NOT USE ANY METHODS OTHER THAN THE ONES GIVEN BELOW:
-    DO NOT USE SINGLE backslashes \  BUT USE DOUBLE backslashes  \\ IN PATTERNS
-    USE THE CORRECT SYNTAX AS SHOWN BELOW
-    [
-    'pa.Check.between(min_value, max_value, include_min=True, include_max=True, **kwargs)',
-    'pa.Check.eq(value, **kwargs)',
-    'pa.Check.equal_to(value, **kwargs)',
-    'pa.Check.ge(min_value, **kwargs)',
-    'pa.Check.greater_than(min_value, **kwargs)',
-    'pa.Check.greater_than_or_equal_to(min_value, **kwargs)',
-    'pa.Check.gt(min_value, **kwargs)',
-    'pa.Check.in_range(min_value, max_value, include_min=True, include_max=True, **kwargs)',
-    'pa.Check.isin(allowed_values, **kwargs)',
-    'pa.Check.le(max_value, **kwargs)',
-    'pa.Check.less_than(max_value, **kwargs)',
-    'pa.Check.less_than_or_equal_to(max_value, **kwargs)',
-    'pa.Check.lt(max_value, **kwargs)',
-    'pa.Check.ne(value, **kwargs)',
-    'pa.Check.not_equal_to(value, **kwargs)',
-    'pa.Check.notin(forbidden_values, **kwargs)',
-    'pa.Check.str_contains(pattern, **kwargs)',
-    'pa.Check.str_endswith(string, **kwargs)',
-    'pa.Check.str_length(min_value=None, max_value=None, **kwargs)',
-    'pa.Check.str_matches(pattern, **kwargs)',
-    'pa.Check.str_startswith(string, **kwargs)',
-    'pa.Check.unique_values_eq(values, **kwargs)'
-    ]
-3. Ensure that each rule specifies the expected data type and applies necessary checks such as:
-      The name argument should be a valid column name. DO NOT USE ANY OTHER PANDERA
-   - **Data Type Validation** (e.g., `pa.Column(int, nullable=False, name="age")` ensures integers)
-   - **Non-null Check** (e.g., `pa.Column(str, nullable=False, name="name")` to ensure no nulls are allowed)
-   - **Unique Value Check** (e.g., `pa.Column(int, unique=True, name="ID")` for uniqueness)
-   - **Range or Bound Checks** (e.g., `pa.Column(float, checks=pa.Check.in_range(min_value=0, max_value=100), name="score")` for numerical ranges)
-   - **Allowed Value Checks** (e.g., `pa.Column(str, checks=pa.Check.isin([value1, value2]), name="gender")` to restrict values to a set)
-   - **Custom Validation Logic** using `pa.Column(int, checks=pa.Check(lambda x: x % 2 == 0), name="even_number")` with lambda functions (e.g., custom logic for even numbers or string patterns)
-  FOR DATETIME OR DATE COLUMNS USE THE VALIDATION BELOW. DO NOT CONSIDER THEM AS INT OR FLOAT
-   - **DateTime or Date Validation** (e.g., `pa.Column(pa.dtypes.Timestamp, nullable=False, name="date_column")` to ensure dates or datetimes)
-   For each column, provide a **column name**, **rule name** and a pandera_rule. Example structure (It should be list of dicts):
-   [
-     {
-       "column_name": "age",
-       "rule_name": "Ensure Column is Integer",
-       "pandera_rule": "Column(int, nullable=False, name='age')"
-     },
-     {
-       "column_name": "ID",
-       "rule_name": "Unique Identifier Check",
-       "pandera_rule": "Column(int, unique=True, name='ID')"
-     }
-   ]
-4. Repeat this process for at most 5 columns in the dataset. If the data has fewer than 5 columns, then include all columns. Group all the rules into a single JSON object and ensure that there is at least one validation rule for each column.
-Return the final rules as a single JSON object, ensuring that each column is thoroughly validated based on the observations of the sample data.
-DO NOT OUTPUT ANYTHING OR ANY EXPLANATION OTHER THAN JSON OBJECT
-"""
-PANDERA_USER_INPUT_PROMPT = """
-You are a data quality engineer. Your role is to assist the user in creating deterministic rules to validate the quality of a dataset using **Pandera**.
-You will be provided with the first few rows of data below that represents the dataset for which you need to help the user create validation rules. Please note that this is only a sample of the data, and there may be additional rows and categorical columns that are not fully represented in the sample. Keep in mind that the sample may not cover all possible values, but the validation rules must handle all data in the dataset.
-Follow this process:
-1. **Observe the sample data.**
-2. Observe description and create a valid check
-   Here are the valid **Pandera** Checks that you can use:
-   1. 'pa.Check.between(min_value, max_value, include_min=True, include_max=True, **kwargs)'
-   2. 'pa.Check.eq(value, **kwargs)' Checks if a value is equal to the specified value.
-   3. 'pa.Check.equal_to(value, **kwargs)' Alias for eq(). Checks if a value is equal to the specified value.
-   4. 'pa.Check.ge(min_value, **kwargs)' Checks if a value is greater than or equal to the specified minimum value.
-   5. 'pa.Check.greater_than(min_value, **kwargs)' Checks if a value is strictly greater than the specified minimum value.
-   6. 'pa.Check.greater_than_or_equal_to(min_value, **kwargs)' Checks if a value is greater than or equal to the specified minimum value.
-   7. 'pa.Check.gt(min_value, **kwargs)' Alias for greater_than(). Checks if a value is strictly greater than the specified minimum value.
-   8. 'pa.Check.in_range(min_value, max_value, include_min=True, include_max=True, **kwargs)' Checks if a value is within the specified range. By default, it's inclusive of both min and max values.
-   9. 'pa.Check.isin(allowed_values, **kwargs)' Checks if a value is in the set of allowed values.
-   10. 'pa.Check.le(max_value, **kwargs)' Checks if a value is less than or equal to the specified maximum value.
-   11. 'pa.Check.less_than(max_value, **kwargs)' Checks if a value is strictly less than the specified maximum value.
-   12. 'pa.Check.less_than_or_equal_to(max_value, **kwargs)' Checks if a value is less than or equal to the specified maximum value.
-   13. 'pa.Check.lt(max_value, **kwargs)' Checks if a value is strictly less than the specified maximum value.
-   14. 'pa.Check.ne(value, **kwargs)' Checks if a value is not equal to the specified value.
-   15. 'pa.Check.not_equal_to(value, **kwargs)' Checks if a value is not equal to the specified value.
-   16. 'pa.Check.notin(forbidden_values, **kwargs)' Checks if a value is not in the set of forbidden values.
-   17. 'pa.Check.str_contains(pattern, **kwargs)' Checks if a string contains the specified pattern.
-   18. 'pa.Check.str_endswith(string, **kwargs)' Checks if a string ends with the specified substring.
-   19. 'pa.Check.str_length(min_value=None, max_value=None, **kwargs)' Checks if the length of a string is within the specified range.
-   20. 'pa.Check.str_matches(pattern, **kwargs)' Checks if a string matches the specified regular expression pattern.
-   21. 'pa.Check.str_startswith(string, **kwargs)' Checks if a string starts with the specified substring.
-   22. 'pa.Check.unique_values_eq(values, **kwargs)' Checks if the unique values in a column are equal to the specified set of values.
-   23. 'pa.Check(lambda x: x )' with lambda functions for custom logic.
-   **ALWAYS USE THE COMPLETE PANDERA SYNTAX**
-3. For each column, generate a **column name**, **rule name**, and a **Pandera rule** based on the user’s description. Example structure (It should be list of dicts):
-   [
-    {
-        "column_name": "unique_key",
-        "rule_name": "Unique Identifiers",
-        "pandera_rule": "pa.Column(int, nullable=False, unique=True, name='unique_key')"
-    }
-   ]
-4. Repeat this process for a maximum of 5 columns or based on user input. Group all the rules into a single JSON object and return it.
-IMPORTANT: You should only generate rules based on the user’s input for each column. Return the final rules as a single JSON object, ensuring that the user's instructions are reflected in the validations.
-DO NOT RETURN ANYTHING OR ANY EXPLANATION OTHER THAN JSON """

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ duckdb==1.1.1
 langsmith==0.1.135
 pandera==0.20.4
 ydata-profiling==v4.11.0
-langchain-core==0.3.12

 langsmith==0.1.135
 pandera==0.20.4
 ydata-profiling==v4.11.0
+langchain-core==0.3.12
+langchain==0.3.4