Mustehson committed
Commit 99c2740 · 1 Parent(s): eab6d7f

Added Text to Validation

Files changed (2)
  1. app.py +80 -11
  2. prompt.py +53 -0
app.py CHANGED
@@ -7,7 +7,9 @@ import pandera as pa
 from pandera import Column
 import ydata_profiling as pp
 from huggingface_hub import InferenceClient
-from prompt import PROMPT_PANDERA
+from prompt import PROMPT_PANDERA, PANDERA_USER_INPUT_PROMPT
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
 
 
 # Height of the Tabs Text Area
@@ -17,6 +19,7 @@ md_token = os.getenv('MD_TOKEN')
 os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')
 
 
+
 INPUT_PROMPT = '''
 Here are the first few samples of data:
 <Sample Data>
@@ -25,6 +28,19 @@ Here are the first few samples of data:
 '''
 
 
+USER_INPUT = '''
+Here are the first few samples of data:
+<Sample Data>
+{data}
+</Sample Data>
+
+Here is the User Description:
+<User Description>
+{user_description}
+</User Description>
+'''
+
+
 print('Connecting to DB...')
 # Connect to DB
 conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
@@ -53,11 +69,23 @@ def get_data_df(schema):
     print('Getting Dataframe from the Database')
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
 
-def run_llm(df):
-    messages=[
-        {"role": "system", "content": PROMPT_PANDERA},
-        {"role": "user", "content": INPUT_PROMPT.format(data=df.head().to_json(orient='records'))},
+
+def chat_template(system_prompt, user_prompt, df):
+
+    messages=[
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt.format(data=df.head().to_json(orient='records'))},
     ]
+    return messages
+
+def chat_template_user(system_prompt, user_prompt, user_description, df):
+
+    messages=[
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt.format(data=df.head(1).to_json(orient='records'), user_description=user_description)},
+    ]
+    return messages
+
+def run_llm(messages):
     try:
         response = client.chat_completion(messages, max_tokens=1024)
         print(response.choices[0].message.content)
@@ -149,8 +177,10 @@ def main(table):
     df = get_data_df(schema)
     df_statistics, df_alerts = statistics(df)
     describe_num, describe_cat = describe(df)
+
+    messages = chat_template(system_prompt=PROMPT_PANDERA, user_prompt=INPUT_PROMPT, df=df)
 
-    tests = run_llm(df)
+    tests = run_llm(messages)
     print(tests)
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
@@ -162,10 +192,34 @@ def main(table):
 
     return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests_df, pandera_results
 
+def user_results(table, text_query):
+
+    schema = get_table_schema(table)
+    df = get_data_df(schema)
+
+    messages = chat_template_user(system_prompt=PANDERA_USER_INPUT_PROMPT,
+                                  user_prompt=USER_INPUT, user_description=text_query,
+                                  df=df)
+    print(messages)
+    tests = run_llm(messages)
+    print(f'Generated Tests from user input: {tests}')
+
+    if isinstance(tests, Exception):
+        tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
+        return tests, pd.DataFrame([])
+
+    tests_df = pd.DataFrame(tests)
+    tests_df.rename(columns={tests_df.columns[0]: 'Column', tests_df.columns[1]: 'Rule Name', tests_df.columns[2]: 'Rules'}, inplace=True)
+    pandera_results = validate_pandera(tests, df)
+    print('Validated Tests with Pandera')
+
+    return tests_df, pandera_results
+
+
 # Custom CSS styling
 custom_css = """
 .gradio-container {
     background-color: #f0f4f8;
 }
 .logo {
     max-width: 200px;
@@ -196,7 +250,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
         schema_dropdown = gr.Dropdown(choices=get_schemas(), label="Select Schema", interactive=True)
         tables_dropdown = gr.Dropdown(choices=[], label="Available Tables", value=None)
         with gr.Row():
-            generate_query_button = gr.Button("Validate Data", variant="primary")
+            generate_result = gr.Button("Validate Data", variant="primary")
 
     with gr.Column(scale=2):
         with gr.Tabs():
@@ -220,11 +274,26 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
 
             with gr.Tab("Data"):
                 result_output = gr.DataFrame(label="Dataframe (10 Rows)", value=[], interactive=False)
-
+
+            with gr.Tab('Text to Validation'):
+                with gr.Row():
+                    query_input = gr.Textbox(lines=5, label="Text Query", placeholder="Enter Text Query to Generate Validation e.g. Validate that the incident_zip column contains valid 5-digit ZIP codes.")
+                with gr.Row():
+                    with gr.Column():
+                        pass
+                    with gr.Column(scale=1, min_width=50):
+                        user_generate_result = gr.Button("Validate Data", variant="primary")
+
+                with gr.Row():
+                    with gr.Column():
+                        query_tests = gr.DataFrame(label="Validation Rules", value=[], interactive=False)
+                    with gr.Column():
+                        query_result = gr.DataFrame(label="Validation Result", value=[], interactive=False)
+
     schema_dropdown.change(update_table_names, inputs=schema_dropdown, outputs=tables_dropdown)
-    generate_query_button.click(main, inputs=[tables_dropdown], outputs=[result_output, data_description, data_alerts, describe_cat, describe_num, tests_output, test_result_output])
-
-
+    generate_result.click(main, inputs=[tables_dropdown], outputs=[result_output, data_description, data_alerts, describe_cat, describe_num, tests_output, test_result_output])
+    user_generate_result.click(user_results, inputs=[tables_dropdown, query_input], outputs=[query_tests, query_result])
 
 if __name__ == "__main__":
     demo.launch(debug=True)
+
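
The new `user_results` path calls a `validate_pandera(tests, df)` helper that is not part of this diff. As context, here is a minimal, hypothetical sketch of how such a helper could apply the generated rules, assuming the LLM returns a list of objects with `column_name` and `pandera_rule` keys (as in the prompt's example) and that each `pandera_rule` string evaluates to a pandera `Column`:

```python
# Hypothetical sketch only -- validate_pandera() is not shown in this commit.
# Assumes `tests` is a list of dicts like:
#   {"column_name": "OS", "rule_name": "...", "pandera_rule": "Column(str, pa.Check.isin([...]), name='OS')"}
import pandas as pd
import pandera as pa
from pandera import Column

def validate_pandera(tests, df):
    results = []
    for rule in tests:
        col, rule_str = rule["column_name"], rule["pandera_rule"]
        try:
            # Evaluate the rule string into a pandera Column and wrap it in a one-column schema.
            column = eval(rule_str, {"pa": pa, "Column": Column})
            schema = pa.DataFrameSchema({col: column})
            schema.validate(df, lazy=True)
            results.append({"Column": col, "Status": "✅ Passed", "Details": ""})
        except pa.errors.SchemaErrors as err:
            # lazy=True collects every failing check instead of stopping at the first one.
            results.append({"Column": col, "Status": "❌ Failed",
                            "Details": f"{len(err.failure_cases)} failing checks"})
        except Exception as err:
            # The LLM may emit a malformed rule string; surface it instead of crashing the app.
            results.append({"Column": col, "Status": "⚠️ Invalid rule", "Details": str(err)})
    return pd.DataFrame(results)
```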
prompt.py CHANGED
@@ -66,3 +66,56 @@ Return the final rules as a single JSON object, ensuring that each column is tho
 
 DO NOT OUTPUT ANYTHING OR ANY EXPLANATION OTHER THAN JSON OBJECT
 """
+
+
+PANDERA_USER_INPUT_PROMPT = """
+You are a data quality engineer. Your role is to assist the user in creating deterministic rules to validate the quality of a dataset using **Pandera**.
+You will be provided with the first few rows of data below that represent the dataset for which you need to help the user create validation rules. Please note that this is only a sample of the data, and there may be additional rows and categorical values that are not fully represented in the sample. Keep in mind that the sample may not cover all possible values, but the validation rules must handle all data in the dataset.
+
+Follow this process:
+
+1. **Observe the sample data.**
+2. Observe the user description and create a valid Pandera check.
+
+Here are the valid **Pandera** Checks that you can use:
+1. 'pa.Check.between(min_value, max_value, include_min=True, include_max=True, **kwargs)'
+2. 'pa.Check.eq(value, **kwargs)' Checks if a value is equal to the specified value.
+3. 'pa.Check.equal_to(value, **kwargs)' Alias for eq(). Checks if a value is equal to the specified value.
+4. 'pa.Check.ge(min_value, **kwargs)' Checks if a value is greater than or equal to the specified minimum value.
+5. 'pa.Check.greater_than(min_value, **kwargs)' Checks if a value is strictly greater than the specified minimum value.
+6. 'pa.Check.greater_than_or_equal_to(min_value, **kwargs)' Checks if a value is greater than or equal to the specified minimum value.
+7. 'pa.Check.gt(min_value, **kwargs)' Alias for greater_than(). Checks if a value is strictly greater than the specified minimum value.
+8. 'pa.Check.in_range(min_value, max_value, include_min=True, include_max=True, **kwargs)' Checks if a value is within the specified range. By default, it's inclusive of both min and max values.
+9. 'pa.Check.isin(allowed_values, **kwargs)' Checks if a value is in the set of allowed values.
+10. 'pa.Check.le(max_value, **kwargs)' Checks if a value is less than or equal to the specified maximum value.
+11. 'pa.Check.less_than(max_value, **kwargs)' Checks if a value is strictly less than the specified maximum value.
+12. 'pa.Check.less_than_or_equal_to(max_value, **kwargs)' Checks if a value is less than or equal to the specified maximum value.
+13. 'pa.Check.lt(max_value, **kwargs)' Checks if a value is strictly less than the specified maximum value.
+14. 'pa.Check.ne(value, **kwargs)' Checks if a value is not equal to the specified value.
+15. 'pa.Check.not_equal_to(value, **kwargs)' Checks if a value is not equal to the specified value.
+16. 'pa.Check.notin(forbidden_values, **kwargs)' Checks if a value is not in the set of forbidden values.
+17. 'pa.Check.str_contains(pattern, **kwargs)' Checks if a string contains the specified pattern.
+18. 'pa.Check.str_endswith(string, **kwargs)' Checks if a string ends with the specified substring.
+19. 'pa.Check.str_length(min_value=None, max_value=None, **kwargs)' Checks if the length of a string is within the specified range.
+20. 'pa.Check.str_matches(pattern, **kwargs)' Checks if a string matches the specified regular expression pattern.
+21. 'pa.Check.str_startswith(string, **kwargs)' Checks if a string starts with the specified substring.
+22. 'pa.Check.unique_values_eq(values, **kwargs)' Checks if the unique values in a column are equal to the specified set of values.
+23. 'pa.Check(lambda x: x)' with lambda functions for custom logic.
+24. 'pa.Column(int, nullable=False, unique=True, name='column_name')' for unique values.
+**ALWAYS USE THE COMPLETE PANDERA SYNTAX**
+
+3. For each column, generate a **column name**, **rule name**, and a **Pandera rule** based on the user's description. Example structure:
+
+```json
+[
+  {
+    "column_name": "OS",
+    "rule_name": "Allowed Operating Systems",
+    "pandera_rule": "Column(str, pa.Check.isin(['macOS', 'Windows', 'Linux']), nullable=False, name='OS')"
+  }
+]
+```
+4. Repeat this process for a maximum of 5 columns or based on user input. Group all the rules into a single JSON object and return it.
+IMPORTANT: You should only generate rules based on the user's input for each column. Return the final rules as a single JSON object, ensuring that the user's instructions are reflected in the validations.
+
+DO NOT RETURN ANYTHING OR ANY EXPLANATION OTHER THAN JSON """
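
For illustration only (none of the following is in the repo): with the new prompt, the placeholder query from the 'Text to Validation' tab ("Validate that the incident_zip column contains valid 5-digit ZIP codes.") should yield a rule string along these lines, which can be sanity-checked directly with pandera:

```python
# Illustrative example of the kind of rule string the prompt asks the model to return,
# and a quick local check that it behaves as intended. Not part of the commit.
import pandas as pd
import pandera as pa
from pandera import Column

rule_str = "Column(str, pa.Check.str_matches(r'^\\d{5}$'), nullable=False, name='incident_zip')"
schema = pa.DataFrameSchema({"incident_zip": eval(rule_str, {"pa": pa, "Column": Column})})

good = pd.DataFrame({"incident_zip": ["10001", "94103"]})
bad = pd.DataFrame({"incident_zip": ["1001", "ABCDE"]})

schema.validate(good)          # passes: every value matches the 5-digit pattern
try:
    schema.validate(bad, lazy=True)
except pa.errors.SchemaErrors as err:
    print(err.failure_cases)   # lists the rows that violate the pattern
```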