Mustehson committed on
Commit
9ecf6e0
·
verified ·
1 Parent(s): 4e0396a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -87
app.py CHANGED
@@ -5,19 +5,13 @@ import gradio as gr
5
  import pandas as pd
6
  import pandera as pa
7
  from pandera import Column
8
- import random
9
- from dataprep.eda import compute
10
  from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
11
- from .utils import (
12
- format_num_stats, format_cat_stats,
13
- format_ov_stats, format_insights
14
- )
15
  from langsmith import traceable
16
  from langchain import hub
17
  import warnings
18
  warnings.filterwarnings("ignore", category=DeprecationWarning)
19
 
20
-
21
  # Height of the Tabs Text Area
22
  TAB_LINES = 8
23
 
@@ -43,7 +37,7 @@ for model in models:
43
  print(f"Error for model {model}: {e}")
44
  continue
45
 
46
- llm = ChatHuggingFace(llm=endpoint).bind(max_tokens=4096)
47
  #---------------------------------------
48
 
49
  #-----LOAD PROMPT FROM LANGCHAIN HUB-----
@@ -69,98 +63,44 @@ def get_tables_names(schema_name):
69
  def update_table_names(schema_name):
70
  tables = get_tables_names(schema_name)
71
  return gr.update(choices=tables)
72
-
73
- # Get Schema
74
- def get_table_schema(table):
75
- result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
76
- ddl_create = result.iloc[0,0]
77
- parent_database = result.iloc[0,1]
78
- schema_name = result.iloc[0,2]
79
- full_path = f"{parent_database}.{schema_name}.{table}"
80
- if schema_name != "main":
81
- old_path = f"{schema_name}.{table}"
82
- else:
83
- old_path = table
84
- ddl_create = ddl_create.replace(old_path, full_path)
85
- return full_path
86
-
87
  def get_data_df(schema):
88
  print('Getting Dataframe from the Database')
89
  return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
90
 
91
- <<<<<<< HEAD
92
- def calcualte_stats(df):
93
- indev_stats = []
94
- cols = []
95
-
96
- _df = df.copy()
97
-
98
- num_cols = _df.select_dtypes(include=['number'], exclude=['datetime']).columns
99
- cat_cols = _df.select_dtypes(include=['object'], exclude=['datetime']).columns
100
-
101
-
102
- _all_stats = compute(_df)
103
- all_stats = format_ov_stats(_all_stats['stats'])
104
- insights = format_insights(_all_stats['overview_insights'])
105
-
106
- for i, col in enumerate(random.sample(num_cols.tolist()+cat_cols.tolist(), 2)):
107
- _indv_data = compute(_df, col)
108
-
109
- if col in cat_cols:
110
- indev_data_cat = format_cat_stats(_indv_data["data"])
111
-
112
- indev_stats.append(pd.DataFrame([indev_data_cat['Overview']], index=[f'{col}_stats']).T)
113
-
114
- elif col in num_cols:
115
- try:
116
- indev_data_num = format_num_stats(_indv_data["data"])
117
- except:
118
- indev_data_num = format_cat_stats(_indv_data["data"])
119
-
120
- indev_stats.append(pd.DataFrame([indev_data_num['Overview']], index=[f'{col}_stats']).T)
121
-
122
- return {
123
- "overall_stats": pd.DataFrame(all_stats[0], index=['Dataset Statistics']).T,
124
- "insights": insights,
125
- "stats_1": indev_stats[0],
126
- "stats_2": indev_stats[1]
127
- }
128
-
129
  def df_summary(df):
130
  summary = []
131
 
132
  for column in df.columns:
133
  if pd.api.types.is_numeric_dtype(df[column]):
134
  summary.append({
135
- "column": column, "max": df[column].max(), "min": df[column].min(),
136
- "count": df[column].count(), "nunique": df[column].nunique(),
137
- "dtype": str(df[column].dtype), "top": None
 
 
 
 
138
  })
139
 
140
  elif pd.api.types.is_categorical_dtype(df[column]) or pd.api.types.is_object_dtype(df[column]):
141
  top_value = df[column].mode().iloc[0] if not df[column].mode().empty else None
142
 
143
  summary.append({
144
- "column": column, "max": None, "min": None, "count": df[column].count(),
145
- "nunique": df[column].nunique(), "dtype": str(df[column].dtype), "top": top_value
 
 
 
 
 
146
  })
147
-
148
  summary_df = pd.DataFrame(summary)
149
  return summary_df.reset_index(drop=True)
150
- =======
151
- >>>>>>> parent of 7c2e7ac (Summary Added)
152
 
153
  def format_prompt(df):
154
- summary_df = pd.DataFrame({
155
- "max": df.max(),
156
- "min": df.min(),
157
- "top": df.mode().iloc[0],
158
- "nunique": df.nunique(),
159
- "count": df.count(),
160
- "dtype": df.dtypes.astype(str)
161
- }).reset_index().rename(columns={"index": "column"})
162
  return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'),
163
- summary=summary_df.to_json(orient='records'))
164
  def format_user_prompt(df):
165
  return prompt_user_input.format_prompt(data=df.head().to_json(orient='records'))
166
 
@@ -177,6 +117,33 @@ def run_llm(messages):
177
  return tests
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def validate_pandera(tests, df):
181
  validation_results = []
182
 
@@ -196,6 +163,41 @@ def validate_pandera(tests, df):
196
  })
197
  return pd.DataFrame(validation_results)
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  #---------------------------------------
200
 
201
 
@@ -204,26 +206,22 @@ def validate_pandera(tests, df):
204
  def main(table):
205
  schema = get_table_schema(table)
206
  df = get_data_df(schema)
207
-
 
 
208
  messages = format_prompt(df=df)
209
  tests = run_llm(messages)
210
  print(tests)
211
 
212
- stats = calcualte_stats(df)
213
- df_insights = stats['insights']
214
- df_statistics = stats['overall_stats']
215
- df_stat_1 = stats['stats_1']
216
- df_stat_2 = stats['stats_2']
217
-
218
  if isinstance(tests, Exception):
219
  tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
220
- return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests, pd.DataFrame([])
221
 
222
  tests_df = pd.DataFrame(tests)
223
  tests_df.rename(columns={tests_df.columns[0]: 'Column', tests_df.columns[1]: 'Rule Name', tests_df.columns[2]: 'Rules' }, inplace=True)
224
  pandera_results = validate_pandera(tests, df)
225
 
226
- return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests_df, pandera_results
227
 
228
  def user_results(table, text_query):
229
 
@@ -328,3 +326,4 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
328
  if __name__ == "__main__":
329
  demo.launch(debug=True)
330
 
 
 
5
  import pandas as pd
6
  import pandera as pa
7
  from pandera import Column
8
+ import ydata_profiling as pp
 
9
  from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
 
 
 
 
10
  from langsmith import traceable
11
  from langchain import hub
12
  import warnings
13
  warnings.filterwarnings("ignore", category=DeprecationWarning)
14
 
 
15
  # Height of the Tabs Text Area
16
  TAB_LINES = 8
17
 
 
37
  print(f"Error for model {model}: {e}")
38
  continue
39
 
40
+ llm = ChatHuggingFace(llm=endpoint).bind(max_tokens=8192)
41
  #---------------------------------------
42
 
43
  #-----LOAD PROMPT FROM LANGCHAIN HUB-----
 
63
def update_table_names(schema_name):
    """Return a Gradio update whose choices are the tables of ``schema_name``.

    The table list comes from ``get_tables_names(schema_name)``.
    """
    return gr.update(choices=get_tables_names(schema_name))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
def get_data_df(schema):
    """Load at most 1000 rows of ``schema`` into a pandas DataFrame.

    ``schema`` is interpolated directly into the ``FROM`` clause, so it must
    already be a valid table reference.
    """
    print('Getting Dataframe from the Database')
    query = f"SELECT * FROM {schema} LIMIT 1000"
    relation = conn.sql(query)
    return relation.df()
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
def df_summary(df):
    """Build a one-row-per-column summary of ``df``.

    Numeric columns report ``max`` and ``min``. Every other column (object,
    categorical, string, datetime, ...) reports its most frequent value as
    ``top``, so no column is silently left out of the summary.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame with the columns ``column``, ``max``, ``min``, ``count``,
        ``nunique``, ``dtype`` and ``top``. The columns are present even when
        ``df`` has no columns at all.
    """
    fields = ["column", "max", "min", "count", "nunique", "dtype", "top"]
    summary = []

    for column in df.columns:
        series = df[column]
        row = {
            "column": column,
            "max": None,
            "min": None,
            "count": series.count(),
            "nunique": series.nunique(),
            "dtype": str(series.dtype),
            "top": None,
        }

        if pd.api.types.is_numeric_dtype(series):
            row["max"] = series.max()
            row["min"] = series.min()
        else:
            # mode() is empty when the column holds only missing values.
            modes = series.mode()
            if not modes.empty:
                row["top"] = modes.iloc[0]

        summary.append(row)

    # Passing the field list keeps the schema stable for an empty input.
    return pd.DataFrame(summary, columns=fields)
 
 
99
 
100
def format_prompt(df):
    """Fill the auto-generation prompt with a data sample and column summary.

    Both values are passed as JSON records: ``data`` is ``df.head()`` and
    ``summary`` is the output of ``df_summary(df)``.
    """
    summary_json = df_summary(df).to_json(orient='records')
    sample_json = df.head().to_json(orient='records')
    return prompt_autogenerate.format_prompt(data=sample_json, summary=summary_json)
104
def format_user_prompt(df):
    """Fill the user-input prompt with ``df.head()`` serialised as JSON records."""
    sample_json = df.head().to_json(orient='records')
    return prompt_user_input.format_prompt(data=sample_json)
106
 
 
117
  return tests
118
 
119
 
120
# Get Schema
def get_table_schema(table):
    """Resolve ``table`` to its fully qualified ``database.schema.table`` path.

    The database and schema are looked up in ``duckdb_tables()``. If several
    entries share the name, the first one returned is used.

    Args:
        table: Unqualified table name.

    Returns:
        The path ``"<database_name>.<schema_name>.<table>"``.

    Raises:
        ValueError: If ``duckdb_tables()`` lists no table named ``table``.
    """
    # Bind the name as a query parameter rather than splicing it into the SQL
    # text, so a quote inside the name cannot change the statement.
    result = conn.execute(
        "SELECT database_name, schema_name FROM duckdb_tables() WHERE table_name = ?;",
        [table],
    ).df()

    if result.empty:
        raise ValueError(f"Table '{table}' was not found in duckdb_tables().")

    parent_database = result.iloc[0, 0]
    schema_name = result.iloc[0, 1]
    return f"{parent_database}.{schema_name}.{table}"
133
+
134
def describe(df):
    """Return ``DataFrame.describe()`` tables for numeric and categorical columns.

    Each table is transposed so that there is one row per source column, with
    the source column name stored in a ``column`` field. Columns of dtype
    ``category`` are described alongside ``object`` columns.

    Args:
        df: DataFrame to describe.

    Returns:
        tuple: ``(numerical_info, categorical_info)``. Either element is an
        empty DataFrame when ``df`` has no column of that kind.
    """
    numerical_info = pd.DataFrame()
    categorical_info = pd.DataFrame()

    numeric_part = df.select_dtypes(include=['number'])
    if len(numeric_part.columns) >= 1:
        numerical_info = numeric_part.describe().T.reset_index()
        numerical_info.rename(columns={'index': 'column'}, inplace=True)

    categorical_part = df.select_dtypes(include=['object', 'category'])
    if len(categorical_part.columns) >= 1:
        categorical_info = categorical_part.describe().T.reset_index()
        categorical_info.rename(columns={'index': 'column'}, inplace=True)

    return numerical_info, categorical_info
146
+
147
  def validate_pandera(tests, df):
148
  validation_results = []
149
 
 
163
  })
164
  return pd.DataFrame(validation_results)
165
 
166
def statistics(df):
    """Profile ``df`` and tabulate the dataset overview and its alerts.

    Args:
        df: DataFrame to profile.

    Returns:
        tuple: ``(df_statistics, df_alerts)``.
        ``df_statistics`` has the columns ``Statistic Description`` and
        ``Value``, one row per entry of the profile's table section plus one
        row per detected column type.
        ``df_alerts`` has the columns ``Data Quality Issue`` and ``Category``,
        one row per alert raised by the profile.
    """
    profile = pp.ProfileReport(df)
    report_dict = profile.get_description()
    description, alerts = report_dict.table, report_dict.alerts

    # Statistics
    mapping = {
        'n': 'Number of observations',
        'n_var': 'Number of variables',
        'n_cells_missing': 'Number of cells missing',
        'n_vars_with_missing': 'Number of columns with missing data',
        'n_vars_all_missing': 'Columns with all missing data',
        'p_cells_missing': 'Missing cells (%)',
        'n_duplicates': 'Duplicated rows',
        'p_duplicates': 'Duplicated rows (%)',
    }
    type_labels = {
        'Text': 'Number of text columns',
        'Categorical': 'Number of categorical columns',
        'Numeric': 'Number of numeric columns',
        'DateTime': 'Number of datetime columns',
    }

    updated_data = {}
    for key, value in description.items():
        if key == 'types':
            continue
        if key.startswith('p_'):
            # NOTE(review): the p_* entries are presumably 0-1 fractions in
            # ydata-profiling - confirm. Casting them to int, as was done
            # before, showed 0 for every value below 1, so they are scaled to
            # percentages to match their "(%)" labels.
            value = round(float(value) * 100, 2)
        else:
            value = int(value)
        updated_data[mapping.get(key, key)] = value

    # Add flattened types information
    type_counts = description.get('types', {})
    for type_name, label in type_labels.items():
        if type_name in type_counts:
            updated_data[label] = int(type_counts[type_name])

    # dtype=object keeps counts as ints next to the float percentages.
    df_statistics = pd.DataFrame({
        'Statistic Description': list(updated_data),
        'Value': pd.Series(list(updated_data.values()), dtype=object),
    })

    # Alerts
    alerts_list = [
        (str(alert).replace('[', '').replace(']', ''), alert.alert_type_name)
        for alert in alerts
    ]
    df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])

    return df_statistics, df_alerts
201
  #---------------------------------------
202
 
203
 
 
206
def main(table):
    """Run the full pipeline for ``table`` and return the values to display.

    Loads the table, computes profile statistics, alerts and describe tables,
    asks the LLM for tests and validates them with pandera.

    Returns:
        A 7-tuple: the first 10 rows, the statistics table, the alerts table,
        the categorical describe table, the numeric describe table, the tests
        table (or a one-row error table) and the pandera results (empty when
        test generation failed).
    """
    df = get_data_df(get_table_schema(table))
    df_statistics, df_alerts = statistics(df)
    describe_num, describe_cat = describe(df)

    tests = run_llm(format_prompt(df=df))
    print(tests)

    # run_llm hands back the exception object itself when generation fails.
    if isinstance(tests, Exception):
        error_df = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
        return (df.head(10), df_statistics, df_alerts, describe_cat, describe_num,
                error_df, pd.DataFrame([]))

    tests_df = pd.DataFrame(tests)
    first, second, third = tests_df.columns[0], tests_df.columns[1], tests_df.columns[2]
    tests_df.rename(columns={first: 'Column', second: 'Rule Name', third: 'Rules'},
                    inplace=True)
    pandera_results = validate_pandera(tests, df)

    return (df.head(10), df_statistics, df_alerts, describe_cat, describe_num,
            tests_df, pandera_results)
225
 
226
  def user_results(table, text_query):
227
 
 
326
  if __name__ == "__main__":
327
  demo.launch(debug=True)
328
 
329
+