Mustehson committed · Commit 5f9d608
1 Parent(s): ded67a4

DLT Pipeline

Files changed:
- app.py +53 -5
- requirements.txt +2 -1
app.py
CHANGED
@@ -10,6 +10,7 @@ from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
 from langsmith import traceable
 from langchain import hub
 import warnings
+import dlt
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
 # Height of the Tabs Text Area
@@ -63,9 +64,44 @@ def get_tables_names(schema_name):
 def update_table_names(schema_name):
     tables = get_tables_names(schema_name)
     return gr.update(choices=tables)
-def get_data_df(schema):
-    print('Getting Dataframe from the Database')
-    return conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+# def get_data_df(schema):
+#     print('Getting Dataframe from the Database')
+#     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+
+@dlt.resource
+def fetch_data(schema):
+    result = conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+
+    while True:
+        chunk_df = result.fetch_df_chunk(2)
+
+        if chunk_df is None or len(chunk_df) == 0:
+            break
+        else:
+            yield chunk_df
+
+def create_pipeline(schema):
+    dataset_name = schema.split('.')[1]
+    print("Dataset Name: ", dataset_name)
+
+    table_name = schema.split('.')[2]
+    print("Table Name: ", table_name)
+
+    pipeline = dlt.pipeline(
+        pipeline_name='duckdb_pipeline',
+        destination='duckdb',
+        dataset_name=dataset_name,
+    )
+
+    load_info = pipeline.run(fetch_data(schema), table_name=table_name,
+                             write_disposition="replace")
+
+    print(load_info)
+    return dataset_name + "." + table_name
+
+def load_pipeline(table_name):
+    _conn = duckdb.connect("duckdb_pipeline.duckdb")
+    return _conn.sql(f"SELECT * FROM {table_name} LIMIT 1000").df()
 
 def df_summary(df):
     summary = []
@@ -205,7 +241,14 @@ def statistics(df):
 # Main Function
 def main(table):
     schema = get_table_schema(table)
-    df = get_data_df(schema)
+
+    # Create dlt pipeline
+    table_name = create_pipeline(schema)
+
+    # Load dlt pipeline
+    df = load_pipeline(table_name)
+
+    # df = get_data_df(schema)
     df_statistics, df_alerts = statistics(df)
     describe_num, describe_cat = describe(df)
 
@@ -226,7 +269,12 @@ def main(table):
 def user_results(table, text_query):
 
     schema = get_table_schema(table)
-    df = get_data_df(schema)
+
+    # Create dlt pipeline
+    table_name = create_pipeline(schema)
+
+    # Load dlt pipeline
+    df = load_pipeline(table_name)
 
     messages = format_user_prompt(df=df, user_description=text_query)
     tests = run_llm(messages)
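Taken together, the new code replaces the direct get_data_df() query with a three-step flow: fetch_data() streams the source query in DataFrame chunks as a dlt resource, create_pipeline() loads those chunks into a local DuckDB file via dlt, and load_pipeline() reads the loaded table back for profiling. The sketch below walks that same flow end to end under stated assumptions: the demo table, its columns, and the dataset name demo_dataset are made up for illustration, while the pipeline name duckdb_pipeline, the chunked fetch_df_chunk loop, and the duckdb_pipeline.duckdb file name mirror the diff above.

# Minimal, self-contained sketch of the pattern this commit introduces (toy data).
import dlt
import duckdb

# Source connection with a small demo table standing in for the app's `conn`.
conn = duckdb.connect()
conn.execute("CREATE TABLE demo AS SELECT range AS id, range * 2 AS value FROM range(10)")

@dlt.resource
def fetch_data(query):
    # Stream the query result in pandas chunks instead of one big DataFrame.
    result = conn.execute(query)
    while True:
        chunk_df = result.fetch_df_chunk(2)   # 2 DuckDB vectors per pandas chunk
        if chunk_df is None or len(chunk_df) == 0:
            break
        yield chunk_df

# Same pipeline and destination names as the diff; dlt's duckdb destination
# defaults to a local <pipeline_name>.duckdb file in the working directory.
pipeline = dlt.pipeline(
    pipeline_name="duckdb_pipeline",
    destination="duckdb",
    dataset_name="demo_dataset",
)
load_info = pipeline.run(fetch_data("SELECT * FROM demo LIMIT 1000"),
                         table_name="demo", write_disposition="replace")
print(load_info)

# Read the loaded table back, as load_pipeline() does in app.py.
_conn = duckdb.connect("duckdb_pipeline.duckdb")
print(_conn.sql("SELECT * FROM demo_dataset.demo LIMIT 5").df())

Yielding chunk-sized DataFrames keeps memory bounded on large source tables, and write_disposition="replace" rebuilds the destination table on every run, so repeated profiling requests do not accumulate duplicate rows.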
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ pandera==0.20.4
 ydata-profiling==v4.11.0
 langchain-core==0.3.12
 langchain-huggingface
-langchain==0.3.4
+langchain==0.3.4
+dlt==1.3.0
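The functional addition here is the dlt==1.3.0 pin backing the new import in app.py (langchain==0.3.4 is removed and re-added unchanged, which typically just reflects a trailing-newline fix). A quick sanity check after pip install -r requirements.txt, assuming the standard dlt package layout:

# Sketch: confirm the pinned dlt release and the duckdb destination used by app.py.
import dlt
from dlt.destinations import duckdb as duckdb_destination

print(dlt.__version__)        # expected: 1.3.0 per requirements.txt
print(duckdb_destination)     # destination factory selected by destination='duckdb'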