Spaces:

claudion-ai
/

Frappe

Sleeping

File size: 2,417 Bytes

6c6d2f7
f38ba4d
6c6d2f7
 
887c95b
 
f1efe67
f38ba4d
514fc02
f38ba4d
 
887c95b
514fc02
d256c7e
 
 
 
6c6d2f7
f38ba4d
514fc02
 
4b8f9d6
f38ba4d
 
 
514fc02
 
 
 
f38ba4d
 
 
 
 
 
 
6c6d2f7
 
f38ba4d
 
 
 
 
 
 
 
 
 
7cbc7f5
6c6d2f7
 
f38ba4d
 
 
887c95b
 
6c6d2f7
 
 
db1852e

import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

# Fetch the full training split of Spider (this is the whole split, not a sample)
spider_dataset = load_dataset("spider", split='train')

# Gather every database id, table name and column name seen across the split.
# NOTE(review): the loop assumes each example carries a nested 'db' record with
# 'table_names_original' and 'column_names_original' entries; confirm against
# the dataset card, since a missing key raises KeyError at import time.
db_ids = set()
table_names = set()
column_names = set()
for example in spider_dataset:
    db_ids.add(example['db_id'])
    schema = example['db']
    table_names.update(schema['table_names_original'])
    # Each column entry is indexed at position 1 to obtain the column name
    column_names.update(entry[1] for entry in schema['column_names_original'])

# Instantiate the tokenizer and the seq2seq model from the same checkpoint
MODEL_CHECKPOINT = "mrm8488/t5-base-finetuned-wikiSQL"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

def post_process_sql_query(sql_query, *, known_db_ids=None, known_tables=None,
                           known_columns=None):
    """Fill the schema placeholders of a generated SQL string.

    Three literal placeholders are recognised:

    * ``DB_ID``  - every occurrence is replaced by one database id.
    * ``TABLE``  - every occurrence is replaced by one table name.
    * ``COLUMN`` - successive occurrences are replaced by successive column
      names; occurrences beyond the number of available columns are left as-is.

    Candidates are taken in sorted order so the same input always produces the
    same output (iteration order of a ``set`` of strings differs between
    interpreter runs).

    This is a heuristic placeholder substitution and may need adapting to the
    dataset and to what the model actually emits.

    Args:
        sql_query: SQL text produced by the model.
        known_db_ids: Optional iterable of database ids; defaults to the
            module-level ``db_ids``.
        known_tables: Optional iterable of table names; defaults to the
            module-level ``table_names``.
        known_columns: Optional iterable of column names; defaults to the
            module-level ``column_names``.

    Returns:
        The SQL string with placeholders substituted where candidates exist.
    """
    # Use the module-level schema collections unless the caller overrides them.
    # ``is None`` (not truthiness) so an explicitly empty collection is honoured.
    if known_db_ids is None:
        known_db_ids = db_ids
    if known_tables is None:
        known_tables = table_names
    if known_columns is None:
        known_columns = column_names

    # Only one database / table is assumed per query, so pick a single,
    # reproducible candidate and only when the placeholder is actually present.
    if "DB_ID" in sql_query:
        chosen_db = min(known_db_ids, default=None)
        if chosen_db is not None:
            sql_query = sql_query.replace("DB_ID", chosen_db)

    if "TABLE" in sql_query:
        chosen_table = min(known_tables, default=None)
        if chosen_table is not None:
            sql_query = sql_query.replace("TABLE", chosen_table)

    if "COLUMN" in sql_query:
        # Split once and stitch the pieces back together so that text already
        # inserted (e.g. a column literally named "COLUMN_X") is never rescanned.
        fillers = iter(sorted(known_columns))
        head, *tails = sql_query.split("COLUMN")
        sql_query = head + "".join(
            next(fillers, "COLUMN") + tail for tail in tails
        )

    return sql_query

def generate_sql_from_user_input(query):
    """Translate a natural-language question into a SQL string.

    The question is prefixed with the translation instruction, tokenized,
    passed through ``model.generate`` (capped at 512 tokens), and the first
    generated sequence is decoded with special tokens stripped. The decoded
    text is then run through ``post_process_sql_query``.

    Args:
        query: The user's natural-language question.

    Returns:
        The post-processed SQL string.
    """
    task_prefix = "translate English to SQL: "
    prompt = task_prefix + query

    # Encode as PyTorch tensors and let the model produce the SQL tokens
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    generated = model.generate(**encoded, max_length=512)

    # Only the first returned sequence is used
    raw_sql = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Adapt the raw output to the dataset's schema before returning it
    return post_process_sql_query(raw_sql)

# Build the Gradio UI: one text input for the question, one text output for the SQL
query_box = gr.Textbox(label="Enter your natural language query")
sql_box = gr.Textbox(label="Generated SQL Query")

interface = gr.Interface(
    title="NL to SQL with T5 using Spider Dataset",
    description="This model generates an SQL query for your natural language input based on the Spider dataset.",
    fn=generate_sql_from_user_input,
    inputs=query_box,
    outputs=sql_box,
)

# Start the web app only when this file is executed as a script
if __name__ == "__main__":
    interface.launch()