from groq import Groq
from pydantic import BaseModel
import json
import gradio as gr
import pandas as pd


class ValidationStatus(BaseModel):
    """Validation outcome for a generated SQL query."""

    # Indicates whether the generated SQL query is syntactically valid
    is_valid: bool
    # A list of SQL syntax error messages (empty if no errors are found)
    syntax_errors: list[str]


class SQLQueryGeneration(BaseModel):
    """Structured output schema the LLM must populate for each request.

    The model is forced into this shape via the `response_format`
    json_schema option of the Groq chat-completions API.
    """

    # The generated SQL statement, e.g.
    # SELECT product_id, name, price FROM products WHERE price < 50 ORDER BY price ASC
    query: str
    # The type of SQL query (e.g., SELECT, INSERT, UPDATE, DELETE)
    query_type: str
    # A list of table names referenced in the SQL query
    tables_used: list[str]
    # Estimated complexity of the query (e.g., LOW, MEDIUM, HIGH)
    estimated_complexity: str
    # Notes describing how the query executes or any assumptions made
    execution_notes: list[str]
    # Validation results for the generated SQL query
    validation_status: ValidationStatus
    # SQL CREATE TABLE statement describing the table schema
    table_schema: str
    # Sample data used to populate the table (INSERT statements or table view)
    sample_data: str
    # Results of executing the SQL query, formatted as a pipe-delimited table
    execution_results: str
    # Suggestions for optimizing the SQL query (indexes, joins, filters, etc.)
    optimization_notes: list[str]


def parse_execution_results_to_dataframe(execution_results):
    """Convert a pipe-delimited text table into a pandas DataFrame.

    Expected layout: a header row, a dash separator row, then data rows,
    with cells separated by '|'. Rows wrapped in leading/trailing pipes
    ('| a | b |') are tolerated as well.

    Args:
        execution_results: The raw text table produced by the LLM.

    Returns:
        A pandas DataFrame of string cells, or None if the text cannot
        be parsed as a table.
    """
    try:
        lines = execution_results.strip().split('\n')
        # A valid table needs at least header, separator, and one data row
        if len(lines) < 3:
            return None

        def split_row(line):
            # Drop optional leading/trailing pipes so '| a | b |' and
            # 'a | b' both yield the same cell list
            return [cell.strip() for cell in line.strip().strip('|').split('|')]

        # First line holds the column headers
        headers = split_row(lines[0])

        data_rows = []
        # Skip the mandatory separator at index 1; scan remaining lines
        for line in lines[2:]:
            stripped = line.strip()
            # Ignore blank lines and separator-only lines (any mix of
            # dashes, pipes, and spaces). NOTE: checking the whole row
            # instead of startswith('-') avoids dropping data rows whose
            # first cell is a negative number.
            if not stripped or not stripped.strip('-| '):
                continue
            row = split_row(line)
            # Only keep rows whose cell count matches the header
            if len(row) == len(headers):
                data_rows.append(row)

        if data_rows:
            # Build the DataFrame with header names as columns
            return pd.DataFrame(data_rows, columns=headers)
        # No usable data rows were found
        return None
    except Exception as e:
        # Parsing is best-effort: report and fall back to no table
        print(f"Error parsing results: {e}")
        return None


def generate_sql_query(api_key, user_query):
    """Generate a SQL query from natural language using the GROQ API.

    Args:
        api_key: GROQ API key supplied by the user (not stored).
        user_query: Natural-language description of the desired query.

    Returns:
        A 6-tuple matching the Gradio outputs:
        (sql_query, metadata_text, table_schema, sample_data,
         results_dataframe_or_None, validation_text).
        On error, the first element carries the error message and the
        remaining slots hold empty placeholders.
    """
    try:
        # ---- Input validation -------------------------------------------
        if not api_key:
            return "Error: Please enter your GROQ API key", "", "", "", None, ""
        if not user_query:
            return "Error: Please enter a query description", "", "", "", None, ""

        # ---- GROQ client ------------------------------------------------
        client = Groq(api_key=api_key)

        # ---- Chat completion with enforced JSON schema ------------------
        response = client.chat.completions.create(
            model="moonshotai/kimi-k2-instruct-0905",
            messages=[
                {
                    # System prompt defines the assistant's role and output format
                    "role": "system",
                    "content": """You are a SQL expert. Generate structured SQL queries from natural language descriptions with proper syntax validation and metadata.

After generating the SQL query, you must:
1. Create a sample SQL table schema based on the natural language description, including all necessary columns with appropriate data types
2. Populate the table with realistic sample data that demonstrates the query's functionality
3. Execute the generated SQL query against the sample table
4. Display the SQL table structure and data clearly
5. Show the query execution results in a pipe-delimited table format

IMPORTANT: The execution_results field must contain a properly formatted table with:
- Header row with column names separated by pipes (|)
- A separator row with dashes
- Data rows with values separated by pipes (|)

Example format:
column1 | column2 | column3
--------|---------|--------
value1 | value2 | value3
value4 | value5 | value6

Always present your response in this order:
- Generated SQL query with syntax explanation
- Table schema (CREATE TABLE statement)
- Sample data (INSERT statements or table visualization)
- Query execution results (in pipe-delimited table format)
- Any relevant notes about assumptions made or query optimization suggestions""",
                },
                {
                    # User-provided natural-language query
                    "role": "user",
                    "content": user_query,
                },
            ],
            # Enforce structured JSON output using the Pydantic-derived schema
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "sql_query_generation",
                    "schema": SQLQueryGeneration.model_json_schema(),
                },
            },
        )

        # ---- Parse and validate model output ----------------------------
        sql_query_generation = SQLQueryGeneration.model_validate(
            json.loads(response.choices[0].message.content)
        )

        # ---- Format validation results ----------------------------------
        validation_text = f"Valid: {sql_query_generation.validation_status.is_valid}\n"
        if sql_query_generation.validation_status.syntax_errors:
            validation_text += "Errors:\n" + "\n".join(
                f"- {error}"
                for error in sql_query_generation.validation_status.syntax_errors
            )
        else:
            validation_text += "No syntax errors found"

        # ---- Metadata summary -------------------------------------------
        metadata = f"""Query Type: {sql_query_generation.query_type}
Tables Used: {', '.join(sql_query_generation.tables_used)}
Complexity: {sql_query_generation.estimated_complexity}

Execution Notes:
{chr(10).join(f"- {note}" for note in sql_query_generation.execution_notes)}

Optimization Notes:
{chr(10).join(f"- {note}" for note in sql_query_generation.optimization_notes)}"""

        # ---- Execution results as a DataFrame ---------------------------
        results_df = parse_execution_results_to_dataframe(
            sql_query_generation.execution_results
        )

        # ---- Return all outputs in Gradio wiring order ------------------
        return (
            sql_query_generation.query,
            metadata,
            sql_query_generation.table_schema,
            sql_query_generation.sample_data,
            results_df,
            validation_text,
        )

    except Exception as e:
        # Surface unexpected errors in the first output slot
        error_msg = f"Error: {str(e)}"
        return error_msg, "", "", "", None, ""


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="SQL Query Generator", theme=gr.themes.Ocean()) as demo:
    gr.Markdown(
        """
        # 🗄️ Natural Language to SQL Query Generator
        Convert your natural language descriptions into structured SQL queries with validation and execution results.
        """
    )

    with gr.Row():
        with gr.Column():
            api_key_input = gr.Textbox(
                label="GROQ API Key",
                type="password",
                placeholder="Enter your GROQ API key here...",
                info="Your API key is not stored and only used for this session",
            )
            query_input = gr.Textbox(
                label="Natural Language Query",
                placeholder="e.g., Find all the students who scored more than 90 out of 100",
                lines=3,
                value="Find all the students who scored more than 90 out of 100",
            )
            generate_btn = gr.Button("Generate SQL Query", variant="primary", size="lg")

            gr.Examples(
                examples=[
                    ["Find all the students who scored more than 90 out of 100"],
                    ["Get the top 5 customers by total purchase amount"],
                    ["List all employees hired in the last 6 months"],
                    ["Find products with price between $50 and $100"],
                    ["Show average salary by department"],
                ],
                inputs=query_input,
                label="Example Queries",
            )

    with gr.Row():
        with gr.Column():
            sql_output = gr.Code(
                label="Generated SQL Query",
                language="sql",
                lines=5,
            )
            metadata_output = gr.Textbox(
                label="Query Metadata",
                lines=8,
            )
            validation_output = gr.Textbox(
                label="Validation Status",
                lines=3,
            )

    with gr.Row():
        with gr.Column():
            schema_output = gr.Code(
                label="Table Schema",
                language="sql",
                lines=8,
            )
        with gr.Column():
            sample_data_output = gr.Code(
                label="Sample Data",
                language="sql",
                lines=8,
            )

    with gr.Row():
        execution_output = gr.Dataframe(
            label="📊 Execution Results",
            headers=None,
            datatype="str",
            row_count=10,
            col_count=None,
            wrap=True,
            interactive=False,
        )

    # Wire the button to the generator; output order must match the
    # 6-tuple returned by generate_sql_query
    generate_btn.click(
        fn=generate_sql_query,
        inputs=[api_key_input, query_input],
        outputs=[
            sql_output,
            metadata_output,
            schema_output,
            sample_data_output,
            execution_output,
            validation_output,
        ],
    )

if __name__ == "__main__":
    demo.launch(share=True)