#!/usr/bin/env python3 """ ai_csv_editor_hf.py ── AI-powered CSV editor using a Hugging Face model on CPU. This version patches Gradio’s JSON‐schema introspector to skip over boolean schemas and avoid the "const in schema" TypeError. """ # ───────────────────────────────────────────────────────────────────────────── # 0. MONKEY-PATCH for gradio_client.utils.get_type to handle bool schemas # ───────────────────────────────────────────────────────────────────────────── try: import gradio_client.utils as _client_utils _old_get_type = _client_utils.get_type def _patched_get_type(schema): # If schema is unexpectedly a bool, just return a generic "Any" if isinstance(schema, bool): return "Any" return _old_get_type(schema) _client_utils.get_type = _patched_get_type except ImportError: # If gradio_client isn't present yet, we'll let it import later pass # ───────────────────────────────────────────────────────────────────────────── # 1. LOAD A SMALL INSTRUCTION-FOLLOWING MODEL (CPU only) # ───────────────────────────────────────────────────────────────────────────── import json import tempfile import textwrap import pathlib from typing import List, Dict, Any import pandas as pd import gradio as gr from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM MODEL_NAME = "google/flan-t5-base" MAX_NEW_TOKS = 256 TEMPERATURE = 0.0 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForSeq2SeqLM.from_pretrained( MODEL_NAME, device_map="cpu", # force CPU placement torch_dtype="auto" ) generator = pipeline( "text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=MAX_NEW_TOKS, temperature=TEMPERATURE, do_sample=False, ) # ───────────────────────────────────────────────────────────────────────────── # 2. 
# PROMPT → JSON “EDIT PLAN”
# ─────────────────────────────────────────────────────────────────────────────
SYSTEM_PROMPT = textwrap.dedent("""\
    You are an assistant that converts natural-language
    spreadsheet commands into JSON edit plans. Respond with ONLY valid JSON
    matching this schema:
    {
      "actions": [
        {
          "operation": "concat | vlookup | xlookup | sumif",
          "target": "string",
          # For CONCAT:
          "columns": ["colA","colB"],
          "separator": " ",
          # For VLOOKUP / XLOOKUP:
          "lookup_value": "KeyInMain",
          "lookup_file": "other.csv",
          "lookup_column": "KeyInOther",
          "return_column": "Value",
          "exact": true,
          # For SUMIF:
          "criteria_column": "Category",
          "criteria": "Foo",
          "sum_column": "Amount"
        }
      ]
    }
""")


def _extract_json(text: str) -> str:
    """Return the outermost ``{...}`` span of *text*.

    Small instruction models frequently wrap their JSON answer in extra
    prose ("Sure! Here is the plan: {...}").  Slicing from the first ``{``
    to the last ``}`` recovers the payload without changing anything for
    responses that are already pure JSON.  Falls back to the stripped text
    when no brace pair is found, so the caller's error message still shows
    the raw output.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text.strip()


def plan_from_command(cmd: str) -> Dict[str, Any]:
    """Translate a natural-language command into a JSON edit plan.

    Parameters
    ----------
    cmd : str
        The user's spreadsheet-style instruction.

    Returns
    -------
    dict
        Parsed plan with an ``"actions"`` list (see ``SYSTEM_PROMPT`` schema).

    Raises
    ------
    ValueError
        If the model output cannot be parsed as JSON (original output is
        included in the message for debugging).
    """
    prompt = f"{SYSTEM_PROMPT}\n\nUser: {cmd}\nJSON:"
    output = generator(
        prompt,
        max_new_tokens=MAX_NEW_TOKS,
        temperature=TEMPERATURE,
        do_sample=False,
    )[0]["generated_text"]
    # Trim any conversational wrapper before parsing; no-op for clean JSON.
    candidate = _extract_json(output)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        raise ValueError(f"Model returned invalid JSON:\n{output}") from e


# ─────────────────────────────────────────────────────────────────────────────
# 3.
# DATA OPERATIONS
# ─────────────────────────────────────────────────────────────────────────────
def apply_action(df: pd.DataFrame,
                 uploads: Dict[str, pd.DataFrame],
                 act: Dict[str, Any]) -> pd.DataFrame:
    """Apply one edit-plan action to *df* and return the resulting frame.

    Parameters
    ----------
    df : pd.DataFrame
        The main table being edited.
    uploads : dict
        Filename → DataFrame for every uploaded CSV (lookup tables included).
    act : dict
        One action object from the LLM edit plan (see SYSTEM_PROMPT schema).

    Raises
    ------
    ValueError
        For an unknown operation or a lookup_file that was not uploaded.
    """
    op = act["operation"]

    if op == "concat":
        # Join the selected columns as strings, row-wise, with the separator.
        sep = act.get("separator", "")
        df[act["target"]] = (
            df[act["columns"]]
            .astype(str)
            .agg(sep.join, axis=1)
        )

    elif op in {"vlookup", "xlookup"}:
        try:
            lookup_df = uploads[act["lookup_file"]]
        except KeyError:
            # Surface a readable message instead of a bare KeyError.
            raise ValueError(
                f"Lookup file not uploaded: {act['lookup_file']!r}"
            ) from None
        right = (
            lookup_df[[act["lookup_column"], act["return_column"]]]
            # VLOOKUP/XLOOKUP return the FIRST match; without this, duplicate
            # keys in the lookup table would multiply rows in the merge.
            .drop_duplicates(subset=act["lookup_column"], keep="first")
            .rename(columns={
                act["lookup_column"]: act["lookup_value"],
                act["return_column"]: act["target"]
            })
        )
        df = df.merge(right, on=act["lookup_value"], how="left")

    elif op == "sumif":
        # Scalar total broadcast into every row of the target column,
        # mirroring a spreadsheet SUMIF placed in a whole column.
        mask = df[act["criteria_column"]] == act["criteria"]
        total = df.loc[mask, act["sum_column"]].sum()
        df[act["target"]] = total

    else:
        raise ValueError(f"Unsupported operation: {op}")

    return df


# ─────────────────────────────────────────────────────────────────────────────
# 4. GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────
def run_editor(files: List[gr.File], command: str):
    """Gradio callback: load CSVs, plan edits via the LLM, apply, export.

    Returns a (preview_dataframe, status_markdown, download_path) triple;
    on failure the first and last elements are None.
    """
    if not files:
        return None, "⚠️ Please upload at least one CSV file.", None

    # First uploaded file is the main table; the rest are lookup tables.
    uploads = {
        pathlib.Path(f.name).name: pd.read_csv(f.name)
        for f in files
    }
    main_name = list(uploads.keys())[0]
    df = uploads[main_name]

    try:
        plan = plan_from_command(command)
    except Exception as e:
        return None, f"❌ LLM error: {e}", None

    try:
        for act in plan["actions"]:
            df = apply_action(df, uploads, act)
    except Exception as e:
        return None, f"❌ Execution error: {e}", None

    # Close the handle before writing: leaving it open leaks the descriptor
    # and blocks the re-open on Windows. delete=False keeps the file for the
    # download component.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        out_path = tmp.name
    df.to_csv(out_path, index=False)
    return df.head(20), "✅ Success! Download below.", out_path


with gr.Blocks(title="AI CSV Editor (HF, CPU)") as demo:
    gr.Markdown(
        "## AI-powered CSV Editor \n"
        "1. Upload one main CSV (first) plus any lookup tables \n"
        "2. Type a spreadsheet-style instruction \n"
        "3. Download the modified CSV"
    )
    csv_files = gr.Files(file_types=[".csv"], label="Upload CSV file(s)")
    cmd_box = gr.Textbox(lines=2, placeholder="e.g. concat First Last → FullName")
    run_btn = gr.Button("Apply")
    preview = gr.Dataframe(label="Preview (first 20 rows)")
    status = gr.Markdown()
    download = gr.File(label="Download Result")

    run_btn.click(
        run_editor,
        [csv_files, cmd_box],
        [preview, status, download]
    )

if __name__ == "__main__":
    demo.launch()