Coding / app.py
wifix199's picture
Update app.py
4b1dcee verified
#!/usr/bin/env python3
"""
ai_csv_editor_hf.py ── AI-powered CSV editor using a Hugging Face model on CPU.
This version patches Gradio’s JSON‐schema introspector to skip over
boolean schemas and avoid the "const in schema" TypeError.
"""
# ─────────────────────────────────────────────────────────────────────────────
# 0. MONKEY-PATCH for gradio_client.utils.get_type to handle bool schemas
# ─────────────────────────────────────────────────────────────────────────────
try:
import gradio_client.utils as _client_utils
_old_get_type = _client_utils.get_type
def _patched_get_type(schema):
# If schema is unexpectedly a bool, just return a generic "Any"
if isinstance(schema, bool):
return "Any"
return _old_get_type(schema)
_client_utils.get_type = _patched_get_type
except ImportError:
# If gradio_client isn't present yet, we'll let it import later
pass
# ─────────────────────────────────────────────────────────────────────────────
# 1. LOAD A SMALL INSTRUCTION-FOLLOWING MODEL (CPU only)
# ─────────────────────────────────────────────────────────────────────────────
import json
import tempfile
import textwrap
import pathlib
from typing import List, Dict, Any
import pandas as pd
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
MODEL_NAME = "google/flan-t5-base"
MAX_NEW_TOKS = 256
TEMPERATURE = 0.0
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
MODEL_NAME,
device_map="cpu", # force CPU placement
torch_dtype="auto"
)
generator = pipeline(
"text2text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=MAX_NEW_TOKS,
temperature=TEMPERATURE,
do_sample=False,
)
# ─────────────────────────────────────────────────────────────────────────────
# 2. PROMPT β†’ JSON β€œEDIT PLAN”
# ─────────────────────────────────────────────────────────────────────────────
SYSTEM_PROMPT = textwrap.dedent("""\
You are an assistant that converts natural-language spreadsheet commands
into JSON edit plans. Respond with ONLY valid JSON matching this schema:
{
"actions": [
{
"operation": "concat | vlookup | xlookup | sumif",
"target": "string",
# For CONCAT:
"columns": ["colA","colB"],
"separator": " ",
# For VLOOKUP / XLOOKUP:
"lookup_value": "KeyInMain",
"lookup_file": "other.csv",
"lookup_column": "KeyInOther",
"return_column": "Value",
"exact": true,
# For SUMIF:
"criteria_column": "Category",
"criteria": "Foo",
"sum_column": "Amount"
}
]
}
""")
def plan_from_command(cmd: str) -> Dict[str, Any]:
prompt = f"{SYSTEM_PROMPT}\n\nUser: {cmd}\nJSON:"
output = generator(
prompt,
max_new_tokens=MAX_NEW_TOKS,
temperature=TEMPERATURE,
do_sample=False,
)[0]["generated_text"]
try:
return json.loads(output)
except json.JSONDecodeError as e:
raise ValueError(f"Model returned invalid JSON:\n{output}") from e
# ─────────────────────────────────────────────────────────────────────────────
# 3. DATA OPERATIONS
# ─────────────────────────────────────────────────────────────────────────────
def apply_action(df: pd.DataFrame,
uploads: Dict[str, pd.DataFrame],
act: Dict[str, Any]) -> pd.DataFrame:
op = act["operation"]
if op == "concat":
sep = act.get("separator", "")
df[act["target"]] = (
df[act["columns"]]
.astype(str)
.agg(sep.join, axis=1)
)
elif op in {"vlookup", "xlookup"}:
lookup_df = uploads[act["lookup_file"]]
right = lookup_df[[act["lookup_column"], act["return_column"]]] \
.rename(columns={
act["lookup_column"]: act["lookup_value"],
act["return_column"]: act["target"]
})
df = df.merge(right, on=act["lookup_value"], how="left")
elif op == "sumif":
mask = df[act["criteria_column"]] == act["criteria"]
total = df.loc[mask, act["sum_column"]].sum()
df[act["target"]] = total
else:
raise ValueError(f"Unsupported operation: {op}")
return df
# ─────────────────────────────────────────────────────────────────────────────
# 4. GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────
def run_editor(files: List[gr.File], command: str):
if not files:
return None, "⚠️ Please upload at least one CSV file.", None
uploads = {
pathlib.Path(f.name).name: pd.read_csv(f.name)
for f in files
}
main_name = list(uploads.keys())[0]
df = uploads[main_name]
try:
plan = plan_from_command(command)
except Exception as e:
return None, f"❌ LLM error: {e}", None
try:
for act in plan["actions"]:
df = apply_action(df, uploads, act)
except Exception as e:
return None, f"❌ Execution error: {e}", None
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
df.to_csv(tmp.name, index=False)
return df.head(20), "βœ… Success! Download below.", tmp.name
with gr.Blocks(title="AI CSV Editor (HF, CPU)") as demo:
gr.Markdown("## AI-powered CSV Editor \n"
"1. Upload one main CSV (first) plus any lookup tables \n"
"2. Type a spreadsheet-style instruction \n"
"3. Download the modified CSV")
csv_files = gr.Files(file_types=[".csv"], label="Upload CSV file(s)")
cmd_box = gr.Textbox(lines=2, placeholder="e.g. concat First Last β†’ FullName")
run_btn = gr.Button("Apply")
preview = gr.Dataframe(label="Preview (first 20 rows)")
status = gr.Markdown()
download = gr.File(label="Download Result")
run_btn.click(
run_editor,
[csv_files, cmd_box],
[preview, status, download]
)
if __name__ == "__main__":
demo.launch()