|
|
|
""" |
|
ai_csv_editor_hf.py ββ AI-powered CSV editor using a Hugging Face model on CPU. |
|
|
|
This version patches Gradioβs JSONβschema introspector to skip over |
|
boolean schemas and avoid the "const in schema" TypeError. |
|
""" |
|
|
|
|
|
|
|
|
|
try: |
|
import gradio_client.utils as _client_utils |
|
_old_get_type = _client_utils.get_type |
|
def _patched_get_type(schema): |
|
|
|
if isinstance(schema, bool): |
|
return "Any" |
|
return _old_get_type(schema) |
|
_client_utils.get_type = _patched_get_type |
|
except ImportError: |
|
|
|
pass |
|
|
|
|
|
|
|
|
|
import json |
|
import tempfile |
|
import textwrap |
|
import pathlib |
|
from typing import List, Dict, Any |
|
|
|
import pandas as pd |
|
import gradio as gr |
|
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
MODEL_NAME = "google/flan-t5-base" |
|
MAX_NEW_TOKS = 256 |
|
TEMPERATURE = 0.0 |
|
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
|
model = AutoModelForSeq2SeqLM.from_pretrained( |
|
MODEL_NAME, |
|
device_map="cpu", |
|
torch_dtype="auto" |
|
) |
|
generator = pipeline( |
|
"text2text-generation", |
|
model=model, |
|
tokenizer=tokenizer, |
|
max_new_tokens=MAX_NEW_TOKS, |
|
temperature=TEMPERATURE, |
|
do_sample=False, |
|
) |
|
|
|
|
|
|
|
|
|
SYSTEM_PROMPT = textwrap.dedent("""\ |
|
You are an assistant that converts natural-language spreadsheet commands |
|
into JSON edit plans. Respond with ONLY valid JSON matching this schema: |
|
|
|
{ |
|
"actions": [ |
|
{ |
|
"operation": "concat | vlookup | xlookup | sumif", |
|
"target": "string", |
|
|
|
# For CONCAT: |
|
"columns": ["colA","colB"], |
|
"separator": " ", |
|
|
|
# For VLOOKUP / XLOOKUP: |
|
"lookup_value": "KeyInMain", |
|
"lookup_file": "other.csv", |
|
"lookup_column": "KeyInOther", |
|
"return_column": "Value", |
|
"exact": true, |
|
|
|
# For SUMIF: |
|
"criteria_column": "Category", |
|
"criteria": "Foo", |
|
"sum_column": "Amount" |
|
} |
|
] |
|
} |
|
""") |
|
|
|
def plan_from_command(cmd: str) -> Dict[str, Any]: |
|
prompt = f"{SYSTEM_PROMPT}\n\nUser: {cmd}\nJSON:" |
|
output = generator( |
|
prompt, |
|
max_new_tokens=MAX_NEW_TOKS, |
|
temperature=TEMPERATURE, |
|
do_sample=False, |
|
)[0]["generated_text"] |
|
try: |
|
return json.loads(output) |
|
except json.JSONDecodeError as e: |
|
raise ValueError(f"Model returned invalid JSON:\n{output}") from e |
|
|
|
|
|
|
|
|
|
def apply_action(df: pd.DataFrame, |
|
uploads: Dict[str, pd.DataFrame], |
|
act: Dict[str, Any]) -> pd.DataFrame: |
|
op = act["operation"] |
|
if op == "concat": |
|
sep = act.get("separator", "") |
|
df[act["target"]] = ( |
|
df[act["columns"]] |
|
.astype(str) |
|
.agg(sep.join, axis=1) |
|
) |
|
elif op in {"vlookup", "xlookup"}: |
|
lookup_df = uploads[act["lookup_file"]] |
|
right = lookup_df[[act["lookup_column"], act["return_column"]]] \ |
|
.rename(columns={ |
|
act["lookup_column"]: act["lookup_value"], |
|
act["return_column"]: act["target"] |
|
}) |
|
df = df.merge(right, on=act["lookup_value"], how="left") |
|
elif op == "sumif": |
|
mask = df[act["criteria_column"]] == act["criteria"] |
|
total = df.loc[mask, act["sum_column"]].sum() |
|
df[act["target"]] = total |
|
else: |
|
raise ValueError(f"Unsupported operation: {op}") |
|
return df |
|
|
|
|
|
|
|
|
|
def run_editor(files: List[gr.File], command: str): |
|
if not files: |
|
return None, "β οΈ Please upload at least one CSV file.", None |
|
|
|
uploads = { |
|
pathlib.Path(f.name).name: pd.read_csv(f.name) |
|
for f in files |
|
} |
|
main_name = list(uploads.keys())[0] |
|
df = uploads[main_name] |
|
|
|
try: |
|
plan = plan_from_command(command) |
|
except Exception as e: |
|
return None, f"β LLM error: {e}", None |
|
|
|
try: |
|
for act in plan["actions"]: |
|
df = apply_action(df, uploads, act) |
|
except Exception as e: |
|
return None, f"β Execution error: {e}", None |
|
|
|
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") |
|
df.to_csv(tmp.name, index=False) |
|
return df.head(20), "β
Success! Download below.", tmp.name |
|
|
|
with gr.Blocks(title="AI CSV Editor (HF, CPU)") as demo: |
|
gr.Markdown("## AI-powered CSV Editor \n" |
|
"1. Upload one main CSV (first) plus any lookup tables \n" |
|
"2. Type a spreadsheet-style instruction \n" |
|
"3. Download the modified CSV") |
|
csv_files = gr.Files(file_types=[".csv"], label="Upload CSV file(s)") |
|
cmd_box = gr.Textbox(lines=2, placeholder="e.g. concat First Last β FullName") |
|
run_btn = gr.Button("Apply") |
|
preview = gr.Dataframe(label="Preview (first 20 rows)") |
|
status = gr.Markdown() |
|
download = gr.File(label="Download Result") |
|
|
|
run_btn.click( |
|
run_editor, |
|
[csv_files, cmd_box], |
|
[preview, status, download] |
|
) |
|
|
|
if __name__ == "__main__": |
|
demo.launch() |
|
|