Spaces:

wifix199
/

Coding

Running

App Files Files Community

Coding / app.py

wifix199

Update app.py

4b1dcee verified 1 day ago

raw

history blame contribute delete

7.58 kB

	#!/usr/bin/env python3
	"""
	ai_csv_editor_hf.py ── AI-powered CSV editor using a Hugging Face model on CPU.

	This version patches Gradio’s JSON‐schema introspector to skip over
	boolean schemas and avoid the "const in schema" TypeError.
	"""

	# ─────────────────────────────────────────────────────────────────────────────
	# 0. MONKEY-PATCH for gradio_client.utils.get_type to handle bool schemas
	# ─────────────────────────────────────────────────────────────────────────────
	try:
	import gradio_client.utils as _client_utils
	_old_get_type = _client_utils.get_type
	def _patched_get_type(schema):
	# If schema is unexpectedly a bool, just return a generic "Any"
	if isinstance(schema, bool):
	return "Any"
	return _old_get_type(schema)
	_client_utils.get_type = _patched_get_type
	except ImportError:
	# If gradio_client isn't present yet, we'll let it import later
	pass

	# ─────────────────────────────────────────────────────────────────────────────
	# 1. LOAD A SMALL INSTRUCTION-FOLLOWING MODEL (CPU only)
	# ─────────────────────────────────────────────────────────────────────────────
	import json
	import tempfile
	import textwrap
	import pathlib
	from typing import List, Dict, Any

	import pandas as pd
	import gradio as gr
	from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

	MODEL_NAME = "google/flan-t5-base"
	MAX_NEW_TOKS = 256
	TEMPERATURE = 0.0

	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = AutoModelForSeq2SeqLM.from_pretrained(
	MODEL_NAME,
	device_map="cpu", # force CPU placement
	torch_dtype="auto"
	)
	generator = pipeline(
	"text2text-generation",
	model=model,
	tokenizer=tokenizer,
	max_new_tokens=MAX_NEW_TOKS,
	temperature=TEMPERATURE,
	do_sample=False,
	)

	# ─────────────────────────────────────────────────────────────────────────────
	# 2. PROMPT → JSON “EDIT PLAN”
	# ─────────────────────────────────────────────────────────────────────────────
	SYSTEM_PROMPT = textwrap.dedent("""\
	You are an assistant that converts natural-language spreadsheet commands
	into JSON edit plans. Respond with ONLY valid JSON matching this schema:

	{
	"actions": [
	{
	"operation": "concat \| vlookup \| xlookup \| sumif",
	"target": "string",

	# For CONCAT:
	"columns": ["colA","colB"],
	"separator": " ",

	# For VLOOKUP / XLOOKUP:
	"lookup_value": "KeyInMain",
	"lookup_file": "other.csv",
	"lookup_column": "KeyInOther",
	"return_column": "Value",
	"exact": true,

	# For SUMIF:
	"criteria_column": "Category",
	"criteria": "Foo",
	"sum_column": "Amount"
	}
	]
	}
	""")

	def plan_from_command(cmd: str) -> Dict[str, Any]:
	prompt = f"{SYSTEM_PROMPT}\n\nUser: {cmd}\nJSON:"
	output = generator(
	prompt,
	max_new_tokens=MAX_NEW_TOKS,
	temperature=TEMPERATURE,
	do_sample=False,
	)[0]["generated_text"]
	try:
	return json.loads(output)
	except json.JSONDecodeError as e:
	raise ValueError(f"Model returned invalid JSON:\n{output}") from e

	# ─────────────────────────────────────────────────────────────────────────────
	# 3. DATA OPERATIONS
	# ─────────────────────────────────────────────────────────────────────────────
	def apply_action(df: pd.DataFrame,
	uploads: Dict[str, pd.DataFrame],
	act: Dict[str, Any]) -> pd.DataFrame:
	op = act["operation"]
	if op == "concat":
	sep = act.get("separator", "")
	df[act["target"]] = (
	df[act["columns"]]
	.astype(str)
	.agg(sep.join, axis=1)
	)
	elif op in {"vlookup", "xlookup"}:
	lookup_df = uploads[act["lookup_file"]]
	right = lookup_df[[act["lookup_column"], act["return_column"]]] \
	.rename(columns={
	act["lookup_column"]: act["lookup_value"],
	act["return_column"]: act["target"]
	})
	df = df.merge(right, on=act["lookup_value"], how="left")
	elif op == "sumif":
	mask = df[act["criteria_column"]] == act["criteria"]
	total = df.loc[mask, act["sum_column"]].sum()
	df[act["target"]] = total
	else:
	raise ValueError(f"Unsupported operation: {op}")
	return df

	# ─────────────────────────────────────────────────────────────────────────────
	# 4. GRADIO UI
	# ─────────────────────────────────────────────────────────────────────────────
	def run_editor(files: List[gr.File], command: str):
	if not files:
	return None, "⚠️ Please upload at least one CSV file.", None

	uploads = {
	pathlib.Path(f.name).name: pd.read_csv(f.name)
	for f in files
	}
	main_name = list(uploads.keys())[0]
	df = uploads[main_name]

	try:
	plan = plan_from_command(command)
	except Exception as e:
	return None, f"❌ LLM error: {e}", None

	try:
	for act in plan["actions"]:
	df = apply_action(df, uploads, act)
	except Exception as e:
	return None, f"❌ Execution error: {e}", None

	tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
	df.to_csv(tmp.name, index=False)
	return df.head(20), "✅ Success! Download below.", tmp.name

	with gr.Blocks(title="AI CSV Editor (HF, CPU)") as demo:
	gr.Markdown("## AI-powered CSV Editor \n"
	"1. Upload one main CSV (first) plus any lookup tables \n"
	"2. Type a spreadsheet-style instruction \n"
	"3. Download the modified CSV")
	csv_files = gr.Files(file_types=[".csv"], label="Upload CSV file(s)")
	cmd_box = gr.Textbox(lines=2, placeholder="e.g. concat First Last → FullName")
	run_btn = gr.Button("Apply")
	preview = gr.Dataframe(label="Preview (first 20 rows)")
	status = gr.Markdown()
	download = gr.File(label="Download Result")

	run_btn.click(
	run_editor,
	[csv_files, cmd_box],
	[preview, status, download]
	)

	if __name__ == "__main__":
	demo.launch()