Update app.py
Browse files
app.py
CHANGED
@@ -1,64 +1,179 @@
|
|
1 |
-
|
2 |
-
from huggingface_hub import InferenceClient
|
3 |
-
|
4 |
"""
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"""
|
7 |
-
client = InferenceClient("Qwen/Qwen2.5-Coder-32B-Instruct")
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
system_message,
|
14 |
-
max_tokens,
|
15 |
-
temperature,
|
16 |
-
top_p,
|
17 |
-
):
|
18 |
-
messages = [{"role": "system", "content": system_message}]
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
25 |
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
top_p=top_p,
|
36 |
-
):
|
37 |
-
token = message.choices[0].delta.content
|
38 |
|
39 |
-
|
40 |
-
|
|
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
"""
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
]
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
if __name__ == "__main__":
|
64 |
demo.launch()
|
|
|
1 |
+
#!/usr/bin/env python3
|
|
|
|
|
2 |
"""
|
3 |
+
ai_csv_editor_hf.py ββ AI-powered CSV editor using a Hugging Face model on CPU.
|
4 |
+
|
5 |
+
Features:
|
6 |
+
- Upload one or more CSV files (main + optional lookup tables)
|
7 |
+
- Type spreadsheet-style commands: CONCAT, VLOOKUP, XLOOKUP, SUMIF
|
8 |
+
- LLM (google/flan-t5-base) converts commands β JSON βedit planβ
|
9 |
+
- pandas applies each action in sequence
|
10 |
+
- Preview first 20 rows & download modified CSV
|
11 |
"""
|
|
|
12 |
|
13 |
+
import json
|
14 |
+
import io
|
15 |
+
import tempfile
|
16 |
+
import textwrap
|
17 |
+
import pathlib
|
18 |
+
from typing import List, Dict, Any
|
19 |
|
20 |
+
import pandas as pd
|
21 |
+
import gradio as gr
|
22 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
# ----------------------------------------------------------
# 1. LOAD A SMALL INSTRUCTION-FOLLOWING MODEL (CPU only)
# ----------------------------------------------------------
MODEL_NAME = "google/flan-t5-base"  # small seq2seq model; fits in CPU RAM
MAX_NEW_TOK = 256                   # cap on tokens generated per edit plan
TEMPERATURE = 0.0                   # deterministic decoding

# Load tokenizer + model once at import time so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",  # force CPU
    torch_dtype="auto"
)
# text2text-generation pipeline wraps tokenize -> generate -> decode.
generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 = CPU
)
|
43 |
|
44 |
# ----------------------------------------------------------
# 2. PROMPT -> JSON "EDIT PLAN"
# ----------------------------------------------------------
SYSTEM_PROMPT = textwrap.dedent("""\
    You are an assistant that converts natural-language spreadsheet commands
    into JSON edit plans. Respond with ONLY valid JSON matching this schema:

    {
      "actions": [
        {
          "operation": "concat | vlookup | xlookup | sumif",
          "target": "string",

          # For CONCAT:
          "columns": ["colA","colB"],
          "separator": " ",

          # For VLOOKUP / XLOOKUP:
          "lookup_value": "KeyInMain",
          "lookup_file": "other.csv",
          "lookup_column": "KeyInOther",
          "return_column": "Value",
          "exact": true,

          # For SUMIF:
          "criteria_column": "Category",
          "criteria": "Foo",
          "sum_column": "Amount"
        }
      ]
    }
    """)

def plan_from_command(cmd: str, gen=None, max_new_tokens=None) -> Dict[str, Any]:
    """Convert a natural-language spreadsheet command into a JSON edit plan.

    Parameters
    ----------
    cmd : str
        The user's instruction (e.g. "concat First Last -> FullName").
    gen : callable, optional
        A text2text-generation callable; defaults to the module-level
        ``generator`` pipeline.  Injectable for testing.
    max_new_tokens : int, optional
        Generation cap; defaults to the module-level ``MAX_NEW_TOK``.

    Returns
    -------
    dict
        The parsed edit plan (``{"actions": [...]}``).

    Raises
    ------
    ValueError
        If the model output contains no parseable JSON object.
    """
    if gen is None:
        gen = generator  # module-level HF pipeline
    if max_new_tokens is None:
        max_new_tokens = MAX_NEW_TOK

    prompt = f"{SYSTEM_PROMPT}\n\nUser: {cmd}\nJSON:"
    # do_sample=False already makes decoding greedy/deterministic; passing
    # temperature alongside it is contradictory and triggers warnings (or
    # errors for temperature=0.0) in recent transformers releases, so the
    # temperature kwarg is intentionally omitted.
    output = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )[0]["generated_text"]

    try:
        return json.loads(output)
    except json.JSONDecodeError as e:
        # Small instruction models often wrap the JSON in extra prose;
        # salvage the outermost {...} span before giving up.
        start, end = output.find("{"), output.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(output[start:end + 1])
            except json.JSONDecodeError:
                pass
        raise ValueError(f"Model returned invalid JSON:\n{output}") from e
|
89 |
+
|
90 |
# ----------------------------------------------------------
# 3. DATA OPERATIONS
# ----------------------------------------------------------
def apply_action(df: pd.DataFrame,
                 uploads: Dict[str, pd.DataFrame],
                 act: Dict[str, Any]) -> pd.DataFrame:
    """Apply one edit-plan action to ``df`` and return the resulting frame.

    Parameters
    ----------
    df : pd.DataFrame
        The main dataframe being edited.
    uploads : dict
        Filename -> dataframe map of every uploaded CSV; vlookup/xlookup
        use it to locate the lookup table.
    act : dict
        One action from the LLM plan; ``act["operation"]`` selects the
        behavior (concat / vlookup / xlookup / sumif).

    Raises
    ------
    ValueError
        For an unrecognized operation.
    KeyError
        For a lookup file that was never uploaded (with a helpful message).
    """
    # Users type commands in caps (CONCAT, VLOOKUP, ...) and the LLM may
    # echo that capitalization, so normalize before dispatching.
    op = str(act["operation"]).lower()

    if op == "concat":
        sep = act.get("separator", "")
        # Join the selected columns row-wise as strings.
        df[act["target"]] = (
            df[act["columns"]]
            .astype(str)
            .agg(sep.join, axis=1)
        )

    elif op in {"vlookup", "xlookup"}:
        try:
            lookup_df = uploads[act["lookup_file"]]
        except KeyError:
            raise KeyError(
                f"Lookup file {act['lookup_file']!r} was not uploaded; "
                f"available: {sorted(uploads)}"
            ) from None
        # Select only the two relevant columns and rename them so the
        # left merge keys on the main frame's column directly.
        right = lookup_df[[act["lookup_column"], act["return_column"]]].rename(
            columns={
                act["lookup_column"]: act["lookup_value"],
                act["return_column"]: act["target"],
            }
        )
        df = df.merge(right, on=act["lookup_value"], how="left")

    elif op == "sumif":
        # Sum `sum_column` over the matching rows, then broadcast the
        # scalar into the target column (spreadsheet SUMIF semantics).
        mask = df[act["criteria_column"]] == act["criteria"]
        total = df.loc[mask, act["sum_column"]].sum()
        df[act["target"]] = total

    else:
        raise ValueError(f"Unsupported operation: {op}")

    return df
|
125 |
+
|
126 |
# ----------------------------------------------------------
# 4. GRADIO UI
# ----------------------------------------------------------
def run_editor(files, command: str):
    """Gradio callback: load CSVs, run the LLM edit plan, return results.

    Parameters
    ----------
    files : list
        Uploaded file objects from gr.Files (each exposes a ``.name`` path).
    command : str
        The user's spreadsheet-style instruction.

    Returns
    -------
    tuple
        (preview_dataframe, status_markdown, download_path); the first and
        last elements are None on failure.
    """
    if not files:
        return None, "⚠️ Please upload at least one CSV file.", None
    # Guard before touching the files or the model: an empty command would
    # otherwise waste an LLM call and produce a confusing error.
    if not command or not command.strip():
        return None, "⚠️ Please type a command first.", None

    # Load uploaded CSVs into a dictionary keyed by basename.
    # NOTE(review): two uploads sharing a basename overwrite each other
    # here — confirm that is acceptable for this UI.
    uploads = {
        pathlib.Path(f.name).name: pd.read_csv(f.name)
        for f in files
    }
    # Treat the first file as the main dataset.
    main_name = next(iter(uploads))
    df = uploads[main_name]

    # Ask the LLM for an edit plan.
    try:
        plan = plan_from_command(command)
    except Exception as e:
        return None, f"❌ LLM error: {e}", None

    # Apply each planned action in sequence.
    try:
        for act in plan["actions"]:
            df = apply_action(df, uploads, act)
    except Exception as e:
        return None, f"❌ Execution error: {e}", None

    # Write the modified CSV to a temp file; gradio serves it by path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    df.to_csv(tmp.name, index=False)
    tmp.close()  # release the handle so the file can be re-opened (Windows)
    return df.head(20), "✅ Success! Download below.", tmp.name
|
159 |
+
|
160 |
# Build the UI declaratively; component creation order defines the layout.
with gr.Blocks(title="AI CSV Editor (HF, CPU)") as demo:
    # Header / usage instructions (trailing two spaces force Markdown line breaks).
    gr.Markdown("## AI-powered CSV Editor  \n"
                "1. Upload one main CSV (first) plus any lookup tables  \n"
                "2. Type a spreadsheet-style instruction  \n"
                "3. Download the modified CSV")
    # Inputs: one or more CSVs plus a free-text command box.
    csv_files = gr.Files(file_types=[".csv"], label="Upload CSV file(s)")
    cmd_box = gr.Textbox(lines=2, placeholder="e.g. concat First Last → FullName")
    run_btn = gr.Button("Apply")
    # Outputs: preview table, status message, and a downloadable result file.
    preview = gr.Dataframe(label="Preview (first 20 rows)")
    status = gr.Markdown()
    download = gr.File(label="Download Result")

    # Wire the button to the processing callback defined above.
    run_btn.click(
        fn=run_editor,
        inputs=[csv_files, cmd_box],
        outputs=[preview, status, download]
    )

if __name__ == "__main__":
    demo.launch()
|