Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,11 @@
|
|
1 |
-
"""NetCom → WooCommerce transformer (Try 2 schema —
|
2 |
-
|
3 |
*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
*
|
8 |
-
*
|
9 |
-
in-flight `dict[str, asyncio.Future]`.
|
10 |
-
* Everything else (cache filenames, interface, outputs) stays the same.
|
11 |
"""
|
12 |
|
13 |
from __future__ import annotations
|
@@ -44,7 +42,7 @@ def _cache_path(p: str) -> Path:
|
|
44 |
|
45 |
def _get_cached(p: str) -> str | None:
|
46 |
try:
|
47 |
-
return json.loads(_cache_path(p).read_text("utf-8"))[
|
48 |
except Exception:
|
49 |
return None
|
50 |
|
@@ -55,24 +53,24 @@ def _set_cache(p: str, r: str) -> None:
|
|
55 |
pass
|
56 |
|
57 |
# -------- Async GPT helpers --------------------------------------------------
|
58 |
-
_SEM = asyncio.Semaphore(100)
|
59 |
_inflight: dict[str, asyncio.Future] = {} # prompt → Future
|
60 |
|
61 |
async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
62 |
-
"""Single LLM call with
|
63 |
cached = _get_cached(prompt)
|
64 |
if cached is not None:
|
65 |
return cached
|
66 |
|
67 |
-
#
|
68 |
running = _inflight.get(prompt)
|
69 |
if running is not None:
|
70 |
-
return await running
|
71 |
|
72 |
loop = asyncio.get_running_loop()
|
73 |
|
74 |
async def _call_api() -> str:
|
75 |
-
async with _SEM:
|
76 |
try:
|
77 |
msg = await client.chat.completions.create(
|
78 |
model="gpt-4o-mini",
|
@@ -80,17 +78,17 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
|
80 |
temperature=0,
|
81 |
)
|
82 |
text = msg.choices[0].message.content
|
83 |
-
except Exception as exc:
|
84 |
text = f"Error: {exc}"
|
85 |
_set_cache(prompt, text)
|
86 |
return text
|
87 |
|
88 |
task = loop.create_task(_call_api())
|
89 |
-
_inflight[prompt] = task
|
90 |
try:
|
91 |
return await task
|
92 |
finally:
|
93 |
-
_inflight.pop(prompt, None)
|
94 |
|
95 |
async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
|
96 |
"""Vectorised helper — returns an output list matching *lst* length."""
|
@@ -100,10 +98,9 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
|
|
100 |
if isinstance(txt, str) and txt.strip():
|
101 |
idx.append(i)
|
102 |
prompts.append(f"{instruction}\n\nText: {txt}")
|
103 |
-
if not prompts:
|
104 |
return out
|
105 |
|
106 |
-
# Gather — duplicate prompts handled inside _gpt_async
|
107 |
responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
|
108 |
for j, val in enumerate(responses):
|
109 |
out[idx[j]] = val
|
@@ -121,29 +118,46 @@ def _read(path: str) -> pd.DataFrame:
|
|
121 |
return pd.read_excel(path)
|
122 |
return pd.read_csv(path, encoding="latin1")
|
123 |
|
124 |
-
async def _enrich_dataframe(
|
|
|
|
|
125 |
"""Run all LLM batches concurrently and return the five enrichment columns."""
|
126 |
async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
|
127 |
sdesc, ldesc, fobj, fout = await asyncio.gather(
|
128 |
-
_batch_async(
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
)
|
137 |
-
|
|
|
138 |
prereq_raw = df.get(pcol, "").fillna("").tolist()
|
139 |
fpre: list[str] = []
|
140 |
for req in prereq_raw:
|
141 |
if not str(req).strip():
|
142 |
fpre.append(DEFAULT_PREREQ)
|
143 |
else:
|
144 |
-
formatted = await _batch_async(
|
145 |
-
|
146 |
-
|
|
|
|
|
147 |
fpre.append(formatted[0])
|
148 |
|
149 |
return sdesc, ldesc, fobj, fout, fpre
|
@@ -176,10 +190,12 @@ def convert(path: str) -> BytesIO:
|
|
176 |
sid = first_col("Course SID", "Course SID")
|
177 |
|
178 |
if dur not in df.columns:
|
179 |
-
df[dur] = "" # ensure Duration
|
180 |
|
181 |
# ---------- LLM enrichment (async) -------------------------------------
|
182 |
-
sdesc, ldesc, fobj, fout, fpre = asyncio.run(
|
|
|
|
|
183 |
|
184 |
df["Short_Description"] = sdesc
|
185 |
df["Condensed_Description"] = ldesc
|
@@ -193,105 +209,148 @@ def convert(path: str) -> BytesIO:
|
|
193 |
|
194 |
dsorted = df.sort_values(["Course ID", "Course Start Date"])
|
195 |
d_agg = (
|
196 |
-
dsorted
|
197 |
-
.groupby("Course ID")["Date_fmt"]
|
198 |
.apply(lambda s: ",".join(s.dropna().unique()))
|
199 |
.reset_index(name="Dates")
|
200 |
)
|
|
|
201 |
t_agg = (
|
202 |
-
dsorted
|
203 |
-
.
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
.reset_index(name="Times")
|
206 |
)
|
207 |
|
208 |
parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
|
209 |
|
210 |
# ---------- Parent / child product rows --------------------------------
|
211 |
-
parent = pd.DataFrame(
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
|
287 |
all_rows = pd.concat([parent, child], ignore_index=True)
|
288 |
order = [
|
289 |
-
"Type",
|
290 |
-
"
|
291 |
-
"
|
292 |
-
"
|
293 |
-
"
|
294 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
]
|
296 |
|
297 |
out = BytesIO()
|
@@ -310,7 +369,9 @@ def process_file(upload: gr.File) -> str:
|
|
310 |
|
311 |
ui = gr.Interface(
|
312 |
fn=process_file,
|
313 |
-
inputs=gr.File(
|
|
|
|
|
314 |
outputs=gr.File(label="Download WooCommerce CSV"),
|
315 |
title="NetCom → WooCommerce CSV Processor (Try 2)",
|
316 |
description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
|
|
|
1 |
+
"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel + de-dupe, pandas fix)
|
2 |
+
======================================================================================
|
3 |
*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
|
4 |
|
5 |
+
New since the last paste
|
6 |
+
------------------------
|
7 |
+
* pandas fix: pass `include_groups=False` to `.apply()` instead of `.groupby()` (it was never a `.groupby()` argument; `DataFrameGroupBy.apply` accepts it from pandas 2.2 onward).
|
8 |
+
* Everything else (cache names, concurrency cap, in-flight de-duplication) is unchanged.
|
|
|
|
|
9 |
"""
|
10 |
|
11 |
from __future__ import annotations
|
|
|
42 |
|
43 |
def _get_cached(p: str) -> str | None:
    """Look up the cached response for prompt *p*.

    The file located by ``_cache_path(p)`` is read as UTF-8 text, parsed as
    JSON, and its ``"response"`` entry is returned.

    Returns:
        The stored response, or ``None`` if any step raises (the lookup is
        best-effort and never propagates an error to the caller).
    """
    try:
        raw = _cache_path(p).read_text("utf-8")
        payload = json.loads(raw)
        response = payload["response"]
    except Exception:
        # Any failure while reading or decoding is treated as a cache miss.
        return None
    return response
|
48 |
|
|
|
53 |
pass
|
54 |
|
55 |
# -------- Async GPT helpers --------------------------------------------------
|
56 |
+
_SEM = asyncio.Semaphore(100) # ≤100 concurrent OpenAI calls
|
57 |
_inflight: dict[str, asyncio.Future] = {} # prompt → Future
|
58 |
|
59 |
async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
60 |
+
"""Single LLM call with disk cache, concurrency cap, and de-duplication."""
|
61 |
cached = _get_cached(prompt)
|
62 |
if cached is not None:
|
63 |
return cached
|
64 |
|
65 |
+
# De-duplicate identical prompts already in flight
|
66 |
running = _inflight.get(prompt)
|
67 |
if running is not None:
|
68 |
+
return await running
|
69 |
|
70 |
loop = asyncio.get_running_loop()
|
71 |
|
72 |
async def _call_api() -> str:
|
73 |
+
async with _SEM: # concurrency limiter
|
74 |
try:
|
75 |
msg = await client.chat.completions.create(
|
76 |
model="gpt-4o-mini",
|
|
|
78 |
temperature=0,
|
79 |
)
|
80 |
text = msg.choices[0].message.content
|
81 |
+
except Exception as exc:
|
82 |
text = f"Error: {exc}"
|
83 |
_set_cache(prompt, text)
|
84 |
return text
|
85 |
|
86 |
task = loop.create_task(_call_api())
|
87 |
+
_inflight[prompt] = task
|
88 |
try:
|
89 |
return await task
|
90 |
finally:
|
91 |
+
_inflight.pop(prompt, None)
|
92 |
|
93 |
async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
|
94 |
"""Vectorised helper — returns an output list matching *lst* length."""
|
|
|
98 |
if isinstance(txt, str) and txt.strip():
|
99 |
idx.append(i)
|
100 |
prompts.append(f"{instruction}\n\nText: {txt}")
|
101 |
+
if not prompts:
|
102 |
return out
|
103 |
|
|
|
104 |
responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
|
105 |
for j, val in enumerate(responses):
|
106 |
out[idx[j]] = val
|
|
|
118 |
return pd.read_excel(path)
|
119 |
return pd.read_csv(path, encoding="latin1")
|
120 |
|
121 |
+
async def _enrich_dataframe(
    df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    """Run all LLM batches concurrently and return the five enrichment columns.

    Args:
        df: Schedule rows to enrich.
        dcol: Name of the course-description column.
        ocol: Name of the objectives column.
        pcol: Name of the prerequisites column.
        acol: Name of the agenda column.

    Returns:
        A 5-tuple ``(sdesc, ldesc, fobj, fout, fpre)``: short descriptions,
        condensed descriptions, formatted objectives, formatted agenda and
        formatted prerequisites. Rows whose prerequisite text is blank get
        ``DEFAULT_PREREQ`` instead of an LLM result.
    """

    def _texts(col: str) -> list:
        # ``df.get(col, "")`` would hand back a plain ``str`` for a missing
        # column and ``.fillna`` would then raise AttributeError; treat a
        # missing column as all-empty instead.
        series = df.get(col)
        if series is None:
            return [""] * len(df)
        return series.fillna("").tolist()

    prereq_raw = _texts(pcol)

    async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
        # All five batches are gathered together. Prerequisites used to be
        # awaited one row at a time in a loop, which serialised those calls.
        sdesc, ldesc, fobj, fout, pre_fmt = await asyncio.gather(
            _batch_async(
                _texts(dcol),
                "Create a concise 250-character summary of this course description:",
                client,
            ),
            _batch_async(
                _texts(dcol),
                "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:",
                client,
            ),
            _batch_async(
                _texts(ocol),
                "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
            _batch_async(
                _texts(acol),
                "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
            _batch_async(
                list(prereq_raw),  # copy: prereq_raw is re-read below
                "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
        )

    # Prerequisites (some rows empty → default text)
    fpre: list[str] = [
        DEFAULT_PREREQ if not str(req).strip() else formatted
        for req, formatted in zip(prereq_raw, pre_fmt)
    ]

    return sdesc, ldesc, fobj, fout, fpre
|
|
|
190 |
sid = first_col("Course SID", "Course SID")
|
191 |
|
192 |
if dur not in df.columns:
|
193 |
+
df[dur] = "" # ensure Duration column exists
|
194 |
|
195 |
# ---------- LLM enrichment (async) -------------------------------------
|
196 |
+
sdesc, ldesc, fobj, fout, fpre = asyncio.run(
|
197 |
+
_enrich_dataframe(df, dcol, ocol, pcol, acol)
|
198 |
+
)
|
199 |
|
200 |
df["Short_Description"] = sdesc
|
201 |
df["Condensed_Description"] = ldesc
|
|
|
209 |
|
210 |
dsorted = df.sort_values(["Course ID", "Course Start Date"])
|
211 |
d_agg = (
|
212 |
+
dsorted.groupby("Course ID")["Date_fmt"]
|
|
|
213 |
.apply(lambda s: ",".join(s.dropna().unique()))
|
214 |
.reset_index(name="Dates")
|
215 |
)
|
216 |
+
|
217 |
t_agg = (
|
218 |
+
dsorted.groupby("Course ID", group_keys=False)
|
219 |
+
.apply(
|
220 |
+
lambda g: ",".join(
|
221 |
+
f"{st}-{et} {tz}"
|
222 |
+
for st, et, tz in zip(
|
223 |
+
g["Course Start Time"], g["Course End Time"], g["Time Zone"]
|
224 |
+
)
|
225 |
+
),
|
226 |
+
include_groups=False, # <- moved here
|
227 |
+
)
|
228 |
.reset_index(name="Times")
|
229 |
)
|
230 |
|
231 |
parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
|
232 |
|
233 |
# ---------- Parent / child product rows --------------------------------
|
234 |
+
parent = pd.DataFrame(
|
235 |
+
{
|
236 |
+
"Type": "variable",
|
237 |
+
"SKU": parents["Course ID"],
|
238 |
+
"Name": parents["Course Name"],
|
239 |
+
"Published": 1,
|
240 |
+
"Visibility in catalog": "visible",
|
241 |
+
"Short description": parents["Short_Description"],
|
242 |
+
"Description": parents["Condensed_Description"],
|
243 |
+
"Tax status": "taxable",
|
244 |
+
"In stock?": 1,
|
245 |
+
"Stock": 1,
|
246 |
+
"Sold individually?": 1,
|
247 |
+
"Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
|
248 |
+
"Categories": "courses",
|
249 |
+
"Images": parents["Vendor"].map(logos).fillna(""),
|
250 |
+
"Parent": "",
|
251 |
+
"Brands": parents["Vendor"],
|
252 |
+
"Attribute 1 name": "Date",
|
253 |
+
"Attribute 1 value(s)": parents["Dates"],
|
254 |
+
"Attribute 1 visible": "visible",
|
255 |
+
"Attribute 1 global": 1,
|
256 |
+
"Attribute 2 name": "Location",
|
257 |
+
"Attribute 2 value(s)": "Virtual",
|
258 |
+
"Attribute 2 visible": "visible",
|
259 |
+
"Attribute 2 global": 1,
|
260 |
+
"Attribute 3 name": "Time",
|
261 |
+
"Attribute 3 value(s)": parents["Times"],
|
262 |
+
"Attribute 3 visible": "visible",
|
263 |
+
"Attribute 3 global": 1,
|
264 |
+
"Meta: outline": parents["Formatted_Agenda"],
|
265 |
+
"Meta: days": parents[dur],
|
266 |
+
"Meta: location": "Virtual",
|
267 |
+
"Meta: overview": parents["Target Audience"],
|
268 |
+
"Meta: objectives": parents["Formatted_Objectives"],
|
269 |
+
"Meta: prerequisites": parents["Formatted_Prerequisites"],
|
270 |
+
"Meta: agenda": parents["Formatted_Agenda"],
|
271 |
+
}
|
272 |
+
)
|
273 |
+
|
274 |
+
child = pd.DataFrame(
|
275 |
+
{
|
276 |
+
"Type": "variation, virtual",
|
277 |
+
"SKU": dsorted[sid].astype(str).str.strip(),
|
278 |
+
"Name": dsorted["Course Name"],
|
279 |
+
"Published": 1,
|
280 |
+
"Visibility in catalog": "visible",
|
281 |
+
"Short description": dsorted["Short_Description"],
|
282 |
+
"Description": dsorted["Condensed_Description"],
|
283 |
+
"Tax status": "taxable",
|
284 |
+
"In stock?": 1,
|
285 |
+
"Stock": 1,
|
286 |
+
"Sold individually?": 1,
|
287 |
+
"Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
|
288 |
+
"Categories": "courses",
|
289 |
+
"Images": dsorted["Vendor"].map(logos).fillna(""),
|
290 |
+
"Parent": dsorted["Course ID"],
|
291 |
+
"Brands": dsorted["Vendor"],
|
292 |
+
"Attribute 1 name": "Date",
|
293 |
+
"Attribute 1 value(s)": dsorted["Date_fmt"],
|
294 |
+
"Attribute 1 visible": "visible",
|
295 |
+
"Attribute 1 global": 1,
|
296 |
+
"Attribute 2 name": "Location",
|
297 |
+
"Attribute 2 value(s)": "Virtual",
|
298 |
+
"Attribute 2 visible": "visible",
|
299 |
+
"Attribute 2 global": 1,
|
300 |
+
"Attribute 3 name": "Time",
|
301 |
+
"Attribute 3 value(s)": dsorted.apply(
|
302 |
+
lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
|
303 |
+
axis=1,
|
304 |
+
),
|
305 |
+
"Attribute 3 visible": "visible",
|
306 |
+
"Attribute 3 global": 1,
|
307 |
+
"Meta: outline": dsorted["Formatted_Agenda"],
|
308 |
+
"Meta: days": dsorted[dur],
|
309 |
+
"Meta: location": "Virtual",
|
310 |
+
"Meta: overview": dsorted["Target Audience"],
|
311 |
+
"Meta: objectives": dsorted["Formatted_Objectives"],
|
312 |
+
"Meta: prerequisites": dsorted["Formatted_Prerequisites"],
|
313 |
+
"Meta: agenda": dsorted["Formatted_Agenda"],
|
314 |
+
}
|
315 |
+
)
|
316 |
|
317 |
all_rows = pd.concat([parent, child], ignore_index=True)
|
318 |
order = [
|
319 |
+
"Type",
|
320 |
+
"SKU",
|
321 |
+
"Name",
|
322 |
+
"Published",
|
323 |
+
"Visibility in catalog",
|
324 |
+
"Short description",
|
325 |
+
"Description",
|
326 |
+
"Tax status",
|
327 |
+
"In stock?",
|
328 |
+
"Stock",
|
329 |
+
"Sold individually?",
|
330 |
+
"Regular price",
|
331 |
+
"Categories",
|
332 |
+
"Images",
|
333 |
+
"Parent",
|
334 |
+
"Brands",
|
335 |
+
"Attribute 1 name",
|
336 |
+
"Attribute 1 value(s)",
|
337 |
+
"Attribute 1 visible",
|
338 |
+
"Attribute 1 global",
|
339 |
+
"Attribute 2 name",
|
340 |
+
"Attribute 2 value(s)",
|
341 |
+
"Attribute 2 visible",
|
342 |
+
"Attribute 2 global",
|
343 |
+
"Attribute 3 name",
|
344 |
+
"Attribute 3 value(s)",
|
345 |
+
"Attribute 3 visible",
|
346 |
+
"Attribute 3 global",
|
347 |
+
"Meta: outline",
|
348 |
+
"Meta: days",
|
349 |
+
"Meta: location",
|
350 |
+
"Meta: overview",
|
351 |
+
"Meta: objectives",
|
352 |
+
"Meta: prerequisites",
|
353 |
+
"Meta: agenda",
|
354 |
]
|
355 |
|
356 |
out = BytesIO()
|
|
|
369 |
|
370 |
ui = gr.Interface(
|
371 |
fn=process_file,
|
372 |
+
inputs=gr.File(
|
373 |
+
label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]
|
374 |
+
),
|
375 |
outputs=gr.File(label="Download WooCommerce CSV"),
|
376 |
title="NetCom → WooCommerce CSV Processor (Try 2)",
|
377 |
description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
|