codys12 committed on
Commit
db70732
·
verified ·
1 Parent(s): f4e036a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -121
app.py CHANGED
@@ -1,13 +1,11 @@
1
- """NetCom → WooCommerce transformer (Try 2 schema — cleaned async, 100-parallel + de-dupe)
2
- =========================================================================================
3
  *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
4
 
5
- Changes vs previous Try 2
6
- -------------------------
7
- * Cap OpenAI concurrency to **100** with an `asyncio.Semaphore`.
8
- * Prevent duplicate prompts from hitting the API in parallel with an
9
- in-flight `dict[str, asyncio.Future]`.
10
- * Everything else (cache filenames, interface, outputs) stays the same.
11
  """
12
 
13
  from __future__ import annotations
@@ -44,7 +42,7 @@ def _cache_path(p: str) -> Path:
44
 
45
  def _get_cached(p: str) -> str | None:
46
  try:
47
- return json.loads(_cache_path(p).read_text("utf-8"))['response']
48
  except Exception:
49
  return None
50
 
@@ -55,24 +53,24 @@ def _set_cache(p: str, r: str) -> None:
55
  pass
56
 
57
  # -------- Async GPT helpers --------------------------------------------------
58
- _SEM = asyncio.Semaphore(100) # ≤100 concurrent calls
59
  _inflight: dict[str, asyncio.Future] = {} # prompt → Future
60
 
61
  async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
62
- """Single LLM call with on-disk cache, concurrency cap, and de-duplication."""
63
  cached = _get_cached(prompt)
64
  if cached is not None:
65
  return cached
66
 
67
- # — de-dup identical prompts already in flight
68
  running = _inflight.get(prompt)
69
  if running is not None:
70
- return await running # reuse the same Future
71
 
72
  loop = asyncio.get_running_loop()
73
 
74
  async def _call_api() -> str:
75
- async with _SEM: # concurrency limiter
76
  try:
77
  msg = await client.chat.completions.create(
78
  model="gpt-4o-mini",
@@ -80,17 +78,17 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
80
  temperature=0,
81
  )
82
  text = msg.choices[0].message.content
83
- except Exception as exc: # network/auth errors
84
  text = f"Error: {exc}"
85
  _set_cache(prompt, text)
86
  return text
87
 
88
  task = loop.create_task(_call_api())
89
- _inflight[prompt] = task # register
90
  try:
91
  return await task
92
  finally:
93
- _inflight.pop(prompt, None) # clean up even on error
94
 
95
  async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
96
  """Vectorised helper — returns an output list matching *lst* length."""
@@ -100,10 +98,9 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
100
  if isinstance(txt, str) and txt.strip():
101
  idx.append(i)
102
  prompts.append(f"{instruction}\n\nText: {txt}")
103
- if not prompts: # fast-path: all rows empty
104
  return out
105
 
106
- # Gather — duplicate prompts handled inside _gpt_async
107
  responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
108
  for j, val in enumerate(responses):
109
  out[idx[j]] = val
@@ -121,29 +118,46 @@ def _read(path: str) -> pd.DataFrame:
121
  return pd.read_excel(path)
122
  return pd.read_csv(path, encoding="latin1")
123
 
124
- async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
 
 
125
  """Run all LLM batches concurrently and return the five enrichment columns."""
126
  async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
127
  sdesc, ldesc, fobj, fout = await asyncio.gather(
128
- _batch_async(df.get(dcol, "").fillna("").tolist(),
129
- "Create a concise 250-character summary of this course description:", client),
130
- _batch_async(df.get(dcol, "").fillna("").tolist(),
131
- "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:", client),
132
- _batch_async(df.get(ocol, "").fillna("").tolist(),
133
- "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':", client),
134
- _batch_async(df.get(acol, "").fillna("").tolist(),
135
- "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
 
 
 
 
 
 
 
 
 
 
 
 
136
  )
137
- # Prerequisites (some rows empty → default)
 
138
  prereq_raw = df.get(pcol, "").fillna("").tolist()
139
  fpre: list[str] = []
140
  for req in prereq_raw:
141
  if not str(req).strip():
142
  fpre.append(DEFAULT_PREREQ)
143
  else:
144
- formatted = await _batch_async([req],
145
- "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
146
- client)
 
 
147
  fpre.append(formatted[0])
148
 
149
  return sdesc, ldesc, fobj, fout, fpre
@@ -176,10 +190,12 @@ def convert(path: str) -> BytesIO:
176
  sid = first_col("Course SID", "Course SID")
177
 
178
  if dur not in df.columns:
179
- df[dur] = "" # ensure Duration col exists
180
 
181
  # ---------- LLM enrichment (async) -------------------------------------
182
- sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
 
 
183
 
184
  df["Short_Description"] = sdesc
185
  df["Condensed_Description"] = ldesc
@@ -193,105 +209,148 @@ def convert(path: str) -> BytesIO:
193
 
194
  dsorted = df.sort_values(["Course ID", "Course Start Date"])
195
  d_agg = (
196
- dsorted
197
- .groupby("Course ID")["Date_fmt"]
198
  .apply(lambda s: ",".join(s.dropna().unique()))
199
  .reset_index(name="Dates")
200
  )
 
201
  t_agg = (
202
- dsorted
203
- .groupby("Course ID", group_keys=False, include_groups=False)
204
- .apply(lambda g: ",".join(f"{st}-{et} {tz}" for st, et, tz in zip(g["Course Start Time"], g["Course End Time"], g["Time Zone"])))
 
 
 
 
 
 
 
205
  .reset_index(name="Times")
206
  )
207
 
208
  parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
209
 
210
  # ---------- Parent / child product rows --------------------------------
211
- parent = pd.DataFrame({
212
- "Type": "variable",
213
- "SKU": parents["Course ID"],
214
- "Name": parents["Course Name"],
215
- "Published": 1,
216
- "Visibility in catalog": "visible",
217
- "Short description": parents["Short_Description"],
218
- "Description": parents["Condensed_Description"],
219
- "Tax status": "taxable",
220
- "In stock?": 1,
221
- "Stock": 1,
222
- "Sold individually?": 1,
223
- "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
224
- "Categories": "courses",
225
- "Images": parents["Vendor"].map(logos).fillna(""),
226
- "Parent": "",
227
- "Brands": parents["Vendor"],
228
- "Attribute 1 name": "Date",
229
- "Attribute 1 value(s)": parents["Dates"],
230
- "Attribute 1 visible": "visible",
231
- "Attribute 1 global": 1,
232
- "Attribute 2 name": "Location",
233
- "Attribute 2 value(s)": "Virtual",
234
- "Attribute 2 visible": "visible",
235
- "Attribute 2 global": 1,
236
- "Attribute 3 name": "Time",
237
- "Attribute 3 value(s)": parents["Times"],
238
- "Attribute 3 visible": "visible",
239
- "Attribute 3 global": 1,
240
- "Meta: outline": parents["Formatted_Agenda"],
241
- "Meta: days": parents[dur],
242
- "Meta: location": "Virtual",
243
- "Meta: overview": parents["Target Audience"],
244
- "Meta: objectives": parents["Formatted_Objectives"],
245
- "Meta: prerequisites": parents["Formatted_Prerequisites"],
246
- "Meta: agenda": parents["Formatted_Agenda"],
247
- })
248
-
249
- child = pd.DataFrame({
250
- "Type": "variation, virtual",
251
- "SKU": dsorted[sid].astype(str).str.strip(),
252
- "Name": dsorted["Course Name"],
253
- "Published": 1,
254
- "Visibility in catalog": "visible",
255
- "Short description": dsorted["Short_Description"],
256
- "Description": dsorted["Condensed_Description"],
257
- "Tax status": "taxable",
258
- "In stock?": 1,
259
- "Stock": 1,
260
- "Sold individually?": 1,
261
- "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
262
- "Categories": "courses",
263
- "Images": dsorted["Vendor"].map(logos).fillna(""),
264
- "Parent": dsorted["Course ID"],
265
- "Brands": dsorted["Vendor"],
266
- "Attribute 1 name": "Date",
267
- "Attribute 1 value(s)": dsorted["Date_fmt"],
268
- "Attribute 1 visible": "visible",
269
- "Attribute 1 global": 1,
270
- "Attribute 2 name": "Location",
271
- "Attribute 2 value(s)": "Virtual",
272
- "Attribute 2 visible": "visible",
273
- "Attribute 2 global": 1,
274
- "Attribute 3 name": "Time",
275
- "Attribute 3 value(s)": dsorted.apply(lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}", axis=1),
276
- "Attribute 3 visible": "visible",
277
- "Attribute 3 global": 1,
278
- "Meta: outline": dsorted["Formatted_Agenda"],
279
- "Meta: days": dsorted[dur],
280
- "Meta: location": "Virtual",
281
- "Meta: overview": dsorted["Target Audience"],
282
- "Meta: objectives": dsorted["Formatted_Objectives"],
283
- "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
284
- "Meta: agenda": dsorted["Formatted_Agenda"],
285
- })
 
 
 
 
 
 
 
286
 
287
  all_rows = pd.concat([parent, child], ignore_index=True)
288
  order = [
289
- "Type", "SKU", "Name", "Published", "Visibility in catalog", "Short description", "Description",
290
- "Tax status", "In stock?", "Stock", "Sold individually?", "Regular price", "Categories", "Images",
291
- "Parent", "Brands", "Attribute 1 name", "Attribute 1 value(s)", "Attribute 1 visible", "Attribute 1 global",
292
- "Attribute 2 name", "Attribute 2 value(s)", "Attribute 2 visible", "Attribute 2 global", "Attribute 3 name",
293
- "Attribute 3 value(s)", "Attribute 3 visible", "Attribute 3 global", "Meta: outline", "Meta: days", "Meta: location",
294
- "Meta: overview", "Meta: objectives", "Meta: prerequisites", "Meta: agenda",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  ]
296
 
297
  out = BytesIO()
@@ -310,7 +369,9 @@ def process_file(upload: gr.File) -> str:
310
 
311
  ui = gr.Interface(
312
  fn=process_file,
313
- inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
 
 
314
  outputs=gr.File(label="Download WooCommerce CSV"),
315
  title="NetCom → WooCommerce CSV Processor (Try 2)",
316
  description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
 
1
+ """NetCom → WooCommerce transformer (Try 2 schema — 100-parallel + de-dupe, pandas fix)
2
+ ======================================================================================
3
  *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
4
 
5
+ New since the last paste
6
+ ------------------------
7
+ * Fix for older pandas: move `include_groups=False` from `.groupby()` to `.apply()`.
8
+ * Everything else (cache names, concurrency cap, in-flight de-duplication) is unchanged.
 
 
9
  """
10
 
11
  from __future__ import annotations
 
42
 
43
def _get_cached(p: str) -> str | None:
    """Return the cached response stored for prompt *p*, or ``None`` on a miss.

    The cache entry is read as UTF-8 text, parsed as JSON, and its
    ``"response"`` value is returned.  Any failure along the way (locating,
    reading, decoding, or a missing key) is reported as a miss.
    """
    try:
        raw = _cache_path(p).read_text("utf-8")
        entry = json.loads(raw)
        return entry["response"]
    except Exception:
        # Best-effort cache: an unreadable entry simply means "not cached".
        return None
48
 
 
53
  pass
54
 
55
  # -------- Async GPT helpers --------------------------------------------------
56
+ _SEM = asyncio.Semaphore(100) # ≤100 concurrent OpenAI calls
57
  _inflight: dict[str, asyncio.Future] = {} # prompt → Future
58
 
59
  async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
60
+ """Single LLM call with disk cache, concurrency cap, and de-duplication."""
61
  cached = _get_cached(prompt)
62
  if cached is not None:
63
  return cached
64
 
65
+ # De-duplicate identical prompts already in flight
66
  running = _inflight.get(prompt)
67
  if running is not None:
68
+ return await running
69
 
70
  loop = asyncio.get_running_loop()
71
 
72
  async def _call_api() -> str:
73
+ async with _SEM: # concurrency limiter
74
  try:
75
  msg = await client.chat.completions.create(
76
  model="gpt-4o-mini",
 
78
  temperature=0,
79
  )
80
  text = msg.choices[0].message.content
81
+ except Exception as exc:
82
  text = f"Error: {exc}"
83
  _set_cache(prompt, text)
84
  return text
85
 
86
  task = loop.create_task(_call_api())
87
+ _inflight[prompt] = task
88
  try:
89
  return await task
90
  finally:
91
+ _inflight.pop(prompt, None)
92
 
93
  async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
94
  """Vectorised helper — returns an output list matching *lst* length."""
 
98
  if isinstance(txt, str) and txt.strip():
99
  idx.append(i)
100
  prompts.append(f"{instruction}\n\nText: {txt}")
101
+ if not prompts:
102
  return out
103
 
 
104
  responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
105
  for j, val in enumerate(responses):
106
  out[idx[j]] = val
 
118
  return pd.read_excel(path)
119
  return pd.read_csv(path, encoding="latin1")
120
 
121
async def _enrich_dataframe(
    df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    """Run all LLM batches concurrently and return the five enrichment columns.

    Args:
        df: Schedule data; one output value is produced per row.
        dcol: Name of the description column.
        ocol: Name of the objectives column.
        pcol: Name of the prerequisites column.
        acol: Name of the agenda column.

    Returns:
        ``(short_descriptions, condensed_descriptions, objectives, agenda,
        prerequisites)`` — five lists, each aligned with the rows of *df*.
        Rows whose prerequisites cell is blank receive ``DEFAULT_PREREQ``.

    A column that is absent from *df* is treated as entirely blank rather
    than raising, and the prerequisite requests are issued together with the
    other four batches instead of one row at a time.
    """

    def _values(col: str) -> list:
        # ``df.get(col, "")`` hands back the bare default string when the
        # column is missing, which has no ``fillna``; substitute blank rows.
        if col not in df.columns:
            return [""] * len(df)
        return df[col].fillna("").tolist()

    async def _format_prereqs(client: openai.AsyncOpenAI) -> list[str]:
        # Blank rows keep the default text; every other row is sent as its
        # own single-item batch, all of them awaited together.
        raw = _values(pcol)
        pending = [i for i, req in enumerate(raw) if str(req).strip()]
        formatted = await asyncio.gather(
            *[
                _batch_async(
                    [raw[i]],
                    "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
                    client,
                )
                for i in pending
            ]
        )
        result: list[str] = [DEFAULT_PREREQ] * len(raw)
        for i, item in zip(pending, formatted):
            result[i] = item[0]
        return result

    descriptions = _values(dcol)
    async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
        sdesc, ldesc, fobj, fout, fpre = await asyncio.gather(
            _batch_async(
                descriptions,
                "Create a concise 250-character summary of this course description:",
                client,
            ),
            _batch_async(
                descriptions,
                "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:",
                client,
            ),
            _batch_async(
                _values(ocol),
                "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
            _batch_async(
                _values(acol),
                "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
            _format_prereqs(client),
        )

    return sdesc, ldesc, fobj, fout, fpre
 
190
  sid = first_col("Course SID", "Course SID")
191
 
192
  if dur not in df.columns:
193
+ df[dur] = "" # ensure Duration column exists
194
 
195
  # ---------- LLM enrichment (async) -------------------------------------
196
+ sdesc, ldesc, fobj, fout, fpre = asyncio.run(
197
+ _enrich_dataframe(df, dcol, ocol, pcol, acol)
198
+ )
199
 
200
  df["Short_Description"] = sdesc
201
  df["Condensed_Description"] = ldesc
 
209
 
210
  dsorted = df.sort_values(["Course ID", "Course Start Date"])
211
  d_agg = (
212
+ dsorted.groupby("Course ID")["Date_fmt"]
 
213
  .apply(lambda s: ",".join(s.dropna().unique()))
214
  .reset_index(name="Dates")
215
  )
216
+
217
  t_agg = (
218
+ dsorted.groupby("Course ID", group_keys=False)
219
+ .apply(
220
+ lambda g: ",".join(
221
+ f"{st}-{et} {tz}"
222
+ for st, et, tz in zip(
223
+ g["Course Start Time"], g["Course End Time"], g["Time Zone"]
224
+ )
225
+ ),
226
+ include_groups=False, # <- moved here
227
+ )
228
  .reset_index(name="Times")
229
  )
230
 
231
  parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
232
 
233
  # ---------- Parent / child product rows --------------------------------
234
+ parent = pd.DataFrame(
235
+ {
236
+ "Type": "variable",
237
+ "SKU": parents["Course ID"],
238
+ "Name": parents["Course Name"],
239
+ "Published": 1,
240
+ "Visibility in catalog": "visible",
241
+ "Short description": parents["Short_Description"],
242
+ "Description": parents["Condensed_Description"],
243
+ "Tax status": "taxable",
244
+ "In stock?": 1,
245
+ "Stock": 1,
246
+ "Sold individually?": 1,
247
+ "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
248
+ "Categories": "courses",
249
+ "Images": parents["Vendor"].map(logos).fillna(""),
250
+ "Parent": "",
251
+ "Brands": parents["Vendor"],
252
+ "Attribute 1 name": "Date",
253
+ "Attribute 1 value(s)": parents["Dates"],
254
+ "Attribute 1 visible": "visible",
255
+ "Attribute 1 global": 1,
256
+ "Attribute 2 name": "Location",
257
+ "Attribute 2 value(s)": "Virtual",
258
+ "Attribute 2 visible": "visible",
259
+ "Attribute 2 global": 1,
260
+ "Attribute 3 name": "Time",
261
+ "Attribute 3 value(s)": parents["Times"],
262
+ "Attribute 3 visible": "visible",
263
+ "Attribute 3 global": 1,
264
+ "Meta: outline": parents["Formatted_Agenda"],
265
+ "Meta: days": parents[dur],
266
+ "Meta: location": "Virtual",
267
+ "Meta: overview": parents["Target Audience"],
268
+ "Meta: objectives": parents["Formatted_Objectives"],
269
+ "Meta: prerequisites": parents["Formatted_Prerequisites"],
270
+ "Meta: agenda": parents["Formatted_Agenda"],
271
+ }
272
+ )
273
+
274
+ child = pd.DataFrame(
275
+ {
276
+ "Type": "variation, virtual",
277
+ "SKU": dsorted[sid].astype(str).str.strip(),
278
+ "Name": dsorted["Course Name"],
279
+ "Published": 1,
280
+ "Visibility in catalog": "visible",
281
+ "Short description": dsorted["Short_Description"],
282
+ "Description": dsorted["Condensed_Description"],
283
+ "Tax status": "taxable",
284
+ "In stock?": 1,
285
+ "Stock": 1,
286
+ "Sold individually?": 1,
287
+ "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
288
+ "Categories": "courses",
289
+ "Images": dsorted["Vendor"].map(logos).fillna(""),
290
+ "Parent": dsorted["Course ID"],
291
+ "Brands": dsorted["Vendor"],
292
+ "Attribute 1 name": "Date",
293
+ "Attribute 1 value(s)": dsorted["Date_fmt"],
294
+ "Attribute 1 visible": "visible",
295
+ "Attribute 1 global": 1,
296
+ "Attribute 2 name": "Location",
297
+ "Attribute 2 value(s)": "Virtual",
298
+ "Attribute 2 visible": "visible",
299
+ "Attribute 2 global": 1,
300
+ "Attribute 3 name": "Time",
301
+ "Attribute 3 value(s)": dsorted.apply(
302
+ lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
303
+ axis=1,
304
+ ),
305
+ "Attribute 3 visible": "visible",
306
+ "Attribute 3 global": 1,
307
+ "Meta: outline": dsorted["Formatted_Agenda"],
308
+ "Meta: days": dsorted[dur],
309
+ "Meta: location": "Virtual",
310
+ "Meta: overview": dsorted["Target Audience"],
311
+ "Meta: objectives": dsorted["Formatted_Objectives"],
312
+ "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
313
+ "Meta: agenda": dsorted["Formatted_Agenda"],
314
+ }
315
+ )
316
 
317
  all_rows = pd.concat([parent, child], ignore_index=True)
318
  order = [
319
+ "Type",
320
+ "SKU",
321
+ "Name",
322
+ "Published",
323
+ "Visibility in catalog",
324
+ "Short description",
325
+ "Description",
326
+ "Tax status",
327
+ "In stock?",
328
+ "Stock",
329
+ "Sold individually?",
330
+ "Regular price",
331
+ "Categories",
332
+ "Images",
333
+ "Parent",
334
+ "Brands",
335
+ "Attribute 1 name",
336
+ "Attribute 1 value(s)",
337
+ "Attribute 1 visible",
338
+ "Attribute 1 global",
339
+ "Attribute 2 name",
340
+ "Attribute 2 value(s)",
341
+ "Attribute 2 visible",
342
+ "Attribute 2 global",
343
+ "Attribute 3 name",
344
+ "Attribute 3 value(s)",
345
+ "Attribute 3 visible",
346
+ "Attribute 3 global",
347
+ "Meta: outline",
348
+ "Meta: days",
349
+ "Meta: location",
350
+ "Meta: overview",
351
+ "Meta: objectives",
352
+ "Meta: prerequisites",
353
+ "Meta: agenda",
354
  ]
355
 
356
  out = BytesIO()
 
369
 
370
  ui = gr.Interface(
371
  fn=process_file,
372
+ inputs=gr.File(
373
+ label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]
374
+ ),
375
  outputs=gr.File(label="Download WooCommerce CSV"),
376
  title="NetCom → WooCommerce CSV Processor (Try 2)",
377
  description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",