Spaces:

codys12
/

NetCom-to-WooComerce

Runtime error

App Files Files Community

codys12 committed on May 14

Commit

db70732

verified ·

1 Parent(s): f4e036a

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -121

app.py CHANGED Viewed

@@ -1,13 +1,11 @@
-"""NetCom → WooCommerce transformer (Try 2 schema — cleaned async, 100-parallel + de-dupe)
-=========================================================================================
 *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
-Changes vs previous Try 2
--------------------------
-* Cap OpenAI concurrency to **100** with an `asyncio.Semaphore`.
-* Prevent duplicate prompts from hitting the API in parallel with an
-  in-flight `dict[str, asyncio.Future]`.
-* Everything else (cache filenames, interface, outputs) stays the same.
 """
 from __future__ import annotations
@@ -44,7 +42,7 @@ def _cache_path(p: str) -> Path:
 def _get_cached(p: str) -> str | None:
     try:
-        return json.loads(_cache_path(p).read_text("utf-8"))['response']
     except Exception:
         return None
@@ -55,24 +53,24 @@ def _set_cache(p: str, r: str) -> None:
         pass
 # -------- Async GPT helpers --------------------------------------------------
-_SEM = asyncio.Semaphore(100)          # ≤100 concurrent calls
 _inflight: dict[str, asyncio.Future] = {}  # prompt → Future
 async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
-    """Single LLM call with on-disk cache, concurrency cap, and de-duplication."""
     cached = _get_cached(prompt)
     if cached is not None:
         return cached
-    # — de-dup identical prompts already in flight —
     running = _inflight.get(prompt)
     if running is not None:
-        return await running               # reuse the same Future
     loop = asyncio.get_running_loop()
     async def _call_api() -> str:
-        async with _SEM:                   # concurrency limiter
             try:
                 msg = await client.chat.completions.create(
                     model="gpt-4o-mini",
@@ -80,17 +78,17 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
                     temperature=0,
                 )
                 text = msg.choices[0].message.content
-            except Exception as exc:       # network/auth errors
                 text = f"Error: {exc}"
             _set_cache(prompt, text)
             return text
     task = loop.create_task(_call_api())
-    _inflight[prompt] = task               # register
     try:
         return await task
     finally:
-        _inflight.pop(prompt, None)        # clean up even on error
 async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
     """Vectorised helper — returns an output list matching *lst* length."""
@@ -100,10 +98,9 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
         if isinstance(txt, str) and txt.strip():
             idx.append(i)
             prompts.append(f"{instruction}\n\nText: {txt}")
-    if not prompts:                         # fast-path: all rows empty
         return out
-    # Gather — duplicate prompts handled inside _gpt_async
     responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
     for j, val in enumerate(responses):
         out[idx[j]] = val
@@ -121,29 +118,46 @@ def _read(path: str) -> pd.DataFrame:
         return pd.read_excel(path)
     return pd.read_csv(path, encoding="latin1")
-async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
     """Run all LLM batches concurrently and return the five enrichment columns."""
     async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
         sdesc, ldesc, fobj, fout = await asyncio.gather(
-            _batch_async(df.get(dcol, "").fillna("").tolist(),
-                         "Create a concise 250-character summary of this course description:", client),
-            _batch_async(df.get(dcol, "").fillna("").tolist(),
-                         "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:", client),
-            _batch_async(df.get(ocol, "").fillna("").tolist(),
-                         "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':", client),
-            _batch_async(df.get(acol, "").fillna("").tolist(),
-                         "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
         )
-        # Prerequisites (some rows empty → default)
         prereq_raw = df.get(pcol, "").fillna("").tolist()
         fpre: list[str] = []
         for req in prereq_raw:
             if not str(req).strip():
                 fpre.append(DEFAULT_PREREQ)
             else:
-                formatted = await _batch_async([req],
-                                               "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
-                                               client)
                 fpre.append(formatted[0])
     return sdesc, ldesc, fobj, fout, fpre
@@ -176,10 +190,12 @@ def convert(path: str) -> BytesIO:
     sid = first_col("Course SID", "ï»¿Course SID")
     if dur not in df.columns:
-        df[dur] = ""  # ensure Duration col exists
     # ---------- LLM enrichment (async) -------------------------------------
-    sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
     df["Short_Description"] = sdesc
     df["Condensed_Description"] = ldesc
@@ -193,105 +209,148 @@ def convert(path: str) -> BytesIO:
     dsorted = df.sort_values(["Course ID", "Course Start Date"])
     d_agg = (
-        dsorted
-        .groupby("Course ID")["Date_fmt"]
         .apply(lambda s: ",".join(s.dropna().unique()))
         .reset_index(name="Dates")
     )
     t_agg = (
-        dsorted
-        .groupby("Course ID", group_keys=False, include_groups=False)
-        .apply(lambda g: ",".join(f"{st}-{et} {tz}" for st, et, tz in zip(g["Course Start Time"], g["Course End Time"], g["Time Zone"])))
         .reset_index(name="Times")
     )
     parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
     # ---------- Parent / child product rows --------------------------------
-    parent = pd.DataFrame({
-        "Type": "variable",
-        "SKU": parents["Course ID"],
-        "Name": parents["Course Name"],
-        "Published": 1,
-        "Visibility in catalog": "visible",
-        "Short description": parents["Short_Description"],
-        "Description": parents["Condensed_Description"],
-        "Tax status": "taxable",
-        "In stock?": 1,
-        "Stock": 1,
-        "Sold individually?": 1,
-        "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
-        "Categories": "courses",
-        "Images": parents["Vendor"].map(logos).fillna(""),
-        "Parent": "",
-        "Brands": parents["Vendor"],
-        "Attribute 1 name": "Date",
-        "Attribute 1 value(s)": parents["Dates"],
-        "Attribute 1 visible": "visible",
-        "Attribute 1 global": 1,
-        "Attribute 2 name": "Location",
-        "Attribute 2 value(s)": "Virtual",
-        "Attribute 2 visible": "visible",
-        "Attribute 2 global": 1,
-        "Attribute 3 name": "Time",
-        "Attribute 3 value(s)": parents["Times"],
-        "Attribute 3 visible": "visible",
-        "Attribute 3 global": 1,
-        "Meta: outline": parents["Formatted_Agenda"],
-        "Meta: days": parents[dur],
-        "Meta: location": "Virtual",
-        "Meta: overview": parents["Target Audience"],
-        "Meta: objectives": parents["Formatted_Objectives"],
-        "Meta: prerequisites": parents["Formatted_Prerequisites"],
-        "Meta: agenda": parents["Formatted_Agenda"],
-    })
-    child = pd.DataFrame({
-        "Type": "variation, virtual",
-        "SKU": dsorted[sid].astype(str).str.strip(),
-        "Name": dsorted["Course Name"],
-        "Published": 1,
-        "Visibility in catalog": "visible",
-        "Short description": dsorted["Short_Description"],
-        "Description": dsorted["Condensed_Description"],
-        "Tax status": "taxable",
-        "In stock?": 1,
-        "Stock": 1,
-        "Sold individually?": 1,
-        "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
-        "Categories": "courses",
-        "Images": dsorted["Vendor"].map(logos).fillna(""),
-        "Parent": dsorted["Course ID"],
-        "Brands": dsorted["Vendor"],
-        "Attribute 1 name": "Date",
-        "Attribute 1 value(s)": dsorted["Date_fmt"],
-        "Attribute 1 visible": "visible",
-        "Attribute 1 global": 1,
-        "Attribute 2 name": "Location",
-        "Attribute 2 value(s)": "Virtual",
-        "Attribute 2 visible": "visible",
-        "Attribute 2 global": 1,
-        "Attribute 3 name": "Time",
-        "Attribute 3 value(s)": dsorted.apply(lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}", axis=1),
-        "Attribute 3 visible": "visible",
-        "Attribute 3 global": 1,
-        "Meta: outline": dsorted["Formatted_Agenda"],
-        "Meta: days": dsorted[dur],
-        "Meta: location": "Virtual",
-        "Meta: overview": dsorted["Target Audience"],
-        "Meta: objectives": dsorted["Formatted_Objectives"],
-        "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
-        "Meta: agenda": dsorted["Formatted_Agenda"],
-    })
     all_rows = pd.concat([parent, child], ignore_index=True)
     order = [
-        "Type", "SKU", "Name", "Published", "Visibility in catalog", "Short description", "Description",
-        "Tax status", "In stock?", "Stock", "Sold individually?", "Regular price", "Categories", "Images",
-        "Parent", "Brands", "Attribute 1 name", "Attribute 1 value(s)", "Attribute 1 visible", "Attribute 1 global",
-        "Attribute 2 name", "Attribute 2 value(s)", "Attribute 2 visible", "Attribute 2 global", "Attribute 3 name",
-        "Attribute 3 value(s)", "Attribute 3 visible", "Attribute 3 global", "Meta: outline", "Meta: days", "Meta: location",
-        "Meta: overview", "Meta: objectives", "Meta: prerequisites", "Meta: agenda",
     ]
     out = BytesIO()
@@ -310,7 +369,9 @@ def process_file(upload: gr.File) -> str:
 ui = gr.Interface(
     fn=process_file,
-    inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
     outputs=gr.File(label="Download WooCommerce CSV"),
     title="NetCom → WooCommerce CSV Processor (Try 2)",
     description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",

+"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel + de-dupe, pandas fix)
+======================================================================================
 *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
+New since the last paste
+------------------------
+* Fix a pandas `TypeError`: move `include_groups=False` from `.groupby()` (which does not accept it) to `.apply()` (which supports it from pandas 2.2 onward).
+* Everything else (cache names, concurrency cap, in-flight de-duplication) is unchanged.
 """
 from __future__ import annotations
 def _get_cached(p: str) -> str | None:
     try:
+        return json.loads(_cache_path(p).read_text("utf-8"))["response"]
     except Exception:
         return None
         pass
 # -------- Async GPT helpers --------------------------------------------------
+_SEM = asyncio.Semaphore(100)              # ≤100 concurrent OpenAI calls
 _inflight: dict[str, asyncio.Future] = {}  # prompt → Future
 async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
+    """Single LLM call with disk cache, concurrency cap, and de-duplication."""
     cached = _get_cached(prompt)
     if cached is not None:
         return cached
+    # De-duplicate identical prompts already in flight
     running = _inflight.get(prompt)
     if running is not None:
+        return await running
     loop = asyncio.get_running_loop()
     async def _call_api() -> str:
+        async with _SEM:  # concurrency limiter
             try:
                 msg = await client.chat.completions.create(
                     model="gpt-4o-mini",
                     temperature=0,
                 )
                 text = msg.choices[0].message.content
+            except Exception as exc:
                 text = f"Error: {exc}"
             _set_cache(prompt, text)
             return text
     task = loop.create_task(_call_api())
+    _inflight[prompt] = task
     try:
         return await task
     finally:
+        _inflight.pop(prompt, None)
 async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
     """Vectorised helper — returns an output list matching *lst* length."""
         if isinstance(txt, str) and txt.strip():
             idx.append(i)
             prompts.append(f"{instruction}\n\nText: {txt}")
+    if not prompts:
         return out
     responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
     for j, val in enumerate(responses):
         out[idx[j]] = val
         return pd.read_excel(path)
     return pd.read_csv(path, encoding="latin1")
+async def _enrich_dataframe(
+    df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
+) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
     """Run all LLM batches concurrently and return the five enrichment columns."""
     async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
         sdesc, ldesc, fobj, fout = await asyncio.gather(
+            _batch_async(
+                df.get(dcol, "").fillna("").tolist(),
+                "Create a concise 250-character summary of this course description:",
+                client,
+            ),
+            _batch_async(
+                df.get(dcol, "").fillna("").tolist(),
+                "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:",
+                client,
+            ),
+            _batch_async(
+                df.get(ocol, "").fillna("").tolist(),
+                "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':",
+                client,
+            ),
+            _batch_async(
+                df.get(acol, "").fillna("").tolist(),
+                "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':",
+                client,
+            ),
         )
+        # Prerequisites (some rows empty → default text)
         prereq_raw = df.get(pcol, "").fillna("").tolist()
         fpre: list[str] = []
         for req in prereq_raw:
             if not str(req).strip():
                 fpre.append(DEFAULT_PREREQ)
             else:
+                formatted = await _batch_async(
+                    [req],
+                    "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
+                    client,
+                )
                 fpre.append(formatted[0])
     return sdesc, ldesc, fobj, fout, fpre
     sid = first_col("Course SID", "ï»¿Course SID")
     if dur not in df.columns:
+        df[dur] = ""  # ensure Duration column exists
     # ---------- LLM enrichment (async) -------------------------------------
+    sdesc, ldesc, fobj, fout, fpre = asyncio.run(
+        _enrich_dataframe(df, dcol, ocol, pcol, acol)
+    )
     df["Short_Description"] = sdesc
     df["Condensed_Description"] = ldesc
     dsorted = df.sort_values(["Course ID", "Course Start Date"])
     d_agg = (
+        dsorted.groupby("Course ID")["Date_fmt"]
         .apply(lambda s: ",".join(s.dropna().unique()))
         .reset_index(name="Dates")
     )
     t_agg = (
+        dsorted.groupby("Course ID", group_keys=False)
+        .apply(
+            lambda g: ",".join(
+                f"{st}-{et} {tz}"
+                for st, et, tz in zip(
+                    g["Course Start Time"], g["Course End Time"], g["Time Zone"]
+                )
+            ),
+            include_groups=False,  # <- moved here
+        )
         .reset_index(name="Times")
     )
     parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
     # ---------- Parent / child product rows --------------------------------
+    parent = pd.DataFrame(
+        {
+            "Type": "variable",
+            "SKU": parents["Course ID"],
+            "Name": parents["Course Name"],
+            "Published": 1,
+            "Visibility in catalog": "visible",
+            "Short description": parents["Short_Description"],
+            "Description": parents["Condensed_Description"],
+            "Tax status": "taxable",
+            "In stock?": 1,
+            "Stock": 1,
+            "Sold individually?": 1,
+            "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
+            "Categories": "courses",
+            "Images": parents["Vendor"].map(logos).fillna(""),
+            "Parent": "",
+            "Brands": parents["Vendor"],
+            "Attribute 1 name": "Date",
+            "Attribute 1 value(s)": parents["Dates"],
+            "Attribute 1 visible": "visible",
+            "Attribute 1 global": 1,
+            "Attribute 2 name": "Location",
+            "Attribute 2 value(s)": "Virtual",
+            "Attribute 2 visible": "visible",
+            "Attribute 2 global": 1,
+            "Attribute 3 name": "Time",
+            "Attribute 3 value(s)": parents["Times"],
+            "Attribute 3 visible": "visible",
+            "Attribute 3 global": 1,
+            "Meta: outline": parents["Formatted_Agenda"],
+            "Meta: days": parents[dur],
+            "Meta: location": "Virtual",
+            "Meta: overview": parents["Target Audience"],
+            "Meta: objectives": parents["Formatted_Objectives"],
+            "Meta: prerequisites": parents["Formatted_Prerequisites"],
+            "Meta: agenda": parents["Formatted_Agenda"],
+        }
+    )
+    child = pd.DataFrame(
+        {
+            "Type": "variation, virtual",
+            "SKU": dsorted[sid].astype(str).str.strip(),
+            "Name": dsorted["Course Name"],
+            "Published": 1,
+            "Visibility in catalog": "visible",
+            "Short description": dsorted["Short_Description"],
+            "Description": dsorted["Condensed_Description"],
+            "Tax status": "taxable",
+            "In stock?": 1,
+            "Stock": 1,
+            "Sold individually?": 1,
+            "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
+            "Categories": "courses",
+            "Images": dsorted["Vendor"].map(logos).fillna(""),
+            "Parent": dsorted["Course ID"],
+            "Brands": dsorted["Vendor"],
+            "Attribute 1 name": "Date",
+            "Attribute 1 value(s)": dsorted["Date_fmt"],
+            "Attribute 1 visible": "visible",
+            "Attribute 1 global": 1,
+            "Attribute 2 name": "Location",
+            "Attribute 2 value(s)": "Virtual",
+            "Attribute 2 visible": "visible",
+            "Attribute 2 global": 1,
+            "Attribute 3 name": "Time",
+            "Attribute 3 value(s)": dsorted.apply(
+                lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
+                axis=1,
+            ),
+            "Attribute 3 visible": "visible",
+            "Attribute 3 global": 1,
+            "Meta: outline": dsorted["Formatted_Agenda"],
+            "Meta: days": dsorted[dur],
+            "Meta: location": "Virtual",
+            "Meta: overview": dsorted["Target Audience"],
+            "Meta: objectives": dsorted["Formatted_Objectives"],
+            "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
+            "Meta: agenda": dsorted["Formatted_Agenda"],
+        }
+    )
     all_rows = pd.concat([parent, child], ignore_index=True)
     order = [
+        "Type",
+        "SKU",
+        "Name",
+        "Published",
+        "Visibility in catalog",
+        "Short description",
+        "Description",
+        "Tax status",
+        "In stock?",
+        "Stock",
+        "Sold individually?",
+        "Regular price",
+        "Categories",
+        "Images",
+        "Parent",
+        "Brands",
+        "Attribute 1 name",
+        "Attribute 1 value(s)",
+        "Attribute 1 visible",
+        "Attribute 1 global",
+        "Attribute 2 name",
+        "Attribute 2 value(s)",
+        "Attribute 2 visible",
+        "Attribute 2 global",
+        "Attribute 3 name",
+        "Attribute 3 value(s)",
+        "Attribute 3 visible",
+        "Attribute 3 global",
+        "Meta: outline",
+        "Meta: days",
+        "Meta: location",
+        "Meta: overview",
+        "Meta: objectives",
+        "Meta: prerequisites",
+        "Meta: agenda",
     ]
     out = BytesIO()
 ui = gr.Interface(
     fn=process_file,
+    inputs=gr.File(
+        label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]
+    ),
     outputs=gr.File(label="Download WooCommerce CSV"),
     title="NetCom → WooCommerce CSV Processor (Try 2)",
     description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",