codys12 committed on
Commit
f4e036a
·
verified ·
1 Parent(s): f302c17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -30
app.py CHANGED
@@ -1,13 +1,13 @@
1
- """NetCom → WooCommerce transformer (Try 2 schema — cleaned async)
2
- =============================================================
3
  *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
4
 
5
- Changes vs Try 1
6
- ----------------
7
- * Use **one** event‑loop via `asyncio.run()` — no manual `new_event_loop()` / `loop.close()` gymnastics.
8
- * **One** shared `openai.AsyncOpenAI` client, properly closed with an `async with` block.
9
- * Fixed pandas future‑warning by adding `include_groups=False`.
10
- * Same Gradio interface, caching, and JSON‑schema hot‑patch as before.
11
  """
12
 
13
  from __future__ import annotations
@@ -25,7 +25,7 @@ import gradio_client.utils
25
  import openai
26
  import pandas as pd
27
 
28
- # -------- Gradio bool-schema hot-patch --------------------------------------
29
  _original = gradio_client.utils._json_schema_to_python_type
30
 
31
  def _fixed_json_schema_to_python_type(schema, defs=None): # type: ignore
@@ -55,24 +55,42 @@ def _set_cache(p: str, r: str) -> None:
55
  pass
56
 
57
  # -------- Async GPT helpers --------------------------------------------------
 
 
 
58
  async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
59
- """Single LLM call with on-disk response cache."""
60
  cached = _get_cached(prompt)
61
  if cached is not None:
62
  return cached
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  try:
65
- msg = await client.chat.completions.create(
66
- model="gpt-4o-mini",
67
- messages=[{"role": "user", "content": prompt}],
68
- temperature=0,
69
- )
70
- text = msg.choices[0].message.content
71
- except Exception as exc: # network or auth failure ‑ return explicit error string
72
- text = f"Error: {exc}"
73
-
74
- _set_cache(prompt, text)
75
- return text
76
 
77
  async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
78
  """Vectorised helper — returns an output list matching *lst* length."""
@@ -82,11 +100,10 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
82
  if isinstance(txt, str) and txt.strip():
83
  idx.append(i)
84
  prompts.append(f"{instruction}\n\nText: {txt}")
85
- # Fast‑path: nothing to do
86
- if not prompts:
87
  return out
88
 
89
- # Fire off all prompts concurrently
90
  responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
91
  for j, val in enumerate(responses):
92
  out[idx[j]] = val
@@ -107,7 +124,6 @@ def _read(path: str) -> pd.DataFrame:
107
  async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
108
  """Run all LLM batches concurrently and return the five enrichment columns."""
109
  async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
110
- # 1) Descriptions and objectives/agenda batches
111
  sdesc, ldesc, fobj, fout = await asyncio.gather(
112
  _batch_async(df.get(dcol, "").fillna("").tolist(),
113
  "Create a concise 250-character summary of this course description:", client),
@@ -118,7 +134,7 @@ async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, a
118
  _batch_async(df.get(acol, "").fillna("").tolist(),
119
  "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
120
  )
121
- # 2) Prerequisites batch (some rows may be empty → DEFAULT_PREREQ)
122
  prereq_raw = df.get(pcol, "").fillna("").tolist()
123
  fpre: list[str] = []
124
  for req in prereq_raw:
@@ -150,7 +166,6 @@ def convert(path: str) -> BytesIO:
150
  df = _read(path)
151
  df.columns = df.columns.str.strip()
152
 
153
- # Helper to locate first existing column name from a list of candidates
154
  first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)
155
 
156
  dcol = first_col("Description", "Decription")
@@ -161,7 +176,7 @@ def convert(path: str) -> BytesIO:
161
  sid = first_col("Course SID", "Course SID")
162
 
163
  if dur not in df.columns:
164
- df[dur] = "" # create empty Duration col if missing
165
 
166
  # ---------- LLM enrichment (async) -------------------------------------
167
  sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
@@ -297,8 +312,8 @@ ui = gr.Interface(
297
  fn=process_file,
298
  inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
299
  outputs=gr.File(label="Download WooCommerce CSV"),
300
- title="NetCom → WooCommerce CSV Processor (Try 2)",
301
- description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2‑formatted WooCommerce CSV.",
302
  analytics_enabled=False,
303
  )
304
 
 
1
+ """NetCom → WooCommerce transformer (Try 2 schema — cleaned async, 100-parallel + de-dupe)
2
+ =========================================================================================
3
  *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
4
 
5
+ Changes vs previous Try 2
6
+ -------------------------
7
+ * Cap OpenAI concurrency to **100** with an `asyncio.Semaphore`.
8
+ * Prevent duplicate prompts from hitting the API in parallel with an
9
+ in-flight `dict[str, asyncio.Future]`.
10
+ * Everything else (cache filenames, interface, outputs) stays the same.
11
  """
12
 
13
  from __future__ import annotations
 
25
  import openai
26
  import pandas as pd
27
 
28
+ # -------- Gradio bool-schema hot-patch --------------------------------------
29
  _original = gradio_client.utils._json_schema_to_python_type
30
 
31
  def _fixed_json_schema_to_python_type(schema, defs=None): # type: ignore
 
55
  pass
56
 
57
  # -------- Async GPT helpers --------------------------------------------------
58
+ _SEM = asyncio.Semaphore(100) # ≤100 concurrent calls
59
+ _inflight: dict[str, asyncio.Future] = {} # prompt → Future
60
+
61
  async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
62
+ """Single LLM call with on-disk cache, concurrency cap, and de-duplication."""
63
  cached = _get_cached(prompt)
64
  if cached is not None:
65
  return cached
66
 
67
+ # — de-dup identical prompts already in flight —
68
+ running = _inflight.get(prompt)
69
+ if running is not None:
70
+ return await running # reuse the same Future
71
+
72
+ loop = asyncio.get_running_loop()
73
+
74
+ async def _call_api() -> str:
75
+ async with _SEM: # concurrency limiter
76
+ try:
77
+ msg = await client.chat.completions.create(
78
+ model="gpt-4o-mini",
79
+ messages=[{"role": "user", "content": prompt}],
80
+ temperature=0,
81
+ )
82
+ text = msg.choices[0].message.content
83
+ except Exception as exc: # network/auth errors
84
+ text = f"Error: {exc}"
85
+ _set_cache(prompt, text)
86
+ return text
87
+
88
+ task = loop.create_task(_call_api())
89
+ _inflight[prompt] = task # register
90
  try:
91
+ return await task
92
+ finally:
93
+ _inflight.pop(prompt, None) # clean up even on error
 
 
 
 
 
 
 
 
94
 
95
  async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
96
  """Vectorised helper — returns an output list matching *lst* length."""
 
100
  if isinstance(txt, str) and txt.strip():
101
  idx.append(i)
102
  prompts.append(f"{instruction}\n\nText: {txt}")
103
+ if not prompts: # fast-path: all rows empty
 
104
  return out
105
 
106
+ # Gather duplicate prompts handled inside _gpt_async
107
  responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
108
  for j, val in enumerate(responses):
109
  out[idx[j]] = val
 
124
  async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
125
  """Run all LLM batches concurrently and return the five enrichment columns."""
126
  async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
 
127
  sdesc, ldesc, fobj, fout = await asyncio.gather(
128
  _batch_async(df.get(dcol, "").fillna("").tolist(),
129
  "Create a concise 250-character summary of this course description:", client),
 
134
  _batch_async(df.get(acol, "").fillna("").tolist(),
135
  "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
136
  )
137
+ # Prerequisites (some rows empty → default)
138
  prereq_raw = df.get(pcol, "").fillna("").tolist()
139
  fpre: list[str] = []
140
  for req in prereq_raw:
 
166
  df = _read(path)
167
  df.columns = df.columns.str.strip()
168
 
 
169
  first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)
170
 
171
  dcol = first_col("Description", "Decription")
 
176
  sid = first_col("Course SID", "Course SID")
177
 
178
  if dur not in df.columns:
179
+ df[dur] = "" # ensure Duration col exists
180
 
181
  # ---------- LLM enrichment (async) -------------------------------------
182
  sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
 
312
  fn=process_file,
313
  inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
314
  outputs=gr.File(label="Download WooCommerce CSV"),
315
+ title="NetCom → WooCommerce CSV Processor (Try 2)",
316
+ description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
317
  analytics_enabled=False,
318
  )
319