Spaces:

codys12
/

NetCom-to-WooComerce

Runtime error

File size: 14,227 Bytes

"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel + de-dupe, pandas fix)
======================================================================================
*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*

New since the last paste
------------------------
* Fix for older pandas: move `include_groups=False` from `.groupby()` to `.apply()`.
* Everything else (cache names, concurrency cap, in-flight de-duplication) is unchanged.
"""

from __future__ import annotations

import asyncio
import hashlib
import json
import os
import tempfile
from io import BytesIO
from pathlib import Path

import gradio as gr
import gradio_client.utils
import openai
import pandas as pd

# -------- Gradio bool-schema hot-patch --------------------------------------
_original = gradio_client.utils._json_schema_to_python_type

def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
    if isinstance(schema, bool):
        return "any"
    return _original(schema, defs)

gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore

# -------- Tiny disk cache ----------------------------------------------------
CACHE_DIR = Path("ai_response_cache")
CACHE_DIR.mkdir(exist_ok=True)

def _cache_path(p: str) -> Path:
    return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"

def _get_cached(p: str) -> str | None:
    try:
        return json.loads(_cache_path(p).read_text("utf-8"))["response"]
    except Exception:
        return None

def _set_cache(p: str, r: str) -> None:
    try:
        _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
    except Exception:
        pass

# -------- Async GPT helpers --------------------------------------------------
_SEM = asyncio.Semaphore(100)              # ≤100 concurrent OpenAI calls
_inflight: dict[str, asyncio.Future] = {}  # prompt → Future

async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
    """Single LLM call with disk cache, concurrency cap, and de-duplication."""
    cached = _get_cached(prompt)
    if cached is not None:
        return cached

    # De-duplicate identical prompts already in flight
    running = _inflight.get(prompt)
    if running is not None:
        return await running

    loop = asyncio.get_running_loop()

    async def _call_api() -> str:
        async with _SEM:  # concurrency limiter
            try:
                msg = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                text = msg.choices[0].message.content
            except Exception as exc:
                text = f"Error: {exc}"
            _set_cache(prompt, text)
            return text

    task = loop.create_task(_call_api())
    _inflight[prompt] = task
    try:
        return await task
    finally:
        _inflight.pop(prompt, None)

async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
    """Vectorised helper — returns an output list matching *lst* length."""
    out: list[str] = ["" for _ in lst]
    idx, prompts = [], []
    for i, txt in enumerate(lst):
        if isinstance(txt, str) and txt.strip():
            idx.append(i)
            prompts.append(f"{instruction}\n\nText: {txt}")
    if not prompts:
        return out

    responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
    for j, val in enumerate(responses):
        out[idx[j]] = val
    return out

# -------- Core converter -----------------------------------------------------
DEFAULT_PREREQ = (
    "No specific prerequisites are required for this course. Basic computer literacy and "
    "familiarity with fundamental concepts in the subject area are recommended for the best "
    "learning experience."
)

def _read(path: str) -> pd.DataFrame:
    if path.lower().endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    return pd.read_csv(path, encoding="latin1")

async def _enrich_dataframe(
    df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    """Run all LLM batches concurrently and return the five enrichment columns."""
    async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
        sdesc, ldesc, fobj, fout = await asyncio.gather(
            _batch_async(
                df.get(dcol, "").fillna("").tolist(),
                "Create a concise 250-character summary of this course description:",
                client,
            ),
            _batch_async(
                df.get(dcol, "").fillna("").tolist(),
                "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:",
                client,
            ),
            _batch_async(
                df.get(ocol, "").fillna("").tolist(),
                "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
            _batch_async(
                df.get(acol, "").fillna("").tolist(),
                "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':",
                client,
            ),
        )

        # Prerequisites (some rows empty → default text)
        prereq_raw = df.get(pcol, "").fillna("").tolist()
        fpre: list[str] = []
        for req in prereq_raw:
            if not str(req).strip():
                fpre.append(DEFAULT_PREREQ)
            else:
                formatted = await _batch_async(
                    [req],
                    "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
                    client,
                )
                fpre.append(formatted[0])

    return sdesc, ldesc, fobj, fout, fpre

def convert(path: str) -> BytesIO:
    logos = {
        "Amazon Web Services": "/wp-content/uploads/2025/04/aws.png",
        "Cisco": "/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
        "Microsoft": "/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
        "Google Cloud": "/wp-content/uploads/2025/04/Google_Cloud.png",
        "EC Council": "/wp-content/uploads/2025/04/Ec_Council.png",
        "ITIL": "/wp-content/uploads/2025/04/ITIL.webp",
        "PMI": "/wp-content/uploads/2025/04/PMI.png",
        "Comptia": "/wp-content/uploads/2025/04/Comptia.png",
        "Autodesk": "/wp-content/uploads/2025/04/autodesk.png",
        "ISC2": "/wp-content/uploads/2025/04/ISC2.png",
        "AICerts": "/wp-content/uploads/2025/04/aicerts-logo-1.png",
    }

    df = _read(path)
    df.columns = df.columns.str.strip()

    first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)

    dcol = first_col("Description", "Decription")
    ocol = first_col("Objectives", "objectives")
    pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
    acol = first_col("Outline")
    dur = first_col("Duration") or "Duration"
    sid = first_col("Course SID", "ï»¿Course SID")

    if dur not in df.columns:
        df[dur] = ""  # ensure Duration column exists

    # ---------- LLM enrichment (async) -------------------------------------
    sdesc, ldesc, fobj, fout, fpre = asyncio.run(
        _enrich_dataframe(df, dcol, ocol, pcol, acol)
    )

    df["Short_Description"] = sdesc
    df["Condensed_Description"] = ldesc
    df["Formatted_Objectives"] = fobj
    df["Formatted_Agenda"] = fout
    df["Formatted_Prerequisites"] = fpre

    # ---------- Schedule aggregation --------------------------------------
    df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
    df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")

    dsorted = df.sort_values(["Course ID", "Course Start Date"])
    d_agg = (
        dsorted.groupby("Course ID")["Date_fmt"]
        .apply(lambda s: ",".join(s.dropna().unique()))
        .reset_index(name="Dates")
    )

    t_agg = (
        dsorted.groupby("Course ID", group_keys=False)
        .apply(
            lambda g: ",".join(
                f"{st}-{et} {tz}"
                for st, et, tz in zip(
                    g["Course Start Time"], g["Course End Time"], g["Time Zone"]
                )
            ),
            include_groups=False,  # <- moved here
        )
        .reset_index(name="Times")
    )

    parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)

    # ---------- Parent / child product rows --------------------------------
    parent = pd.DataFrame(
        {
            "Type": "variable",
            "SKU": parents["Course ID"],
            "Name": parents["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": parents["Short_Description"],
            "Description": parents["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": parents["Vendor"].map(logos).fillna(""),
            "Parent": "",
            "Brands": parents["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": parents["Dates"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": parents["Times"],
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": parents["Formatted_Agenda"],
            "Meta: days": parents[dur],
            "Meta: location": "Virtual",
            "Meta: overview": parents["Target Audience"],
            "Meta: objectives": parents["Formatted_Objectives"],
            "Meta: prerequisites": parents["Formatted_Prerequisites"],
            "Meta: agenda": parents["Formatted_Agenda"],
        }
    )

    child = pd.DataFrame(
        {
            "Type": "variation, virtual",
            "SKU": dsorted[sid].astype(str).str.strip(),
            "Name": dsorted["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": dsorted["Short_Description"],
            "Description": dsorted["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": dsorted["Vendor"].map(logos).fillna(""),
            "Parent": dsorted["Course ID"],
            "Brands": dsorted["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": dsorted["Date_fmt"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": dsorted.apply(
                lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
                axis=1,
            ),
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": dsorted["Formatted_Agenda"],
            "Meta: days": dsorted[dur],
            "Meta: location": "Virtual",
            "Meta: overview": dsorted["Target Audience"],
            "Meta: objectives": dsorted["Formatted_Objectives"],
            "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
            "Meta: agenda": dsorted["Formatted_Agenda"],
        }
    )

    all_rows = pd.concat([parent, child], ignore_index=True)
    order = [
        "Type",
        "SKU",
        "Name",
        "Published",
        "Visibility in catalog",
        "Short description",
        "Description",
        "Tax status",
        "In stock?",
        "Stock",
        "Sold individually?",
        "Regular price",
        "Categories",
        "Images",
        "Parent",
        "Brands",
        "Attribute 1 name",
        "Attribute 1 value(s)",
        "Attribute 1 visible",
        "Attribute 1 global",
        "Attribute 2 name",
        "Attribute 2 value(s)",
        "Attribute 2 visible",
        "Attribute 2 global",
        "Attribute 3 name",
        "Attribute 3 value(s)",
        "Attribute 3 visible",
        "Attribute 3 global",
        "Meta: outline",
        "Meta: days",
        "Meta: location",
        "Meta: overview",
        "Meta: objectives",
        "Meta: prerequisites",
        "Meta: agenda",
    ]

    out = BytesIO()
    all_rows[order].to_csv(out, index=False, encoding="utf-8-sig")
    out.seek(0)
    return out

# -------- Gradio wrappers ----------------------------------------------------

def process_file(upload: gr.File) -> str:
    csv_bytes = convert(upload.name)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(csv_bytes.getvalue())
        path = tmp.name
    return path

ui = gr.Interface(
    fn=process_file,
    inputs=gr.File(
        label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]
    ),
    outputs=gr.File(label="Download WooCommerce CSV"),
    title="NetCom → WooCommerce CSV Processor (Try 2)",
    description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
    analytics_enabled=False,
)

if __name__ == "__main__":
    if not os.getenv("OPENAI_API_KEY"):
        print("⚠️  OPENAI_API_KEY not set – AI features will error")
    ui.launch()