"""NetCom → WooCommerce transformer (Try 3 schema → meta-days calc, sorted attributes, deduped AI sections, persistent cache, 100-parallel,
duplicate-safe, relative-logo paths, cache-preload)
==============================================================================
Accept a NetCom schedule (CSV/XLSX) and **optionally** a *previous* WooCommerce
CSV; output the fresh WooCommerce CSV.

New in this revision
--------------------
* **Meta days** automatically calculated as the inclusive span (in days) between
  the earliest and latest course dates for each Course ID.
* **Attribute 1 (Date)** lists are now guaranteed to be sorted chronologically.
* All AI-generated sections (descriptions, objectives, agenda, prerequisites)
  are post-processed to **deduplicate any repeated lines** inside each section.
* Everything else (persistent cache in `/data`, 100-parallel semaphore,
  in-flight de-duplication, pandas compatibility fix) remains unchanged.
"""
from __future__ import annotations

import asyncio
import hashlib
import json
import os
import tempfile
from io import BytesIO
from pathlib import Path
from typing import List

import gradio as gr
import gradio_client.utils
import openai
import pandas as pd
# ── Gradio bool-schema hot-patch ─────────────────────────────────────────────
_original = gradio_client.utils._json_schema_to_python_type


def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
    if isinstance(schema, bool):
        return "any"
    return _original(schema, defs)


gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
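# Rationale: some gradio_client releases raise when a JSON-schema node is a bare
# boolean (e.g. `additionalProperties: true`); mapping those nodes to "any"
# above keeps endpoint introspection from crashing at startup.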
# ── Persistent disk cache (HF Spaces uses /data) ─────────────────────────────
_PERSISTENT_ROOT = Path("/data")
CACHE_DIR = (_PERSISTENT_ROOT if _PERSISTENT_ROOT.exists() else Path(".")) / "ai_response_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _cache_path(p: str) -> Path:
    return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"


def _get_cached(p: str) -> str | None:
    try:
        return json.loads(_cache_path(p).read_text("utf-8"))["response"]
    except Exception:
        return None


def _set_cache(p: str, r: str) -> None:
    try:
        _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
    except Exception:
        pass
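
# Cache layout: one JSON file per prompt, keyed by the MD5 of the full prompt
# text (ai_response_cache/<md5>.json holds {"prompt": ..., "response": ...}),
# so identical prompts hit the same file across runs and Space restarts.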

# ── Helpers ──────────────────────────────────────────────────────────────────
def _dedup_lines(txt: str) -> str:
    """Remove duplicated lines while preserving order inside a block of text."""
    seen = set()
    out: List[str] = []
    for raw in txt.splitlines():
        line = raw.rstrip()
        if line and line not in seen:
            out.append(line)
            seen.add(line)
    return "\n".join(out)
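
# Example: _dedup_lines("• Intro\n• Labs\n• Intro") -> "• Intro\n• Labs"
# (first occurrence wins; blank lines are dropped along with the repeats).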

# ── OpenAI helpers: 100-parallel + de-dup ────────────────────────────────────
_SEM = asyncio.Semaphore(100)              # ≤100 concurrent OpenAI calls
_inflight: dict[str, asyncio.Future] = {}  # prompt → Future


async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
    cached = _get_cached(prompt)
    if cached is not None:
        return cached

    running = _inflight.get(prompt)
    if running is not None:
        return await running

    loop = asyncio.get_running_loop()

    async def _call_api() -> str:
        async with _SEM:
            try:
                msg = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                text = msg.choices[0].message.content
            except Exception as exc:
                text = f"Error: {exc}"
            _set_cache(prompt, text)
            return text

    task = loop.create_task(_call_api())
    _inflight[prompt] = task
    try:
        return await task
    finally:
        _inflight.pop(prompt, None)
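
# The _inflight map collapses concurrent requests for the same prompt into a
# single task: later callers await the first caller's task, so each distinct
# prompt hits the API at most once per run (the disk cache covers later runs).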

async def _batch_async(lst, instruction: str, client):
    out = ["" for _ in lst]
    idx, prompts = [], []
    for i, txt in enumerate(lst):
        if isinstance(txt, str) and txt.strip():
            idx.append(i)
            prompts.append(f"{instruction}\n\nText: {txt}")
    if not prompts:
        return out
    responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
    for j, val in enumerate(responses):
        out[idx[j]] = _dedup_lines(val)
    return out

# ── Instructions (reused across preload & generation) ────────────────────────
DESC_SHORT = "Create a concise 250-character summary of this course description:"
DESC_LONG = "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:"
OBJECTIVES = "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':"
AGENDA = "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':"
PREREQ = "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':"

# ── Logo map (vendor name → hosted image URL, with common aliases) ───────────
logos = {
    "Amazon Web Services": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
    "AWS": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
    "Cisco": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
    "Microsoft": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
    "Google Cloud": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Google_Cloud.png",
    "EC Council": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Ec_Council.png",
    "ITIL": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ITIL.webp",
    "PMI": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/PMI.png",
    "Comptia": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Comptia.png",
    "Autodesk": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/autodesk.png",
    "ISC2": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ISC2.png",
    "AICerts": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aicerts-logo-1.png",
}
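# "Amazon Web Services" and "AWS" deliberately map to the same image so either
# vendor spelling in the schedule resolves to a logo; vendors missing from this
# map end up with an empty Images cell (see the .fillna("") where it is applied).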

DEFAULT_PREREQ = (
    "No specific prerequisites are required for this course. Basic computer literacy and "
    "familiarity with fundamental concepts in the subject area are recommended for the best "
    "learning experience."
)

# ── Cache-preload from previous WooCommerce CSV ──────────────────────────────
def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
    """Seed the on-disk cache with completions from an earlier WooCommerce CSV."""
    try:
        prev = pd.read_csv(prev_csv, encoding="utf-8-sig")
    except Exception:
        return

    prev_parent = prev[prev["Type"].str.startswith("variable", na=False)]
    prev_map = {row["SKU"]: row for _, row in prev_parent.iterrows()}  # SKU == Course ID

    for _, row in df_new.iterrows():
        cid = row["Course ID"]
        if cid not in prev_map:
            continue
        old = prev_map[cid]
        desc = str(row[dcol])
        obj = str(row[ocol])
        ag = str(row[acol])
        pre = str(row[pcol])
        _set_cache(f"{DESC_SHORT}\n\nText: {desc}", _dedup_lines(old.get("Short description", "")))
        _set_cache(f"{DESC_LONG}\n\nText: {desc}", _dedup_lines(old.get("Description", "")))
        _set_cache(f"{OBJECTIVES}\n\nText: {obj}", _dedup_lines(old.get("Meta: objectives", "")))
        _set_cache(f"{AGENDA}\n\nText: {ag}", _dedup_lines(old.get("Meta: agenda", "")))
        if pre.strip():
            _set_cache(f"{PREREQ}\n\nText: {pre}", _dedup_lines(old.get("Meta: prerequisites", "")))
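
# Note: the seeded keys must match the prompts _batch_async builds verbatim
# (instruction + "\n\nText: " + the raw cell); otherwise the preload never
# produces a cache hit and the course is re-sent to the model.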

# ── Helper: read user file (CSV or Excel) ────────────────────────────────────
def _read(path: str) -> pd.DataFrame:
    if path.lower().endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    return pd.read_csv(path, encoding="latin1")
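
# latin1 decodes every byte value, so schedules exported with stray Windows-1252
# characters still load instead of raising UnicodeDecodeError; at worst a few
# characters come through mangled rather than aborting the run.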

# ── Enrichment step (async batched LLM) ──────────────────────────────────────
async def _enrich_dataframe(df, dcol, ocol, pcol, acol):
    def _col(c):
        # A missing column becomes a list of empty strings instead of crashing
        # (df.get(...) would hand back a plain str, which has no .fillna).
        return df[c].fillna("").tolist() if c in df.columns else [""] * len(df)

    async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
        sdesc, ldesc, fobj, fout = await asyncio.gather(
            _batch_async(_col(dcol), DESC_SHORT, client),
            _batch_async(_col(dcol), DESC_LONG, client),
            _batch_async(_col(ocol), OBJECTIVES, client),
            _batch_async(_col(acol), AGENDA, client),
        )
        fpre = []
        for req in _col(pcol):
            if not str(req).strip():
                fpre.append(DEFAULT_PREREQ)
            else:
                out = await _batch_async([req], PREREQ, client)
                fpre.append(out[0])

    # Ensure everything is deduped (safety).
    sdesc = [_dedup_lines(t) for t in sdesc]
    ldesc = [_dedup_lines(t) for t in ldesc]
    fobj = [_dedup_lines(t) for t in fobj]
    fout = [_dedup_lines(t) for t in fout]
    fpre = [_dedup_lines(t) for t in fpre]
    return sdesc, ldesc, fobj, fout, fpre

# ── Main converter ───────────────────────────────────────────────────────────
def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
    df = _read(schedule_path)
    df.columns = df.columns.str.strip()

    first_col = lambda *cand: next((c for c in cand if c in df.columns), None)
    dcol = first_col("Description", "Decription")
    ocol = first_col("Objectives", "objectives")
    pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
    acol = first_col("Outline")
    dur = first_col("Duration") or "Duration"  # kept for backward-compat (unused)
    sid = first_col("Course SID")

    # optional cache preload
    if prev_csv_path:
        _preload_cache(prev_csv_path, df, dcol, ocol, pcol, acol)

    # async-enrich via LLM
    sdesc, ldesc, fobj, fout, fpre = asyncio.run(
        _enrich_dataframe(df, dcol, ocol, pcol, acol)
    )
    df["Short_Description"] = sdesc
    df["Condensed_Description"] = ldesc
    df["Formatted_Objectives"] = fobj
    df["Formatted_Agenda"] = fout
    df["Formatted_Prerequisites"] = fpre

    # schedule aggregation & meta-days calculation
    df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
    df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
    dsorted = df.sort_values(["Course ID", "Course Start Date"])

    # "MetaDays" = inclusive span between earliest & latest dates per Course ID
    meta_days = (
        dsorted.groupby("Course ID")["Course Start Date"]
        .agg(lambda s: (s.max() - s.min()).days + 1)
        .reset_index(name="MetaDays")
    )
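    # e.g. a Course ID with sessions on 5/1, 5/2 and 5/5 gets MetaDays = 5
    # ((5/5 - 5/1).days + 1): the calendar span, not the number of sessions.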

    # Attribute 1 list: ensure chronological order
    d_agg = (
        dsorted.groupby("Course ID")["Date_fmt"]
        .apply(lambda s: ",".join(sorted(s.dropna().unique(), key=lambda x: pd.to_datetime(x))))
        .reset_index(name="Dates")
    )
    t_agg = (
        dsorted.groupby("Course ID", group_keys=False)
        .apply(
            lambda g: ",".join(
                f"{st}-{et} {tz}"
                for st, et, tz in zip(
                    g["Course Start Time"], g["Course End Time"], g["Time Zone"]
                )
            ),
            include_groups=False,
        )
        .reset_index(name="Times")
    )
    parents = (
        dsorted.drop_duplicates("Course ID")
        .merge(d_agg)
        .merge(t_agg)
        .merge(meta_days)
    )
    # propagate MetaDays to each schedule row
    dsorted = dsorted.merge(meta_days, on="Course ID", how="left")

    parent = pd.DataFrame(
        {
            "Type": "variable",
            "SKU": parents["Course ID"],
            "Name": parents["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": parents["Short_Description"],
            "Description": parents["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": parents["Vendor"].map(logos).fillna(""),
            "Parent": "",
            "Brands": parents["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": parents["Dates"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": parents["Times"],
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": parents["Formatted_Agenda"],
            "Meta: days": parents["MetaDays"],
            "Meta: location": "Virtual",
            "Meta: overview": parents["Target Audience"],
            "Meta: objectives": parents["Formatted_Objectives"],
            "Meta: prerequisites": parents["Formatted_Prerequisites"],
            "Meta: agenda": parents["Formatted_Agenda"],
        }
    )

    child = pd.DataFrame(
        {
            "Type": "variation, virtual",
            "SKU": dsorted[sid].astype(str).str.strip(),
            "Name": dsorted["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": dsorted["Short_Description"],
            "Description": dsorted["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": dsorted["Vendor"].map(logos).fillna(""),
            "Parent": dsorted["Course ID"],
            "Brands": dsorted["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": dsorted["Date_fmt"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": dsorted.apply(
                lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
                axis=1,
            ),
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": dsorted["Formatted_Agenda"],
            "Meta: days": dsorted["MetaDays"],
            "Meta: location": "Virtual",
            "Meta: overview": dsorted["Target Audience"],
            "Meta: objectives": dsorted["Formatted_Objectives"],
            "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
            "Meta: agenda": dsorted["Formatted_Agenda"],
        }
    )

    all_rows = pd.concat([parent, child], ignore_index=True)
    order = [
        "Type","SKU","Name","Published","Visibility in catalog","Short description","Description",
        "Tax status","In stock?","Stock","Sold individually?","Regular price","Categories","Images",
        "Parent","Brands","Attribute 1 name","Attribute 1 value(s)","Attribute 1 visible","Attribute 1 global",
        "Attribute 2 name","Attribute 2 value(s)","Attribute 2 visible","Attribute 2 global",
        "Attribute 3 name","Attribute 3 value(s)","Attribute 3 visible","Attribute 3 global",
        "Meta: outline","Meta: days","Meta: location","Meta: overview","Meta: objectives",
        "Meta: prerequisites","Meta: agenda",
    ]

    out = BytesIO()
    all_rows[order].to_csv(out, index=False, encoding="utf-8-sig")
    out.seek(0)
    return out
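
# Local usage sketch (bypassing Gradio); the filenames below are placeholders
# and OPENAI_API_KEY must be set in the environment:
#   buf = convert("netcom_schedule.xlsx", "previous_woocommerce.csv")
#   Path("woocommerce_import.csv").write_bytes(buf.getvalue())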

# ── Gradio interface ─────────────────────────────────────────────────────────
def process_files(schedule: gr.File, previous: gr.File | None) -> str:
    csv_bytes = convert(schedule.name, previous.name if previous else None)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(csv_bytes.getvalue())
    return tmp.name

ui = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload NetCom schedule (.csv/.xlsx/.xls)", file_types=[".csv", ".xlsx", ".xls"]),
        gr.File(label="Previous WooCommerce CSV (optional)", file_types=[".csv"]),
    ],
    outputs=gr.File(label="Download WooCommerce CSV"),
    title="NetCom → WooCommerce CSV Processor (Try 3)",
    description=(
        "1. Upload the **latest NetCom schedule** file.\n"
        "2. *(Optional)* Upload the **WooCommerce CSV** generated by a previous run to "
        "pre-load the cache and skip already-processed courses."
    ),
    analytics_enabled=False,
)

if __name__ == "__main__":
    if not os.getenv("OPENAI_API_KEY"):
        print("⚠️  OPENAI_API_KEY not set - AI features will error")
    ui.launch()