codys12 committed
Commit cf087b0 · verified · 1 Parent(s): db70732

Update app.py

Files changed (1)
1. app.py +38 -19
app.py CHANGED
@@ -1,11 +1,12 @@
-"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel + de-dupe, pandas fix)
-======================================================================================
+"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel, de-dupe, pandas-fix)
+=====================================================================================
 *Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
 
-New since the last paste
-------------------------
-* Fix for older pandas: move `include_groups=False` from `.groupby()` to `.apply()`.
-* Everything else (cache names, concurrency cap, in-flight de-duplication) is unchanged.
+Latest tweak
+------------
+**Logo map** now contains both `"Amazon Web Services"` *and* `"AWS"` keys
+so either value in the *Vendor* column resolves to the same upload path.
+(Everything else is untouched.)
 """
 
 from __future__ import annotations
@@ -26,51 +27,60 @@ import pandas as pd
 # -------- Gradio bool-schema hot-patch --------------------------------------
 _original = gradio_client.utils._json_schema_to_python_type
 
+
 def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
     if isinstance(schema, bool):
         return "any"
     return _original(schema, defs)
 
-gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
+
+gradio_client.utils._json_schema_to_python_type = (  # type: ignore
+    _fixed_json_schema_to_python_type
+)
 
 # -------- Tiny disk cache ----------------------------------------------------
 CACHE_DIR = Path("ai_response_cache")
 CACHE_DIR.mkdir(exist_ok=True)
 
+
 def _cache_path(p: str) -> Path:
     return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"
 
+
 def _get_cached(p: str) -> str | None:
     try:
         return json.loads(_cache_path(p).read_text("utf-8"))["response"]
     except Exception:
         return None
 
+
 def _set_cache(p: str, r: str) -> None:
     try:
         _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
     except Exception:
         pass
 
+
 # -------- Async GPT helpers --------------------------------------------------
-_SEM = asyncio.Semaphore(100)          # ≤100 concurrent OpenAI calls
+_SEM = asyncio.Semaphore(100)  # ≤100 concurrent OpenAI calls
 _inflight: dict[str, asyncio.Future] = {}  # prompt → Future
 
+
 async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
-    """Single LLM call with disk cache, concurrency cap, and de-duplication."""
+    """Single LLM call with cache, concurrency cap, and de-duplication."""
     cached = _get_cached(prompt)
     if cached is not None:
         return cached
 
-    # De-duplicate identical prompts already in flight
-    running = _inflight.get(prompt)
-    if running is not None:
-        return await running
+    # de-dup identical prompts already in-flight
+    existing = _inflight.get(prompt)
+    if existing is not None:
+        return await existing
 
     loop = asyncio.get_running_loop()
 
     async def _call_api() -> str:
-        async with _SEM:               # concurrency limiter
+        async with _SEM:
             try:
                 msg = await client.chat.completions.create(
                     model="gpt-4o-mini",
@@ -90,7 +100,10 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
     finally:
         _inflight.pop(prompt, None)
 
-async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
+
+async def _batch_async(
+    lst: list[str], instruction: str, client: openai.AsyncOpenAI
+) -> list[str]:
     """Vectorised helper — returns an output list matching *lst* length."""
     out: list[str] = ["" for _ in lst]
     idx, prompts = [], []
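The gather/scatter body of `_batch_async` is also elided; given the visible head here and the tail in the next hunk (`out[idx[j]] = val`), it plausibly reads like this sketch — the skip condition and prompt template are assumptions, and `_gpt_async` is the helper defined in the diff:

    async def batch(lst: list[str], instruction: str, client) -> list[str]:
        out: list[str] = ["" for _ in lst]
        idx, prompts = [], []
        for i, txt in enumerate(lst):      # remember positions of non-empty rows
            if str(txt).strip():
                idx.append(i)
                prompts.append(f"{instruction}\n\n{txt}")
        # fire everything at once; the semaphore inside _gpt_async caps parallelism
        vals = await asyncio.gather(*(_gpt_async(client, p) for p in prompts))
        for j, val in enumerate(vals):     # scatter results back to original slots
            out[idx[j]] = val
        return out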
@@ -106,6 +119,7 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
         out[idx[j]] = val
     return out
 
+
 # -------- Core converter -----------------------------------------------------
 DEFAULT_PREREQ = (
     "No specific prerequisites are required for this course. Basic computer literacy and "
@@ -113,11 +127,13 @@ DEFAULT_PREREQ = (
     "learning experience."
 )
 
+
 def _read(path: str) -> pd.DataFrame:
     if path.lower().endswith((".xlsx", ".xls")):
         return pd.read_excel(path)
     return pd.read_csv(path, encoding="latin1")
 
+
 async def _enrich_dataframe(
     df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
 ) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
@@ -146,7 +162,7 @@ async def _enrich_dataframe(
         ),
     )
 
-    # Prerequisites (some rows empty → default text)
+    # prerequisites
     prereq_raw = df.get(pcol, "").fillna("").tolist()
     fpre: list[str] = []
     for req in prereq_raw:
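The branch inside the `for req in prereq_raw:` loop falls between hunks; from `DEFAULT_PREREQ` and the removed "some rows empty → default text" comment, the likely shape is the sketch below — non-empty rows may additionally be rewritten via the batch helper:

    fpre = []
    for req in prereq_raw:
        if not str(req).strip():      # blank cell → boilerplate default
            fpre.append(DEFAULT_PREREQ)
        else:                         # real prerequisite text → keep (or enrich)
            fpre.append(str(req))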
@@ -162,9 +178,11 @@ async def _enrich_dataframe(
 
     return sdesc, ldesc, fobj, fout, fpre
 
+
 def convert(path: str) -> BytesIO:
     logos = {
         "Amazon Web Services": "/wp-content/uploads/2025/04/aws.png",
+        "AWS": "/wp-content/uploads/2025/04/aws.png",
         "Cisco": "/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
         "Microsoft": "/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
         "Google Cloud": "/wp-content/uploads/2025/04/Google_Cloud.png",
@@ -190,7 +208,7 @@ def convert(path: str) -> BytesIO:
     sid = first_col("Course SID", "Course SID")
 
     if dur not in df.columns:
-        df[dur] = ""          # ensure Duration column exists
+        df[dur] = ""
 
     # ---------- LLM enrichment (async) -------------------------------------
     sdesc, ldesc, fobj, fout, fpre = asyncio.run(
@@ -223,7 +241,7 @@ def convert(path: str) -> BytesIO:
                 g["Course Start Time"], g["Course End Time"], g["Time Zone"]
             )
         ),
-        include_groups=False,  # <- moved here
+        include_groups=False,
     )
     .reset_index(name="Times")
 )
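Context for the `include_groups=False` line: it is a keyword of `DataFrameGroupBy.apply` (added in pandas 2.2, where including the grouping columns in the applied frame was deprecated), not of `groupby()` itself. A minimal illustration of the pattern, with made-up data:

    import pandas as pd

    df = pd.DataFrame({
        "Course ID": ["A", "A", "B"],
        "Start": ["09:00", "13:00", "10:00"],
    })

    times = (
        df.groupby("Course ID")
        .apply(lambda g: ",".join(g["Start"]), include_groups=False)  # pandas ≥ 2.2
        .reset_index(name="Times")
    )
    print(times)  # one comma-joined Times string per Course ID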
@@ -358,8 +376,8 @@ def convert(path: str) -> BytesIO:
     out.seek(0)
     return out
 
-# -------- Gradio wrappers ----------------------------------------------------
 
+# -------- Gradio wrappers ----------------------------------------------------
 def process_file(upload: gr.File) -> str:
     csv_bytes = convert(upload.name)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
@@ -367,6 +385,7 @@ def process_file(upload: gr.File) -> str:
         path = tmp.name
     return path
 
+
 ui = gr.Interface(
     fn=process_file,
     inputs=gr.File(