shukdevdattaEX committed on
Commit
fd142b1
·
verified ·
1 Parent(s): e119bf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -39
app.py CHANGED
@@ -1,61 +1,307 @@
 
 
1
  import json
 
2
  import gradio as gr
 
3
  from together import Together
4
 
5
- # Load your JSON file
6
- with open("sultanbr_innovativeskills.json", "r", encoding="utf-8") as f:
7
- json_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Flatten JSON into a string context
10
- context = json.dumps(json_data, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- # Chat function
13
- def chat_with_json(api_key, user_message, history):
14
- if not api_key:
15
- return history + [[user_message, "⚠️ Please enter your Together API key first."]]
16
-
 
17
  try:
18
- client = Together(api_key=api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Construct the system + user prompt
21
- prompt = f"""You are an assistant that answers questions based on the following JSON data:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  {context}
23
 
24
- User question: {user_message}
25
- Answer clearly using only the relevant JSON information.
26
- """
27
 
28
- response = client.chat.completions.create(
29
- model="lgai/exaone-3-5-32b-instruct",
30
- messages=[{"role": "user", "content": prompt}]
31
- )
32
 
33
- bot_reply = response.choices[0].message.content
34
- history.append([user_message, bot_reply])
35
- return history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- except Exception as e:
38
- return history + [[user_message, f"⚠️ Error: {str(e)}"]]
 
 
39
 
40
- # Build Gradio UI
41
- with gr.Blocks() as demo:
42
- gr.Markdown("## πŸ“š JSON Chatbot (Powered by Together API)")
43
- api_key = gr.Textbox(label="Enter Together API Key", type="password")
44
- chatbot = gr.Chatbot()
45
- msg = gr.Textbox(label="Ask something...")
46
- clear = gr.Button("Clear Chat")
47
 
48
- state = gr.State([])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- def respond(user_message, chat_history, api_key):
51
- return chat_with_json(api_key, user_message, chat_history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- msg.submit(respond, [msg, state, api_key], state, queue=False).then(
54
- lambda h: (h, ""), state, [chatbot, msg]
 
 
55
  )
56
 
57
- clear.click(lambda: [], None, state).then(lambda: [], None, chatbot)
 
 
58
 
59
- # Launch app
60
  if __name__ == "__main__":
61
  demo.launch()
 
1
+ import os
2
+ import re
3
  import json
4
+ import math
5
  import gradio as gr
6
+ from typing import List, Dict, Any, Tuple
7
  from together import Together
8
 
9
+ # -----------------------------
10
+ # Tolerant JSON loader (fixes your error)
11
+ # -----------------------------
12
+ def _remove_trailing_commas(s: str) -> str:
13
+ """Remove trailing commas before ] or } when not inside strings."""
14
+ out = []
15
+ in_str = False
16
+ esc = False
17
+ for i, ch in enumerate(s):
18
+ if in_str:
19
+ out.append(ch)
20
+ if esc:
21
+ esc = False
22
+ elif ch == '\\':
23
+ esc = True
24
+ elif ch == '"':
25
+ in_str = False
26
+ continue
27
+ else:
28
+ if ch == '"':
29
+ in_str = True
30
+ out.append(ch)
31
+ continue
32
+ if ch == ',':
33
+ j = i + 1
34
+ while j < len(s) and s[j] in ' \t\r\n':
35
+ j += 1
36
+ if j < len(s) and s[j] in ']}':
37
+ # skip this comma
38
+ continue
39
+ out.append(ch)
40
+ return ''.join(out)
41
 
42
+ def _extract_json_objects(text: str) -> List[str]:
43
+ """Extract top-level JSON objects by balancing curly braces, ignoring braces inside strings."""
44
+ objs = []
45
+ in_str = False
46
+ esc = False
47
+ brace_depth = 0
48
+ start = None
49
+ for i, ch in enumerate(text):
50
+ if in_str:
51
+ if esc:
52
+ esc = False
53
+ elif ch == '\\':
54
+ esc = True
55
+ elif ch == '"':
56
+ in_str = False
57
+ else:
58
+ if ch == '"':
59
+ in_str = True
60
+ elif ch == '{':
61
+ if brace_depth == 0:
62
+ start = i
63
+ brace_depth += 1
64
+ elif ch == '}':
65
+ if brace_depth > 0:
66
+ brace_depth -= 1
67
+ if brace_depth == 0 and start is not None:
68
+ objs.append(text[start:i+1])
69
+ start = None
70
+ return objs
71
 
72
def safe_load_phpmyadmin_like_json(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parse a possibly-malformed JSON export.

    Strategy: strict json.loads first; on failure, retry with trailing
    commas stripped; as a last resort, pull out the individual top-level
    objects and parse whichever ones are valid, returning them as a list
    (header + tables, etc.).
    """
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        pass

    # Try again after removing trailing commas globally.
    try:
        return json.loads(_remove_trailing_commas(raw_text))
    except json.JSONDecodeError:
        pass

    # Last resort: parse object-by-object and combine into one list.
    parsed = []
    for chunk in _extract_json_objects(raw_text):
        try:
            parsed.append(json.loads(_remove_trailing_commas(chunk)))
        except json.JSONDecodeError:
            # Unrecoverable chunk — skip it rather than failing the whole load.
            continue
    return parsed
98
 
99
# -----------------------------
# Build a retriever-friendly corpus
# -----------------------------
def flatten_json_to_corpus(docs: List[Dict[str, Any]], max_value_len: int = 500) -> List[Dict[str, Any]]:
    """
    Turn the exported structure into small searchable text chunks.

    For each row of a "table" object, emit a passage of the form:
        [table=name idx=i] key=value ; key=value ; ...
    Non-table entries (export headers, etc.) are kept as one truncated
    JSON dump so the metadata remains searchable.

    Args:
        docs: Parsed top-level objects from the export.
        max_value_len: Truncation limit applied to each field value.

    Returns:
        List of {"table": str, "idx": int, "text": str} passages.
    """
    corpus = []
    for obj in docs:
        # Robustness fix: the lenient loader can yield non-dict top-level
        # entries (lists, scalars); stringify them as meta passages instead
        # of crashing on .get() below.
        if not isinstance(obj, dict):
            corpus.append({"table": "meta", "idx": -1,
                           "text": json.dumps(obj, ensure_ascii=False)[:2000]})
            continue
        otype = obj.get("type")
        if otype == "table":
            tname = obj.get("name", "unknown_table")
            rows = obj.get("data", [])
            if isinstance(rows, list):
                for i, row in enumerate(rows):
                    if isinstance(row, dict):
                        parts = []
                        for k, v in row.items():
                            val = str(v)
                            if len(val) > max_value_len:
                                val = val[:max_value_len] + "…"
                            parts.append(f"{k}={val}")
                        text = f"[table={tname} idx={i}] " + " ; ".join(parts)
                        corpus.append({"table": tname, "idx": i, "text": text})
        else:
            # Non-table entries (headers, etc.) — keep a small representation.
            text = json.dumps(obj, ensure_ascii=False)[:2000]
            corpus.append({"table": otype or "meta", "idx": -1, "text": text})
    return corpus
129
+
130
+ # -----------------------------
131
+ # Super-simple keyword retriever
132
+ # -----------------------------
133
+ def _tokenize(s: str) -> List[str]:
134
+ return re.findall(r"[A-Za-z0-9_]+", s.lower())
135
+
136
def score_doc(query: str, doc_text: str) -> float:
    """
    Very light scorer: term overlap + a tiny BM25-ish adjustment by doc length.
    """
    query_terms = set(re.findall(r"[A-Za-z0-9_]+", query.lower()))
    doc_terms = re.findall(r"[A-Za-z0-9_]+", doc_text.lower())
    if not doc_terms:
        return 0.0
    # Count how many document tokens appear in the query, then damp by
    # document length so long rows don't dominate.
    hits = sum(t in query_terms for t in doc_terms)
    return hits / math.log2(len(doc_terms) + 2)
148
+
149
def retrieve_top_k(query: str, corpus: List[Dict[str, Any]], k: int = 10, per_table_cap: int = 5) -> List[Dict[str, Any]]:
    """Return up to *k* corpus passages most relevant to *query*,
    taking at most *per_table_cap* passages from any single table so
    one table cannot flood the context.
    """
    # Rank every passage by its keyword-overlap score (stable sort keeps
    # input order for ties).
    ranked = sorted(
        ((score_doc(query, passage["text"]), passage) for passage in corpus),
        key=lambda pair: pair[0],
        reverse=True,
    )
    picked = []
    taken_per_table = {}
    for score, passage in ranked:
        if len(picked) >= k:
            break
        if score <= 0:
            continue
        table = passage.get("table", "unknown")
        already = taken_per_table.get(table, 0)
        if already >= per_table_cap:
            continue
        picked.append(passage)
        taken_per_table[table] = already + 1
    # If nothing scored positive, still return a few items for context.
    if not picked:
        picked = [passage for _, passage in ranked[:k]]
    return picked
170
+
171
# -----------------------------
# Compose prompt for Together model
# -----------------------------
def build_prompt(query: str, passages: List[Dict[str, Any]]) -> str:
    """Assemble the grounded-QA prompt from the user query and the
    retrieved JSON-derived passages (joined by blank lines)."""
    context = "\n\n".join(p["text"] for p in passages)
    return f"""You are a strict JSON-knowledge assistant. Answer ONLY using the provided context from the JSON export.
If the answer is not present, say you could not find it in the JSON.

# User question
{query}

# Context (JSON-derived snippets)
{context}

# Instructions
- Cite table names and ids if helpful (e.g., table=admission_acceptance_lists idx=12).
- Do not invent any data that is not in the context."""
 
 
 
193
 
194
# -----------------------------
# Together client helper
# -----------------------------
def call_together(api_key: str, prompt: str) -> str:
    """Send *prompt* to the Together chat-completions API and return the
    model's reply text, or a warning string when no key was supplied."""
    key = (api_key or "").strip()
    if not key:
        return "⚠️ Please enter your Together API key."
    # Export the key to the environment as well so any SDK internals that
    # read TOGETHER_API_KEY pick it up everywhere.
    os.environ["TOGETHER_API_KEY"] = key
    client = Together(api_key=key)
    response = client.chat.completions.create(
        model="lgai/exaone-3-5-32b-instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    return response.choices[0].message.content
209
+
210
# -----------------------------
# Gradio App
# -----------------------------
# Layout + event wiring for the whole UI. Kept as one script block because
# the nested callbacks close over the components defined here.
with gr.Blocks(title="JSON Chatbot (Together)") as demo:
    gr.Markdown("## 📚 JSON Chatbot on Your Dump (Together Exaone 3.5 32B)\nUpload your JSON export and ask questions. The app safely loads imperfect JSON and retrieves the most relevant rows to answer your query.")

    with gr.Row():
        api_key_tb = gr.Textbox(label="Together API Key", type="password", placeholder="Paste your TOGETHER_API_KEY here")
        topk_slider = gr.Slider(3, 20, value=10, step=1, label="Top-K JSON Passages")

    with gr.Row():
        json_file = gr.File(label="Upload JSON export (e.g., phpMyAdmin export)", file_count="single", file_types=[".json"])
        fallback_path = gr.Textbox(label="Or fixed path on disk (optional)", placeholder="e.g., sultanbr_innovativeskills.json")

    with gr.Accordion("Advanced", open=False):
        per_table_cap = gr.Slider(1, 10, value=5, step=1, label="Max passages per table")
        max_val_len = gr.Slider(100, 2000, value=500, step=50, label="Max value length per field (truncation)")

    status = gr.Markdown("")
    chatbot = gr.Chatbot(height=420)
    user_box = gr.Textbox(label="Ask something about the JSON...", placeholder="e.g., What are the admission criteria?")
    clear_btn = gr.Button("Clear", variant="secondary")

    # States (server-side, per-session)
    state_corpus = gr.State([])  # list of {"table","idx","text"}
    state_docs = gr.State([])  # raw list of parsed json objects

    def load_json_to_corpus(file_obj, path_text, max_value_len):
        """
        Load JSON from uploaded file (preferred) or from a disk path (fallback).
        Build corpus for retrieval. Returns (status_text, corpus, docs).
        """
        try:
            if file_obj is not None:
                # NOTE(review): relies on gradio's File giving a .name temp path —
                # true for current gradio versions; confirm on upgrade.
                with open(file_obj.name, "r", encoding="utf-8", errors="replace") as f:
                    raw = f.read()
            else:
                p = (path_text or "").strip()
                if not p:
                    return ("⚠️ Please upload a JSON file or provide a valid path.", [], [])
                with open(p, "r", encoding="utf-8", errors="replace") as f:
                    raw = f.read()

            docs = safe_load_phpmyadmin_like_json(raw)

            if not isinstance(docs, list):
                # Some exports might be a single object — normalize to list
                docs = [docs]

            # Slider values arrive as floats; coerce before use.
            corpus = flatten_json_to_corpus(docs, max_value_len=int(max_value_len))

            return (f"✅ Loaded {len(docs)} top-level objects; built {len(corpus)} passages.", corpus, docs)

        except Exception as e:
            # Surface the error in the status panel instead of crashing the UI.
            return (f"❌ Load error: {e}", [], [])

    def ask(api_key, query, history, corpus, k, cap):
        # Guard: need a loaded corpus and a non-empty question before calling the API.
        if not corpus:
            return history + [[query, "⚠️ Please upload/load the JSON first."]]
        if not query or not query.strip():
            return history + [["", "⚠️ Please enter a question."]]

        # Retrieve relevant snippets and build the grounded prompt.
        top_passages = retrieve_top_k(query, corpus, k=int(k), per_table_cap=int(cap))
        prompt = build_prompt(query, top_passages)

        try:
            answer = call_together(api_key, prompt)
        except Exception as e:
            answer = f"❌ API error: {e}"

        # Chatbot expects a list of [user, bot] pairs.
        history = history + [[query, answer]]
        return history

    # Wire events: (re)build the corpus on upload or when the path changes.
    json_file.upload(
        load_json_to_corpus,
        inputs=[json_file, fallback_path, max_val_len],
        outputs=[status, state_corpus, state_docs],
    )
    fallback_path.change(
        load_json_to_corpus,
        inputs=[json_file, fallback_path, max_val_len],
        outputs=[status, state_corpus, state_docs],
    )

    # Submit a question on Enter.
    user_box.submit(
        ask,
        inputs=[api_key_tb, user_box, chatbot, state_corpus, topk_slider, per_table_cap],
        outputs=[chatbot],
    )

    # Reset chat, question box and status message.
    clear_btn.click(lambda: ([], "", "🔄 Ready. Upload JSON or set a path, then ask a question."),
                    inputs=[],
                    outputs=[chatbot, user_box, status])
305
 
 
306
# Launch the Gradio server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()