3v324v23 committed on
Commit 477e431 · 1 Parent(s): af954c4
Files changed (1)
  1. app.py +39 -40
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-from typing import List, Dict
+from typing import List, Dict, Optional
 
+# Your endpoint root (no trailing /v1 here; the client adds it for chat)
 ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"
 
 def respond(
@@ -11,27 +12,31 @@ def respond(
     max_tokens: int,
     temperature: float,
     top_p: float,
-    hf_token: gr.OAuthToken,  # <-- LoginButton injects this
+    hf_token: Optional[gr.OAuthToken],  # from LoginButton (kept)
+    pat_override: str,  # NEW: user-pasted PAT (password field)
 ):
-    # 0) Make sure user actually clicked "Login"
-    if hf_token is None or not getattr(hf_token, "token", None):
-        yield "🔒 Please click **Login** (left sidebar) to authorize Hugging Face access."
+    """
+    Use PAT override if provided; otherwise fall back to LoginButton token.
+    NOTE: OAuth token from LoginButton usually lacks `inference.endpoints.infer.write`,
+    so for Inference Endpoints you almost always need to paste a PAT here.
+    """
+
+    # Choose a token: prefer user-supplied PAT with endpoints write scope
+    token = pat_override.strip() or (getattr(hf_token, "token", None) if hf_token else None)
+    if not token:
+        yield "🔒 Please click **Login** OR paste a **Hugging Face PAT** with `inference.endpoints.infer.write`."
         return
 
-    # 1) Create client against your endpoint (not model=)
-    client = InferenceClient(
-        base_url=ENDPOINT_URL,
-        token=hf_token.token,  # <-- PAT from Login flow
-    )
+    client = InferenceClient(base_url=ENDPOINT_URL, token=token)
 
-    # 2) Build messages for chat APIs
+    # Build messages (OpenAI-style)
     messages = []
     if system_message:
         messages.append({"role": "system", "content": system_message})
     messages.extend(history or [])
     messages.append({"role": "user", "content": user_msg})
 
-    # 3) Try OpenAI-style /v1/chat if your endpoint supports it
+    # Try OpenAI-compatible chat route first: /v1/chat/completions
     try:
         out = ""
         for chunk in client.chat_completion(
@@ -41,16 +46,16 @@ def respond(
             top_p=top_p,
             stream=True,
         ):
-            token = ""
+            tok = ""
             if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
-                token = chunk.choices[0].delta.content or ""
-            out += token
+                tok = chunk.choices[0].delta.content or ""
+            out += tok
             yield out
         return
-    except Exception as chat_err:
-        chat_err_msg = str(chat_err)
+    except Exception as e_chat:
+        chat_err = str(e_chat)
 
-    # 4) Fallback to plain text-generation (works on vanilla TGI endpoints)
+    # Fallback to plain generation (for non-OpenAI runtimes)
     try:
         def to_prompt(msgs: List[Dict[str, str]]) -> str:
             lines = []
@@ -59,11 +64,10 @@ def respond(
                 content = m.get("content", "")
                 tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
                 lines.append(f"[{tag}] {content}")
-            lines.append("[ASSISTANT]")  # cue the model to speak
+            lines.append("[ASSISTANT]")
             return "\n".join(lines)
 
         prompt = to_prompt(messages)
-
         out = ""
         for tok in client.text_generation(
             prompt,
@@ -78,22 +82,14 @@ def respond(
                 piece = piece["text"]
             out += str(piece)
             yield out
-
-    except Exception as gen_err:
-        # 5) Clear, helpful errors for auth/permissions/runtime
-        err_text = f"""❗ Failed to query the endpoint.
-
-• Chat API error: {chat_err_msg}
-• Text-generation fallback error: {gen_err}
-
-Quick checks:
-1) You clicked **Login** and authorized this app.
-2) Your HF token includes `inference.endpoints.infer.write`.
-3) The endpoint is running and supports either OpenAI chat or TGI generation.
-Endpoint: {ENDPOINT_URL}
-"""
-        yield err_text
-
+    except Exception as e_gen:
+        yield (
+            "❗ Endpoint call failed.\n\n"
+            f"• Chat API error: {chat_err}\n"
+            f"• Text-generation fallback error: {e_gen}\n\n"
+            "Most likely cause: the token used does NOT have `inference.endpoints.infer.write`.\n"
+            "Paste a PAT with that scope in the sidebar."
+        )
 
 # --- UI ---
 chat = gr.ChatInterface(
@@ -104,16 +100,19 @@ chat = gr.ChatInterface(
         gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
         gr.Slider(0.0, 4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
+        # NEW: secure PAT override
+        gr.Textbox(value="", label="HF PAT (with `inference.endpoints.infer.write`)", type="password"),
     ],
 )
 
 with gr.Blocks() as demo:
     with gr.Sidebar():
-        gr.Markdown("### Hugging Face Login")
-        gr.LoginButton()  # <-- keep this
+        gr.Markdown("### Hugging Face Login (optional)")
+        gr.LoginButton()
         gr.Markdown(
-            "- Make sure your token has **`inference.endpoints.infer.write`**.\n"
-            "- This app will use your HF token only to call the endpoint."
+            "**Important:** Inference Endpoints require a PAT with\n"
+            "`inference.endpoints.infer.write`. The Login token usually does **not** have this.\n"
+            "Paste a PAT in the password field if you see 403 errors."
        )
         gr.Markdown(f"**Endpoint**: `{ENDPOINT_URL}`")
     chat.render()
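
Taken together, the new flow is: pick a token (pasted PAT first, LoginButton OAuth token as fallback), point `InferenceClient` at the endpoint root, then stream from `chat_completion`. A minimal smoke test of that path outside Gradio, assuming a PAT with `inference.endpoints.infer.write` exported as `HF_PAT` (the environment-variable name is our own, not part of the commit):

import os
from huggingface_hub import InferenceClient

ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"

# Same client construction as respond(): endpoint root as base_url, PAT as token.
client = InferenceClient(base_url=ENDPOINT_URL, token=os.environ["HF_PAT"])

out = ""
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    stream=True,
):
    # Same defensive delta handling as the app's streaming loop.
    if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
        out += chunk.choices[0].delta.content or ""
print(out)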
 
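For the `text_generation` fallback, the flattened prompt that `to_prompt()` builds looks like this; a self-contained illustration of the `[TAG]` format from the diff (the sample messages are made up):

msgs = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "What does this endpoint serve?"},
]

lines = []
for m in msgs:
    tag = {"system": "SYSTEM", "user": "USER"}.get(m.get("role", "user"), "ASSISTANT")
    lines.append(f"[{tag}] {m.get('content', '')}")
lines.append("[ASSISTANT]")  # trailing cue so the model answers as the assistant

print("\n".join(lines))
# Output:
# [SYSTEM] You are concise.
# [USER] What does this endpoint serve?
# [ASSISTANT]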