app
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-from typing import List, Dict
+from typing import List, Dict, Optional
 
+# Your endpoint root (no trailing /v1 here; the client adds it for chat)
 ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"
 
 def respond(
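Sanity check (sketch): `base_url` points at the endpoint root, and `InferenceClient` routes chat calls to the OpenAI-compatible `/v1/chat/completions` path under it. This assumes an `HF_TOKEN` environment variable holding a PAT with `inference.endpoints.infer.write`:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    base_url="https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud",
    token=os.environ["HF_TOKEN"],  # assumed: PAT with the endpoint infer scope
)
resp = client.chat_completion(
    messages=[{"role": "user", "content": "ping"}],
    max_tokens=8,
)
print(resp.choices[0].message.content)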
@@ -11,27 +12,31 @@ def respond(
     max_tokens: int,
     temperature: float,
     top_p: float,
-    hf_token: gr.OAuthToken,
+    hf_token: Optional[gr.OAuthToken],  # from LoginButton (kept)
+    pat_override: str,  # NEW: user-pasted PAT (password field)
 ):
-
-    if ...
-    ...
+    """
+    Use PAT override if provided; otherwise fall back to LoginButton token.
+    NOTE: OAuth token from LoginButton usually lacks `inference.endpoints.infer.write`,
+    so for Inference Endpoints you almost always need to paste a PAT here.
+    """
+
+    # Choose a token: prefer user-supplied PAT with endpoints write scope
+    token = pat_override.strip() or (getattr(hf_token, "token", None) if hf_token else None)
+    if not token:
+        yield "🔑 Please click **Login** OR paste a **Hugging Face PAT** with `inference.endpoints.infer.write`."
         return
 
-
-    client = InferenceClient(
-        base_url=ENDPOINT_URL,
-        token=hf_token.token,  # <-- PAT from Login flow
-    )
+    client = InferenceClient(base_url=ENDPOINT_URL, token=token)
 
-    # ...
+    # Build messages (OpenAI-style)
     messages = []
     if system_message:
         messages.append({"role": "system", "content": system_message})
     messages.extend(history or [])
     messages.append({"role": "user", "content": user_msg})
 
-    # ...
+    # Try OpenAI-compatible chat route first: /v1/chat/completions
     try:
         out = ""
         for chunk in client.chat_completion(
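Note (sketch): `messages.extend(history or [])` assumes Gradio delivers `history` as OpenAI-style dicts, i.e. the ChatInterface is presumably configured with `type="messages"` (not visible in these hunks):

# Hypothetical history, in the shape respond() expects:
history = [
    {"role": "user", "content": "What runs on this endpoint?"},
    {"role": "assistant", "content": "A chat model served by TGI."},
]
# After the build step: messages == [system?, *history, {"role": "user", "content": user_msg}]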
@@ -41,16 +46,16 @@ def respond(
             top_p=top_p,
             stream=True,
         ):
-
+            tok = ""
             if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
-                ...
-                out += ...
+                tok = chunk.choices[0].delta.content or ""
+            out += tok
             yield out
         return
-    except Exception as ...:
-        ...
+    except Exception as e_chat:
+        chat_err = str(e_chat)
 
-    # ...
+    # Fallback to plain generation (for non-OpenAI runtimes)
     try:
         def to_prompt(msgs: List[Dict[str, str]]) -> str:
             lines = []
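Note (sketch): hoisting `tok = ""` plus the guard covers role-only or keep-alive chunks whose `delta` carries no text, and `or ""` handles `delta.content is None`. Continuing the sketches above:

out = ""
for chunk in client.chat_completion(messages=history, max_tokens=64, stream=True):
    # some stream chunks carry only a role or nothing; skip those safely
    if chunk.choices and chunk.choices[0].delta:
        out += chunk.choices[0].delta.content or ""
print(out)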
@@ -59,11 +64,10 @@ def respond(
                 content = m.get("content", "")
                 tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
                 lines.append(f"[{tag}] {content}")
-            lines.append("[ASSISTANT]")
+            lines.append("[ASSISTANT]")
             return "\n".join(lines)
 
         prompt = to_prompt(messages)
-
         out = ""
         for tok in client.text_generation(
             prompt,
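For reference, `to_prompt` flattens the chat into a tagged prompt ending with an open assistant turn; with a hypothetical two-message input it produces:

msgs = [
    {"role": "system", "content": "Be concise."},
    {"role": "user", "content": "Hello"},
]
# to_prompt(msgs) returns the string:
# [SYSTEM] Be concise.
# [USER] Hello
# [ASSISTANT]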
@@ -78,22 +82,14 @@ def respond(
                 piece = piece["text"]
             out += str(piece)
             yield out
-
-    except Exception as ...:
-        err_text = f"""...
-        ...
-        Quick checks:
-        1) You clicked **Login** and authorized this app.
-        2) Your HF token includes `inference.endpoints.infer.write`.
-        3) The endpoint is running and supports either OpenAI chat or TGI generation.
-        Endpoint: {ENDPOINT_URL}
-        """
-        yield err_text
-
+    except Exception as e_gen:
+        yield (
+            "❌ Endpoint call failed.\n\n"
+            f"• Chat API error: {chat_err}\n"
+            f"• Text-generation fallback error: {e_gen}\n\n"
+            "Most likely cause: the token used does NOT have `inference.endpoints.infer.write`.\n"
+            "Paste a PAT with that scope in the sidebar."
+        )
 
 # --- UI ---
 chat = gr.ChatInterface(
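Debugging note (sketch): probing the raw chat route is the quickest way to confirm a scope problem independent of this app; a PAT without the endpoint scope typically returns HTTP 401/403 instead of a model reply. Assumes `requests` and the same `HF_TOKEN` as above:

import os
import requests

r = requests.post(
    "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
    json={"messages": [{"role": "user", "content": "ping"}], "max_tokens": 8},
)
print(r.status_code, r.text[:200])  # 401/403 -> token/scope issue; 200 -> model reply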
@@ -104,16 +100,19 @@ chat = gr.ChatInterface(
         gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
         gr.Slider(0.0, 4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
+        # NEW: secure PAT override
+        gr.Textbox(value="", label="HF PAT (with `inference.endpoints.infer.write`)", type="password"),
     ],
 )
 
 with gr.Blocks() as demo:
     with gr.Sidebar():
-        gr.Markdown("### Hugging Face Login")
-        gr.LoginButton()
+        gr.Markdown("### Hugging Face Login (optional)")
+        gr.LoginButton()
         gr.Markdown(
-            "..."
-            "..."
+            "**Important:** Inference Endpoints require a PAT with\n"
+            "`inference.endpoints.infer.write`. The Login token usually does **not** have this.\n"
+            "Paste a PAT in the password field if you see 403 errors."
         )
         gr.Markdown(f"**Endpoint**: `{ENDPOINT_URL}`")
     chat.render()
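Note: Gradio injects `hf_token: gr.OAuthToken` from the login session rather than from the inputs list (typing it `Optional[gr.OAuthToken]` is what lets the app run with nobody logged in), so the `additional_inputs` map positionally onto the remaining parameters and the new password Textbox feeds `pat_override`.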
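Not shown in the diff: the script presumably still ends by launching the Blocks app below the last hunk, along the lines of:

if __name__ == "__main__":
    demo.launch()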