3v324v23 committed on
Commit 477e431 · 1 Parent(s): af954c4
Files changed (1)
  1. app.py +39 -40
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-from typing import List, Dict
+from typing import List, Dict, Optional
 
+# Your endpoint root (no trailing /v1 here; the client adds it for chat)
 ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"
 
 def respond(
@@ -11,27 +12,31 @@ def respond(
     max_tokens: int,
     temperature: float,
     top_p: float,
-    hf_token: gr.OAuthToken,  # <-- LoginButton injects this
+    hf_token: Optional[gr.OAuthToken],  # from LoginButton (kept)
+    pat_override: str,  # NEW: user-pasted PAT (password field)
 ):
-    # 0) Make sure user actually clicked "Login"
-    if hf_token is None or not getattr(hf_token, "token", None):
-        yield "🔒 Please click **Login** (left sidebar) to authorize Hugging Face access."
+    """
+    Use PAT override if provided; otherwise fall back to LoginButton token.
+    NOTE: OAuth token from LoginButton usually lacks `inference.endpoints.infer.write`,
+    so for Inference Endpoints you almost always need to paste a PAT here.
+    """
+
+    # Choose a token: prefer user-supplied PAT with endpoints write scope
+    token = pat_override.strip() or (getattr(hf_token, "token", None) if hf_token else None)
+    if not token:
+        yield "🔒 Please click **Login** OR paste a **Hugging Face PAT** with `inference.endpoints.infer.write`."
         return
 
-    # 1) Create client against your endpoint (not model=)
-    client = InferenceClient(
-        base_url=ENDPOINT_URL,
-        token=hf_token.token,  # <-- PAT from Login flow
-    )
+    client = InferenceClient(base_url=ENDPOINT_URL, token=token)
 
-    # 2) Build messages for chat APIs
+    # Build messages (OpenAI-style)
     messages = []
     if system_message:
         messages.append({"role": "system", "content": system_message})
     messages.extend(history or [])
     messages.append({"role": "user", "content": user_msg})
 
-    # 3) Try OpenAI-style /v1/chat if your endpoint supports it
+    # Try OpenAI-compatible chat route first: /v1/chat/completions
     try:
         out = ""
         for chunk in client.chat_completion(
@@ -41,16 +46,16 @@ def respond(
             top_p=top_p,
             stream=True,
         ):
-            token = ""
+            tok = ""
             if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
-                token = chunk.choices[0].delta.content or ""
-            out += token
+                tok = chunk.choices[0].delta.content or ""
+            out += tok
             yield out
         return
-    except Exception as chat_err:
-        chat_err_msg = str(chat_err)
+    except Exception as e_chat:
+        chat_err = str(e_chat)
 
-    # 4) Fallback to plain text-generation (works on vanilla TGI endpoints)
+    # Fallback to plain generation (for non-OpenAI runtimes)
     try:
         def to_prompt(msgs: List[Dict[str, str]]) -> str:
             lines = []
@@ -59,11 +64,10 @@ def respond(
                 content = m.get("content", "")
                 tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
                 lines.append(f"[{tag}] {content}")
-            lines.append("[ASSISTANT]")  # cue the model to speak
+            lines.append("[ASSISTANT]")
             return "\n".join(lines)
 
         prompt = to_prompt(messages)
-
         out = ""
         for tok in client.text_generation(
             prompt,
@@ -78,22 +82,14 @@ def respond(
                 piece = piece["text"]
             out += str(piece)
             yield out
-
-    except Exception as gen_err:
-        # 5) Clear, helpful errors for auth/permissions/runtime
-        err_text = f"""❗ Failed to query the endpoint.
-
-• Chat API error: {chat_err_msg}
-• Text-generation fallback error: {gen_err}
-
-Quick checks:
-1) You clicked **Login** and authorized this app.
-2) Your HF token includes `inference.endpoints.infer.write`.
-3) The endpoint is running and supports either OpenAI chat or TGI generation.
-Endpoint: {ENDPOINT_URL}
-"""
-        yield err_text
-
+    except Exception as e_gen:
+        yield (
+            "❗ Endpoint call failed.\n\n"
+            f"• Chat API error: {chat_err}\n"
+            f"• Text-generation fallback error: {e_gen}\n\n"
+            "Most likely cause: the token used does NOT have `inference.endpoints.infer.write`.\n"
+            "Paste a PAT with that scope in the sidebar."
+        )
 
 # --- UI ---
 chat = gr.ChatInterface(
@@ -104,16 +100,19 @@ chat = gr.ChatInterface(
         gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
         gr.Slider(0.0, 4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
+        # NEW: secure PAT override
+        gr.Textbox(value="", label="HF PAT (with `inference.endpoints.infer.write`)", type="password"),
     ],
 )
 
 with gr.Blocks() as demo:
     with gr.Sidebar():
-        gr.Markdown("### Hugging Face Login")
-        gr.LoginButton()  # <-- keep this
+        gr.Markdown("### Hugging Face Login (optional)")
+        gr.LoginButton()
         gr.Markdown(
-            "- Make sure your token has **`inference.endpoints.infer.write`**.\n"
-            "- This app will use your HF token only to call the endpoint."
+            "**Important:** Inference Endpoints require a PAT with\n"
+            "`inference.endpoints.infer.write`. The Login token usually does **not** have this.\n"
+            "Paste a PAT in the password field if you see 403 errors."
        )
         gr.Markdown(f"**Endpoint**: `{ENDPOINT_URL}`")
     chat.render()
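
Taken together, the new flow is: pick a token (pasted PAT first, LoginButton OAuth token as fallback), point `InferenceClient` at the endpoint root, then stream from `chat_completion`. A minimal smoke test of that path outside Gradio, assuming a PAT with `inference.endpoints.infer.write` exported as `HF_PAT` (the environment-variable name is our own, not part of the commit):

import os
from huggingface_hub import InferenceClient

ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"

# Same client construction as respond(): endpoint root as base_url, PAT as token.
client = InferenceClient(base_url=ENDPOINT_URL, token=os.environ["HF_PAT"])

out = ""
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    stream=True,
):
    # Same defensive delta handling as the app's streaming loop.
    if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
        out += chunk.choices[0].delta.content or ""
print(out)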
 
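For the `text_generation` fallback, the flattened prompt that `to_prompt()` builds looks like this; a self-contained illustration of the `[TAG]` format from the diff (the sample messages are made up):

msgs = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "What does this endpoint serve?"},
]

lines = []
for m in msgs:
    tag = {"system": "SYSTEM", "user": "USER"}.get(m.get("role", "user"), "ASSISTANT")
    lines.append(f"[{tag}] {m.get('content', '')}")
lines.append("[ASSISTANT]")  # trailing cue so the model answers as the assistant

print("\n".join(lines))
# Output:
# [SYSTEM] You are concise.
# [USER] What does this endpoint serve?
# [ASSISTANT]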