3v324v23 committed
Commit d081499 · 1 Parent(s): ff382d3
Files changed (1)
  1. app.py +111 -22
app.py CHANGED
@@ -1,52 +1,141 @@
- import os
  import gradio as gr
  from huggingface_hub import InferenceClient

  ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"

- def respond(message, history, system_message, max_tokens, temperature, top_p, hf_pat):
-     token = hf_pat or os.getenv("HUGGINGFACEHUB_API_TOKEN")
-     if not token:
-         yield "Missing token. Paste a Hugging Face PAT with endpoint write access."
-         return
-
-     client = InferenceClient(model=ENDPOINT_URL, token=token)
-
-     messages = [{"role": "system", "content": system_message}, *history,
-                 {"role": "user", "content": message}]
-
      try:
-         out = ""
          for chunk in client.chat_completion(
              messages=messages,
              max_tokens=max_tokens,
-             stream=True,
              temperature=temperature,
              top_p=top_p,
          ):
-             delta = chunk.choices[0].delta.content if chunk.choices and chunk.choices[0].delta else ""
-             if delta:
-                 out += delta
-                 yield out
      except Exception as e:
-         yield f"Request failed: {e}\n\nCheck that your token has `inference.endpoints.infer.write` and access to this endpoint."

  chatbot = gr.ChatInterface(
      respond,
-     type="messages",
      additional_inputs=[
          gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(1, 2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
-         gr.Textbox(lines=1, type="password", placeholder="hf_... token", label="HF PAT (with endpoint write)"),
      ],
  )

  with gr.Blocks() as demo:
      chatbot.render()

  if __name__ == "__main__":
-     # Or export HUGGINGFACEHUB_API_TOKEN in your shell and leave the textbox empty.
      demo.launch()
 
  import gradio as gr
  from huggingface_hub import InferenceClient
+ from typing import List, Dict

  ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"

+ def respond(
+     user_msg: str,
+     history: List[Dict[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+     hf_token: gr.OAuthToken,
+ ):
+     """
+     Streams chat responses from a Hugging Face Inference Endpoint.
+
+     Notes:
+     - Requires your endpoint to allow inference with your token (permission:
+       `inference.endpoints.infer.write`).
+     - If the endpoint doesn't support OpenAI-style /v1/chat (e.g., plain TGI),
+       we fall back to a single-prompt `.text_generation()` call using a simple
+       prompt format built from the chat history.
+     """
+     # 1) Client that talks directly to your endpoint
+     client = InferenceClient(
+         base_url=ENDPOINT_URL,
+         token=hf_token.token,  # uses the OAuth token from the LoginButton
+     )
+
+     # 2) Build OpenAI-style messages for chat backends
+     messages = []
+     if system_message:
+         messages.append({"role": "system", "content": system_message})
+
+     # Gradio passes `history` as a list of {"role": ..., "content": ...} dicts
+     # when type="messages". Append previous turns, then the new user message.
+     messages.extend(history or [])
+     messages.append({"role": "user", "content": user_msg})
+
+     # 3) Try OpenAI-style chat first (works if the endpoint exposes /v1/chat/completions)
      try:
+         response_text = ""
          for chunk in client.chat_completion(
              messages=messages,
              max_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
+             stream=True,
          ):
+             # chunk.choices[0].delta.content is the streamed token (if present)
+             token = ""
+             if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
+                 token = chunk.choices[0].delta.content or ""
+             response_text += token
+             yield response_text
+         return  # success via the chat API
      except Exception as e:
+         # If the chat endpoint isn't available, fall back to text_generation
+         # (common when the endpoint is plain TGI without the OpenAI route enabled).
+         fallback_reason = str(e)
+
+     # 4) Fallback: plain text generation with a simple chat-to-prompt adapter
+     try:
+         def to_plain_prompt(msgs: List[Dict[str, str]]) -> str:
+             lines = []
+             for m in msgs:
+                 role = m.get("role", "user")
+                 content = m.get("content", "")
+                 if role == "system":
+                     lines.append(f"[SYSTEM] {content}")
+                 elif role == "user":
+                     lines.append(f"[USER] {content}")
+                 else:
+                     lines.append(f"[ASSISTANT] {content}")
+             lines.append("[ASSISTANT]")  # cue the model to speak
+             return "\n".join(lines)
+
+         prompt = to_plain_prompt(messages)
+
+         response_text = ""
+         # Stream text_generation tokens if the backend supports it
+         for tok in client.text_generation(
+             prompt,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=True,
+             # Many TGI backends respect these kwargs; safe to include
+             return_full_text=False,
+         ):
+             # `tok` can be a string or an object depending on the server; normalize to str
+             piece = getattr(tok, "token", tok)
+             if isinstance(piece, dict) and "text" in piece:
+                 piece = piece["text"]
+             piece = str(piece)
+             response_text += piece
+             yield response_text
+
+     except Exception as e2:
+         # Surface a readable error in the chat window
+         err = (
+             "Failed to query the endpoint.\n\n"
+             f"- Chat attempt error: {fallback_reason}\n"
+             f"- Text-generation fallback error: {e2}\n\n"
+             "Check that your endpoint is running, your token has "
+             "`inference.endpoints.infer.write`, and the runtime supports either "
+             "OpenAI chat (/v1/chat/completions) or TGI text generation."
+         )
+         yield err
+

+ # --- Gradio UI ---
  chatbot = gr.ChatInterface(
      respond,
+     type="messages",  # history comes as [{"role": ..., "content": ...}]
      additional_inputs=[
          gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+         gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.0, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
      ],
  )

  with gr.Blocks() as demo:
+     with gr.Sidebar():
+         gr.Markdown("### Hugging Face Login")
+         # This provides `hf_token: gr.OAuthToken` to `respond`
+         gr.LoginButton()
+         gr.Markdown(
+             "Make sure your token has **`inference.endpoints.infer.write`** permission."
+         )
+         gr.Markdown(f"**Endpoint**:\n\n`{ENDPOINT_URL}`")
      chatbot.render()

  if __name__ == "__main__":
      demo.launch()
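
A note on the login wiring in the new version: Gradio injects `gr.OAuthToken` (and `gr.OAuthProfile`) into an event handler purely by type annotation once a `gr.LoginButton` is present, which is why `respond` receives the signed-in user's token without any extra plumbing. A minimal sketch of that mechanism, assuming the app runs as a Hugging Face Space with OAuth enabled (before login, the injected parameters are None, which the Optional annotations below handle; app.py's non-Optional `hf_token` assumes the user has already signed in):

import gradio as gr

def whoami(profile: gr.OAuthProfile | None, token: gr.OAuthToken | None) -> str:
    # Gradio fills these parameters by annotation; both are None before login.
    if profile is None or token is None:
        return "Not logged in."
    return f"Logged in as {profile.username} (token prefix: {token.token[:7]}...)"

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Textbox(label="Status")
    gr.Button("Check").click(whoami, inputs=None, outputs=status)

demo.launch()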
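Since the new `respond` tries the OpenAI-style chat route first and only falls back to `text_generation` on failure, it can help to probe which API the endpoint actually exposes. A quick diagnostic sketch, separate from app.py: the `/info` route is standard for TGI servers and `/v1/chat/completions` for OpenAI-compatible ones, but your endpoint may differ, and `hf_...` is a placeholder for a real token:

import requests

ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"
headers = {"Authorization": "Bearer hf_..."}  # token with `inference.endpoints.infer.write`

# TGI exposes model metadata at /info; a 200 here suggests text_generation will work.
print("/info:", requests.get(f"{ENDPOINT_URL}/info", headers=headers).status_code)

# OpenAI-style chat route; a 404 here is exactly the case the fallback handles.
resp = requests.post(
    f"{ENDPOINT_URL}/v1/chat/completions",
    headers=headers,
    json={"messages": [{"role": "user", "content": "ping"}], "max_tokens": 8},
)
print("/v1/chat/completions:", resp.status_code)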