Spaces:

kaizen9
/

server

Running

server / app.py

6220bad 16 days ago

4.45 kB

	import gradio as gr
	from huggingface_hub import InferenceClient
	from typing import List, Dict, Optional

	# Root URL of the Inference Endpoint, deliberately WITHOUT a trailing `/v1`:
	# the client appends the OpenAI-style chat route itself for chat completions,
	# while the plain text-generation fallback posts to this URL as-is and
	# therefore needs the endpoint root (NOTE(review): confirm against the
	# huggingface_hub version pinned for this Space).
	ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud"
	def _build_messages(
	    user_msg: str,
	    history: Optional[List[Dict[str, str]]],
	    system_message: str,
	) -> List[Dict[str, str]]:
	    """Assemble the OpenAI-style message list: system prompt, prior turns, new user turn."""
	    messages = []
	    if system_message:
	        messages.append({"role": "system", "content": system_message})
	    # Forward only role/content: history entries handed over by the UI may
	    # carry extra keys that a chat-completions server is not guaranteed to accept.
	    messages.extend(
	        {"role": turn.get("role", "user"), "content": turn.get("content", "")}
	        for turn in history or []
	    )
	    messages.append({"role": "user", "content": user_msg})
	    return messages


	def _to_prompt(msgs: List[Dict[str, str]]) -> str:
	    """Flatten chat messages into a tagged plain-text prompt ending with an open assistant turn."""
	    tags = {"system": "SYSTEM", "user": "USER"}
	    lines = []
	    for m in msgs:
	        # Any role other than system/user is rendered as an assistant turn.
	        tag = tags.get(m.get("role", "user"), "ASSISTANT")
	        lines.append(f"[{tag}] {m.get('content', '')}")
	    lines.append("[ASSISTANT]")
	    return "\n".join(lines)


	def _piece_text(item) -> str:
	    """Return the text carried by one streamed text-generation item.

	    Accepts a plain string, an object exposing `.token`, and a token that is
	    either a dict with a "text" key or an object with a `.text` attribute.
	    """
	    piece = getattr(item, "token", item)
	    if isinstance(piece, dict):
	        piece = piece.get("text", piece)
	    else:
	        piece = getattr(piece, "text", piece)
	    return str(piece)


	def respond(
	    user_msg: str,
	    history: List[Dict[str, str]],
	    system_message: str,
	    max_tokens: int,
	    temperature: float,
	    top_p: float,
	    hf_token: Optional[gr.OAuthToken],  # from LoginButton (kept)
	    pat_override: str,  # user-pasted PAT (password field)
	):
	    """Stream a chat reply from the Inference Endpoint.

	    Use the PAT override if provided; otherwise fall back to the LoginButton token.
	    NOTE: the OAuth token from LoginButton usually lacks
	    `inference.endpoints.infer.write`, so for Inference Endpoints you almost
	    always need to paste a PAT.

	    Args:
	        user_msg: The new user message.
	        history: Previous turns as role/content dicts (may be empty or None).
	        system_message: Optional system prompt; skipped when empty.
	        max_tokens: Upper bound on newly generated tokens.
	        temperature: Sampling temperature forwarded to the endpoint.
	        top_p: Nucleus-sampling value forwarded to the endpoint.
	        hf_token: Login token object, or None when the user is not logged in.
	        pat_override: Pasted personal access token; blank or None means "not given".

	    Yields:
	        The reply accumulated so far (each yield replaces the previous one), or
	        a single explanatory message when no token is available or both the
	        chat route and the text-generation fallback fail.
	    """
	    # Prefer the user-supplied PAT; tolerate None from an unset field.
	    token = (pat_override or "").strip() or getattr(hf_token, "token", None)
	    if not token:
	        yield "🔒 Please click Login OR paste a Hugging Face PAT with `inference.endpoints.infer.write`."
	        return

	    client = InferenceClient(base_url=ENDPOINT_URL, token=token)
	    messages = _build_messages(user_msg, history, system_message)

	    # Try the OpenAI-compatible chat route first: /v1/chat/completions
	    try:
	        out = ""
	        for chunk in client.chat_completion(
	            messages=messages,
	            max_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p,
	            stream=True,
	        ):
	            choices = getattr(chunk, "choices", None)
	            # Chunks without choices or without a delta contribute no text.
	            if choices and getattr(choices[0], "delta", None):
	                out += choices[0].delta.content or ""
	            yield out
	        return
	    except Exception as e_chat:
	        # Remember the reason so it can be reported if the fallback fails too.
	        chat_err = str(e_chat)

	    # Fallback to plain generation (for non-OpenAI runtimes)
	    try:
	        out = ""
	        for tok in client.text_generation(
	            _to_prompt(messages),
	            max_new_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p,
	            stream=True,
	            return_full_text=False,
	        ):
	            out += _piece_text(tok)
	            yield out
	    except Exception as e_gen:
	        yield (
	            "❗ Endpoint call failed.\n\n"
	            f"• Chat API error: {chat_err}\n"
	            f"• Text-generation fallback error: {e_gen}\n\n"
	            "Most likely cause: the token used does NOT have `inference.endpoints.infer.write`.\n"
	            "Paste a PAT with that scope in the HF PAT field."
	        )

	# --- UI ---
	# Chat front end driven by `respond`. The extra controls below are listed in
	# the same order as respond's parameters after the message and history:
	# system message, max new tokens, temperature, top-p, PAT override.
	# (`hf_token` is not in this list; presumably the framework supplies it from
	# the login session based on its annotation — verify against the Gradio docs.)
	chat = gr.ChatInterface(
	    fn=respond,
	    type="messages",
	    additional_inputs=[
	        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
	        gr.Slider(minimum=1, maximum=4096, step=1, value=512, label="Max new tokens"),
	        gr.Slider(minimum=0.0, maximum=4.0, step=0.1, value=0.7, label="Temperature"),
	        gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.95, label="Top-p"),
	        # Masked input for a user-pasted personal access token.
	        gr.Textbox(
	            label="HF PAT (with `inference.endpoints.infer.write`)",
	            type="password",
	            value="",
	        ),
	    ],
	)

	# Page layout: a sidebar with the login button and usage notes, and the chat
	# interface rendered in the main area.
	# NOTE(review): the original nesting was lost in extraction; the endpoint
	# label is assumed to belong to the sidebar — confirm.
	with gr.Blocks() as demo:
	    with gr.Sidebar():
	        gr.Markdown(value="### Hugging Face Login (optional)")
	        gr.LoginButton()
	        gr.Markdown(
	            value=(
	                "Important: Inference Endpoints require a PAT with\n"
	                "`inference.endpoints.infer.write`. The Login token usually does not have this.\n"
	                "Paste a PAT in the password field if you see 403 errors."
	            )
	        )
	        # Show which endpoint the app talks to.
	        gr.Markdown(value=f"Endpoint: `{ENDPOINT_URL}`")
	    # Place the previously constructed chat interface inside this page.
	    chat.render()

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()