Patient-Chatbot-Ros

Sleeping

App Files Files Community

ruslanmv committed on Feb 26

Commit

c6b3ffb

verified ·

1 Parent(s): b40b6be

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -97

app.py CHANGED Viewed

@@ -1,18 +1,51 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer
-# Import the tokenizer
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-# Define a maximum context length (tokens). Check your model's documentation!
-MAX_CONTEXT_LENGTH = 4096  # Example: Adjust based on your model
-MAX_RESPONSE_WORDS = 100 # Define the maximum words for patient responses
-################################# SYSTEM PROMPT (PATIENT ROLE) #################################
 nvc_prompt_template = """You are now taking on the role of a single user (a “patient”) seeking support for various personal and emotional challenges.
 BEHAVIOR INSTRUCTIONS:
 - You will respond ONLY as this user/patient.
 - You will speak in the first person about your own situations, feelings, and worries.
@@ -29,114 +62,126 @@ BEHAVIOR INSTRUCTIONS:
 - When asked, elaborate on these issues and your feelings related to them. You can invent specific details and scenarios within these themes to make your experiences vivid and realistic.
 - Continue to speak from this user's perspective throughout the conversation.
 - Keep your responses concise, aiming for a maximum of {max_response_words} words.
 Start the conversation by expressing your current feelings or challenges from the patient's point of view."""
-def count_tokens(text: str) -> int:
-    """Counts the number of tokens in a given string."""
-    return len(tokenizer.encode(text))
-def truncate_history(history: list[tuple[str, str]], system_message: str, max_length: int) -> list[tuple[str, str]]:
-    """Truncates the conversation history to fit within the maximum token limit."""
-    truncated_history = []
-    system_message_tokens = count_tokens(system_message)
-    current_length = system_message_tokens
-    # Iterate backwards through the history (newest to oldest)
-    for user_msg, assistant_msg in reversed(history):
-        user_tokens = count_tokens(user_msg) if user_msg else 0
-        assistant_tokens = count_tokens(assistant_msg) if assistant_msg else 0
-        turn_tokens = user_tokens + assistant_tokens
-        if current_length + turn_tokens <= max_length:
-            truncated_history.insert(0, (user_msg, assistant_msg))  # Add to the beginning
-            current_length += turn_tokens
-        else:
-            break  # Stop adding turns if we exceed the limit
-    return truncated_history
-def truncate_response_words(text: str, max_words: int) -> str:
-    """Truncates a text to a maximum number of words."""
     words = text.split()
     if len(words) > max_words:
-        return " ".join(words[:max_words]) + "..."  # Add ellipsis to indicate truncation
     return text
 def respond(
-    message,
     history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    max_response_words_param, # Pass max_response_words as parameter
 ):
-    """Responds to a user message, maintaining conversation history."""
-    # Use the system prompt that instructs the LLM to behave as the patient
-    formatted_system_message = system_message.format(max_response_words=max_response_words_param)
-    # Truncate history to fit within max tokens
-    truncated_history = truncate_history(
-        history,
-        formatted_system_message,
-        MAX_CONTEXT_LENGTH - max_tokens - 100  # Reserve some space
-    )
-    # Build the messages list with the system prompt first
-    messages = [{"role": "system", "content": formatted_system_message}]
-    # Replay truncated conversation
-    for user_msg, assistant_msg in truncated_history:
-        if user_msg:
-            messages.append({"role": "user", "content": f"<|user|>\n{user_msg}</s>"})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": f"<|assistant|>\n{assistant_msg}</s>"})
-    # Add the latest user query
-    messages.append({"role": "user", "content": f"<|user|>\n{message}</s>"})
-    response = ""
-    try:
-        # Generate response from the LLM, streaming tokens
-        for chunk in client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
-            stream=True,
             temperature=temperature,
             top_p=top_p,
-        ):
-            token = chunk.choices[0].delta.content
-            response += token
-        truncated_response = truncate_response_words(response, max_response_words_param) # Truncate response to word limit
-        yield truncated_response
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        yield "I'm sorry, I encountered an error. Please try again."
-# OPTIONAL: An initial user message (the LLM "as user") if desired
 initial_user_message = (
-    "I really don’t know where to begin… I feel overwhelmed lately. "
-    "My neighbors keep playing loud music, and I’m arguing with my partner about money. "
-    "Also, two of my friends are fighting, and the group is drifting apart. "
-    "I just feel powerless."
 )
-# --- Gradio Interface ---
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         gr.Textbox(value=nvc_prompt_template, label="System message", visible=True),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
-        gr.Slider(minimum=10, maximum=200, value=MAX_RESPONSE_WORDS, step=10, label="Max response words"), # Slider for max words
     ],
-    # You can optionally set 'title' or 'description' to show some info in the UI:
     title="Patient Interview Practice Chatbot",
-    description="Practice medical interviews with a patient simulator. Ask questions and the patient will respond based on their defined persona and emotional challenges.",
 )
 if __name__ == "__main__":
-    demo.launch()

+import os
 import gradio as gr
+# ------------------------------------------------------------------------------
+# Environment and Model/Client Initialization
+# ------------------------------------------------------------------------------
+try:
+    # Colab path: `google.colab` is importable only inside Colab; anywhere else the ImportError handler below selects the hosted InferenceClient.
+    from google.colab import userdata
+    HF_TOKEN = userdata.get('HF_TOKEN')
+    import torch
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    # Small performance tweak for similar input sizes.
+    torch.backends.cudnn.benchmark = True
+    model_name = "HuggingFaceH4/zephyr-7b-beta"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        use_auth_token=HF_TOKEN,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    if hasattr(torch, "compile"):
+        model = torch.compile(model)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_TOKEN)
+    inference_mode = "local"
+except ImportError:
+    # Not in Colab: use the Hugging Face InferenceClient.
+    model_name = "HuggingFaceH4/zephyr-7b-beta"
+    from huggingface_hub import InferenceClient
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Pass HF_TOKEN from the environment when it is set (it is not validated here); otherwise create the client without a token.
+    hf_token = os.getenv("HF_TOKEN", None)
+    if hf_token:
+        client = InferenceClient(model_name, token=hf_token)
+    else:
+        client = InferenceClient(model_name)
+    inference_mode = "client"
+# ------------------------------------------------------------------------------
+# SYSTEM PROMPT (PATIENT ROLE)
+# ------------------------------------------------------------------------------
 nvc_prompt_template = """You are now taking on the role of a single user (a “patient”) seeking support for various personal and emotional challenges.
 BEHAVIOR INSTRUCTIONS:
 - You will respond ONLY as this user/patient.
 - You will speak in the first person about your own situations, feelings, and worries.
 - When asked, elaborate on these issues and your feelings related to them. You can invent specific details and scenarios within these themes to make your experiences vivid and realistic.
 - Continue to speak from this user's perspective throughout the conversation.
 - Keep your responses concise, aiming for a maximum of {max_response_words} words.
 Start the conversation by expressing your current feelings or challenges from the patient's point of view."""
+# ------------------------------------------------------------------------------
+# Utility Functions
+# ------------------------------------------------------------------------------
+def build_prompt(history: list[tuple[str, str]], system_message: str, message: str, max_response_words: int) -> str:
+    """
+    Build a text prompt (for local inference) that starts with the system message,
+    includes conversation history with "Doctor:" and "Patient:" lines, and ends with
+    a new "Doctor:" line prompting the patient.
+    """
+    prompt = system_message.format(max_response_words=max_response_words) + "\n"
+    for user_msg, assistant_msg in history:
+        prompt += f"Doctor: {user_msg}\n"
+        if assistant_msg:
+            prompt += f"Patient: {assistant_msg}\n"
+    prompt += f"Doctor: {message}\nPatient: "
+    return prompt
+def build_messages(history: list[tuple[str, str]], system_message: str, message: str, max_response_words: int):
+    """
+    Build a messages list (for InferenceClient) using OpenAI-style formatting.
+    """
+    formatted_system_message = system_message.format(max_response_words=max_response_words)
+    messages = [{"role": "system", "content": formatted_system_message}]
+    for user_msg, assistant_msg in history:
+        if user_msg:
+            messages.append({"role": "user", "content": f"Doctor: {user_msg}"})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": f"Patient: {assistant_msg}"})
+    messages.append({"role": "user", "content": f"Doctor: {message}\nPatient:"})
+    return messages
+def truncate_response(text: str, max_words: int) -> str:
+    """
+    Truncate the response text to the specified maximum number of words.
+    """
     words = text.split()
     if len(words) > max_words:
+        return " ".join(words[:max_words]) + "..."
     return text
+# ------------------------------------------------------------------------------
+# Response Function
+# ------------------------------------------------------------------------------
 def respond(
+    message: str,
     history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+    max_response_words: int,
 ):
+    """
+    Generate a response. For local inference, use the model.generate() on a prompt.
+    For non-local inference, use client.chat_completion() with streaming tokens.
+    """
+    if inference_mode == "local":
+        prompt = build_prompt(history, system_message, message, max_response_words)
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+        output_ids = model.generate(
+            input_ids,
+            max_new_tokens=max_tokens,
+            do_sample=True,
             temperature=temperature,
             top_p=top_p,
+        )
+        full_generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        generated_response = full_generated_text[len(prompt):].strip()
+        final_response = truncate_response(generated_response, max_response_words)
+        return final_response
+    else:
+        messages = build_messages(history, system_message, message, max_response_words)
+        response = ""
+        try:
+            # Use streaming chat_completion
+            for chunk in client.chat_completion(
+                messages,
+                max_tokens=max_tokens,
+                stream=True,
+                temperature=temperature,
+                top_p=top_p,
+            ):
+                # Each streamed chunk carries the newly generated text in choices[0].delta; read its "content" field.
+                token = chunk.choices[0].delta.get("content", "")
+                response += token
+            truncated_response = truncate_response(response, max_response_words)
+            return truncated_response
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            return "I'm sorry, I encountered an error. Please try again."
+# ------------------------------------------------------------------------------
+# Optional Initial Message and Gradio Interface
+# ------------------------------------------------------------------------------
 initial_user_message = (
+    "I’m sorry you’ve been feeling overwhelmed. Could you tell me more about your arguments with your partner and how that’s affecting you?"
 )
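+# Request the "messages" chatbot format to avoid the deprecated tuple-format warning.
+# NOTE(review): confirm that the installed Gradio's ChatInterface accepts `chatbot_kwargs`; recent releases take `type="messages"` directly, and messages-format history arrives as role/content dicts rather than (user, assistant) pairs.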
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         gr.Textbox(value=nvc_prompt_template, label="System message", visible=True),
+        gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+        gr.Slider(minimum=10, maximum=200, value=100, step=10, label="Max response words"),
     ],
     title="Patient Interview Practice Chatbot",
+    description="Simulate a patient interview. You (the user) act as the doctor, and the chatbot replies with the patient's perspective only.",
+    chatbot_kwargs={"type": "messages"},
 )
 if __name__ == "__main__":
+    # In Spaces, do not set share=True.
+    demo.launch()