Spaces:

Lyte
/

Atlas-Chat-2B-Demo

Sleeping

App Files Files Community

Lyte committed on Sep 30, 2024

Commit

fce1fce

verified ·

1 Parent(s): 709df81

Create app.py

Browse files

Files changed (1) hide show

app.py +95 -0

app.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import gradio as gr
+import copy
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+# Initialize Llama model from Hugging Face
+llm = Llama(
+    model_path=hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "mradermacher/Atlas-Chat-9B-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "Atlas-Chat-9B.Q4_K_M.gguf"),
+    ),
+    n_ctx=4096,
+    n_gpu_layers=-1,
+)
+# Training prompt format for Atlas-Chat style conversation
+training_prompt = """<bos><start_of_turn>user
+{}<end_of_turn>
+<start_of_turn>model
+{}<end_of_turn>"""
+EOS_TOKEN = "<end_of_turn>"
+# Function to generate the text response based on conversation history
+def generate_text(
+    message,
+    history: list[tuple[str, str]],
+    max_tokens,
+    temperature,
+    top_p,
+):
+    temp = ""
+    input_prompt = ""
+    # Loop through the conversation history and add each turn to the prompt
+    for user_input, assistant_response in history:
+        input_prompt += training_prompt.format(user_input, assistant_response)
+    # Add the current message to the prompt
+    input_prompt += training_prompt.format(message, "")
+    # Generate the output using the model
+    output = llm(
+        input_prompt,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=40,
+        repeat_penalty=1.1,
+        max_tokens=max_tokens,
+        stop=[
+            EOS_TOKEN,
+            "<|endoftext|>"
+        ],
+        stream=True,
+    )
+    # Stream and yield the model’s output
+    for out in output:
+        stream = copy.deepcopy(out)
+        temp += stream["choices"][0]["text"]
+        yield temp
+# Define the Gradio interface
+demo = gr.ChatInterface(
+    generate_text,
+    title="Llama-3.1-8B-Instruct-Reasoner",
+    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
+    examples=[
+        ['How to setup a human base on Mars? Give short answer.'],
+        ['Explain theory of relativity to me like I’m 8 years old.'],
+        ['شكون لي صنعك؟'],
+        ['أشنو كايمييز المملكة المغربية'],
+        ['شنو كيتسمى المنتخب المغربي؟']
+    ],
+    cache_examples=False,
+    retry_btn=None,
+    undo_btn="Delete Previous",
+    clear_btn="Clear",
+    additional_inputs=[
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+)
+# Launch the Gradio demo interface
+if __name__ == "__main__":
+    demo.launch()