sdafd committed (verified)
Commit 6ea0840 · Parent(s): 704f6d2

Update app.py

Files changed (1): app.py (+26 -34)
app.py CHANGED
@@ -1,33 +1,29 @@
  import torch
- from transformers import pipeline, TextIteratorStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import gradio as gr
  import threading
  import time
 
- # Global variable to store the model pipeline
- model_pipeline = None
+ # Global variables to store the model and tokenizer
+ model = None
+ tokenizer = None
  model_loading_lock = threading.Lock()
  model_loaded = False  # Status flag to indicate if the model is loaded
 
  def load_model(model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"):
-     global model_pipeline, model_loaded
+     global model, tokenizer, model_loaded
      with model_loading_lock:
          if not model_loaded:
              print("Loading model...")
-             pipe = pipeline(
-                 "text-generation",
-                 model=model_name,
+             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_name,
                  device_map="sequential",
                  torch_dtype=torch.float16,
                  trust_remote_code=True,
-                 truncation=True,
-                 max_new_tokens=2048,
-                 model_kwargs={
-                     "low_cpu_mem_usage": True,
-                     "offload_folder": "offload"
-                 }
+                 low_cpu_mem_usage=True,
+                 offload_folder="offload"
              )
-             model_pipeline = pipe
              model_loaded = True
              print("Model loaded successfully.")
          else:
@@ -42,9 +38,9 @@ def check_model_status():
      return model_loaded
 
  def chat(message, history, temperature, max_new_tokens):
-     global model_pipeline
-     stop_tokens = ["<|endoftext|>", "<|im_end|>","|im_end|"]
-
+     global model, tokenizer
+     stop_tokens = ["\n", "|im_end|"]
+
      # Ensure the model is loaded before proceeding
      if not check_model_status():
          yield "Model is not ready. Please try again later."
@@ -52,36 +48,35 @@ def chat(message, history, temperature, max_new_tokens):
 
      prompt = f"Human: {message}\n\nAssistant:"
 
+     # Tokenize the input
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
      # Stream the response
      start_time = time.time()
 
      # Create a TextStreamer for token streaming
-     tokenizer = model_pipeline.tokenizer
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-
-
-     pipeline_kwargs = dict(
-         prompt=prompt,
+     generate_kwargs = dict(
+         input_ids=inputs.input_ids,
          max_new_tokens=max_new_tokens,
          temperature=temperature,
          do_sample=True,
-         truncation=True,
          pad_token_id=tokenizer.eos_token_id,
          streamer=streamer  # Use the TextStreamer here
      )
 
-     # Create and start the thread with the model_pipeline function
-     t = threading.Thread(target=lambda: model_pipeline(**pipeline_kwargs))
+     # Create and start the thread with the model.generate function
+     t = threading.Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()
-
+
+     outputs = []
      for new_token in streamer:
-         print(new_token)
          outputs.append(new_token)
-         if new_token in stop_tokens:
-
+         if any(stop_token in new_token for stop_token in stop_tokens):
              break
-     yield "".join(outputs), "not implemented"
+         yield "".join(outputs)
+
  def reload_model_button():
      """Reload the model manually via a button."""
      global model_loaded
@@ -119,11 +114,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
      def respond(message, chat_history, temperature, max_new_tokens):
          bot_message = ""
-         status = ""
-         for partial_response, partial_status in chat(message, chat_history, temperature, max_new_tokens):
+         for partial_response in chat(message, chat_history, temperature, max_new_tokens):
              bot_message = partial_response
-             status = partial_status
-             token_status.update(value=status)
              yield "", chat_history + [(message, bot_message)]
 
      send_button.click(respond, inputs=[textbox, chatbot, temperature_slider, max_tokens_slider], outputs=[textbox, chatbot])
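For reference, the new generation path follows the usual transformers streaming pattern: model.generate() runs in a background thread while a TextIteratorStreamer yields decoded text chunks to the consuming loop. A minimal, self-contained sketch of that pattern is below; the tiny model name "sshleifer/tiny-gpt2" and the prompt are placeholders chosen only to keep the example cheap to run, and are not part of this Space's code.

import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model, used here only to keep the sketch small and fast.
model_name = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Human: Hello!\n\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The streamer decodes tokens as generate() produces them and exposes
# them through an iterator on the consuming thread.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(
    input_ids=inputs.input_ids,
    max_new_tokens=32,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    streamer=streamer,
)

# generate() blocks until generation finishes, so it runs in a worker
# thread while the main thread consumes partial text from the streamer.
thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

pieces = []
for chunk in streamer:
    pieces.append(chunk)
    print("".join(pieces))  # the response grows chunk by chunk

thread.join()

Passing kwargs directly to threading.Thread, as the commit does, keeps the generate() arguments explicit and avoids wrapping the call in a lambda.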