Spaces:

ultralight99
/

training_deepseek

Sleeping

App Files Files Community

ultralight99 committed on Mar 3

Commit

d2b9475

1 Parent(s): cd49348

Added files

Browse files

Files changed (2) hide show

app.py +151 -2
requirements.txt +8 -0

app.py CHANGED Viewed

@@ -1,4 +1,153 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import psutil
+import matplotlib.pyplot as plt
+import seaborn as sns
+import time
+import os
+from vllm import LLM, SamplingParams
+import numpy as np
+# Streamlit app configuration
+st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
+st.title("DeepSeek Model Tuning for RAM and Context Length")
+# Sidebar for user inputs
+st.sidebar.header("Configuration")
+model_choice = st.sidebar.selectbox(
+    "Select DeepSeek Model",
+    ["deepseek-ai/DeepSeek-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V3"],
+    help="DeepSeek-V3 is 671B params, V2-Lite is more manageable at 15.7B."
+)
+context_length = st.sidebar.slider("Max Context Length", 1024, 32768, 4096, step=1024)
+quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
+run_button = st.sidebar.button("Run Model")
+# Function to get RAM usage
+def get_ram_usage():
+    return psutil.virtual_memory().percent
+# Function to install and load the model
+@st.cache_resource
+def load_model(model_name, quantize=False):
+    try:
+        st.write(f"Loading {model_name}...")
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        if model_name == "deepseek-ai/DeepSeek-V3":
+            # For V3, we'll assume vLLM for efficiency (requires setup)
+            llm = LLM(model=model_name, max_model_len=context_length, tensor_parallel_size=1)
+            return llm, tokenizer
+        else:
+            # For V2-Lite, use transformers with quantization if selected
+            if quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                    load_in_4bit=True
+                )
+            else:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto"
+                )
+            return model, tokenizer
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None, None
+# Function to tune and run inference
+def run_inference(model, tokenizer, context_len, model_name):
+    ram_usages = []
+    inference_times = []
+    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50)  # Scale prompt to context length
+    if model_name == "deepseek-ai/DeepSeek-V3":
+        # vLLM inference
+        sampling_params = SamplingParams(max_tokens=100, temperature=0.7)
+        start_time = time.time()
+        ram_before = get_ram_usage()
+        outputs = model.generate([prompt], sampling_params)
+        ram_after = get_ram_usage()
+        inference_time = time.time() - start_time
+        result = outputs[0].outputs[0].text
+    else:
+        # Transformers inference
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len).to("cuda")
+        start_time = time.time()
+        ram_before = get_ram_usage()
+        outputs = model.generate(**inputs, max_new_tokens=100)
+        ram_after = get_ram_usage()
+        inference_time = time.time() - start_time
+        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    ram_usages.extend([ram_before, ram_after])
+    inference_times.append(inference_time)
+    return result, ram_usages, inference_times
+# Visualization function
+def plot_results(ram_usages, inference_times, context_len):
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
+    # RAM Usage Plot
+    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
+    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
+    ax1.set_ylabel("RAM Usage (%)")
+    # Inference Time Plot
+    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
+    ax2.set_title("Inference Time (seconds)")
+    ax2.set_ylabel("Time (s)")
+    st.pyplot(fig)
+# Main execution
+if run_button:
+    with st.spinner("Installing and tuning the model..."):
+        # Install dependencies if needed (for Hugging Face Space, assume pre-installed)
+        if not os.path.exists("./vllm_installed"):
+            st.write("Installing vLLM for DeepSeek-V3 support...")
+            os.system("pip install vllm")
+            with open("./vllm_installed", "w") as f:
+                f.write("installed")
+        # Load model
+        model, tokenizer = load_model(model_choice, quantization)
+        if model is None or tokenizer is None:
+            st.stop()
+        # Tune for max RAM and context length
+        st.write(f"Tuning {model_choice} with context length {context_length}...")
+        if model_choice == "deepseek-ai/DeepSeek-V3":
+            st.warning("DeepSeek-V3 requires significant GPU resources. Ensure proper setup.")
+        # Run inference
+        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length, model_choice)
+        # Display results
+        st.subheader("Generated Output")
+        st.write(result)
+        st.subheader("Performance Metrics")
+        plot_results(ram_usages, inference_times, context_length)
+        # Additional info
+        st.write(f"Max Context Length Used: {context_length}")
+        st.write(f"Quantization Enabled: {quantization}")
+        st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
+        st.write(f"Inference Time: {inference_times[0]:.2f} seconds")
+# Instructions for user
+st.markdown("""
+### Instructions
+1. Select the DeepSeek model from the sidebar.
+2. Adjust the context length (higher values use more RAM).
+3. Enable quantization to reduce RAM usage (optional).
+4. Click 'Run Model' to install, tune, and visualize results.
+**Note:** DeepSeek-V3 (671B) requires high-end hardware. Use V2-Lite for moderate setups.
+""")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+torch
+transformers
+# needed by from_pretrained(..., device_map="auto") in app.py
+accelerate
+# needed by the 4-bit quantization option in app.py
+bitsandbytes
+vllm
+psutil
+matplotlib
+seaborn
+streamlit
+numpy