ultralight99 committed on
Commit d2b9475 · 1 Parent(s): cd49348

Added files

Files changed (2)
  1. app.py +151 -2
  2. requirements.txt +8 -0
app.py CHANGED
@@ -1,4 +1,153 @@
 import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import psutil
+import matplotlib.pyplot as plt
+import seaborn as sns
+import time
+import os
+from vllm import LLM, SamplingParams
+import numpy as np
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+# Streamlit app configuration
+st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
+st.title("DeepSeek Model Tuning for RAM and Context Length")
+
+# Sidebar for user inputs
+st.sidebar.header("Configuration")
+model_choice = st.sidebar.selectbox(
+    "Select DeepSeek Model",
+    ["deepseek-ai/DeepSeek-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V3"],
+    help="DeepSeek-V3 is 671B params, V2-Lite is more manageable at 15.7B."
+)
+context_length = st.sidebar.slider("Max Context Length", 1024, 32768, 4096, step=1024)
+quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
+run_button = st.sidebar.button("Run Model")
+
+# Function to get RAM usage
+def get_ram_usage():
+    return psutil.virtual_memory().percent
+
+# Function to install and load the model
+@st.cache_resource
+def load_model(model_name, quantize=False):
+    try:
+        st.write(f"Loading {model_name}...")
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        if model_name == "deepseek-ai/DeepSeek-V3":
+            # For V3, we'll assume vLLM for efficiency (requires setup)
+            llm = LLM(model=model_name, max_model_len=context_length, tensor_parallel_size=1)
+            return llm, tokenizer
+        else:
+            # For V2-Lite, use transformers with quantization if selected
+            if quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                    load_in_4bit=True
+                )
+            else:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto"
+                )
+            return model, tokenizer
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None, None
+
+# Function to tune and run inference
+def run_inference(model, tokenizer, context_len, model_name):
+    ram_usages = []
+    inference_times = []
+    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50)  # Scale prompt to context length
+
+    if model_name == "deepseek-ai/DeepSeek-V3":
+        # vLLM inference
+        sampling_params = SamplingParams(max_tokens=100, temperature=0.7)
+        start_time = time.time()
+        ram_before = get_ram_usage()
+        outputs = model.generate([prompt], sampling_params)
+        ram_after = get_ram_usage()
+        inference_time = time.time() - start_time
+        result = outputs[0].outputs[0].text
+    else:
+        # Transformers inference
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len).to("cuda")
+        start_time = time.time()
+        ram_before = get_ram_usage()
+        outputs = model.generate(**inputs, max_new_tokens=100)
+        ram_after = get_ram_usage()
+        inference_time = time.time() - start_time
+        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    ram_usages.extend([ram_before, ram_after])
+    inference_times.append(inference_time)
+    return result, ram_usages, inference_times
+
+# Visualization function
+def plot_results(ram_usages, inference_times, context_len):
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
+
+    # RAM Usage Plot
+    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
+    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
+    ax1.set_ylabel("RAM Usage (%)")
+
+    # Inference Time Plot
+    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
+    ax2.set_title("Inference Time (seconds)")
+    ax2.set_ylabel("Time (s)")
+
+    st.pyplot(fig)
+
+# Main execution
+if run_button:
+    with st.spinner("Installing and tuning the model..."):
+        # Install dependencies if needed (for Hugging Face Space, assume pre-installed)
+        if not os.path.exists("./vllm_installed"):
+            st.write("Installing vLLM for DeepSeek-V3 support...")
+            os.system("pip install vllm")
+            with open("./vllm_installed", "w") as f:
+                f.write("installed")
+
+        # Load model
+        model, tokenizer = load_model(model_choice, quantization)
+        if model is None or tokenizer is None:
+            st.stop()
+
+        # Tune for max RAM and context length
+        st.write(f"Tuning {model_choice} with context length {context_length}...")
+        if model_choice == "deepseek-ai/DeepSeek-V3":
+            st.warning("DeepSeek-V3 requires significant GPU resources. Ensure proper setup.")
+
+        # Run inference
+        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length, model_choice)
+
+        # Display results
+        st.subheader("Generated Output")
+        st.write(result)
+
+        st.subheader("Performance Metrics")
+        plot_results(ram_usages, inference_times, context_length)
+
+        # Additional info
+        st.write(f"Max Context Length Used: {context_length}")
+        st.write(f"Quantization Enabled: {quantization}")
+        st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
+        st.write(f"Inference Time: {inference_times[0]:.2f} seconds")
+
+# Instructions for user
+st.markdown("""
+### Instructions
+1. Select the DeepSeek model from the sidebar.
+2. Adjust the context length (higher values use more RAM).
+3. Enable quantization to reduce RAM usage (optional).
+4. Click 'Run Model' to install, tune, and visualize results.
+**Note:** DeepSeek-V3 (671B) requires high-end hardware. Use V2-Lite for moderate setups.
+""")
requirements.txt ADDED
@@ -0,0 +1,8 @@
+torch
+transformers
+vllm
+psutil
+matplotlib
+seaborn
+streamlit
+numpy
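
Since the requirements are unpinned, a quick local sanity check before launching the Space with `streamlit run app.py` can confirm GPU availability and free RAM, which the app's CUDA path and psutil-based plots rely on. This is a sketch using only packages listed above; the printed values are informational, not thresholds the app enforces.

# Sketch: environment check before `streamlit run app.py`.
import torch
import psutil

print("CUDA available:", torch.cuda.is_available())  # needed for the .to("cuda") inference path in app.py
print("Free RAM (GB):", round(psutil.virtual_memory().available / 1e9, 1))  # the app charts RAM usage via psutil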