kgupta21 committed
Commit eb5c340 · 1 Parent(s): a956d76

Local inference page: fix GPU use under ZeroGPU; add accelerate for device mapping

Files changed (2):
  1. app.py +21 -4
  2. requirements.txt +2 -1
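
The commit message refers to ZeroGPU, the Spaces hardware mode in which CUDA is only reachable inside a function decorated with @spaces.GPU (the reason the spaces package is pinned in requirements.txt). A minimal sketch of that pattern, assuming a hypothetical generate_reply handler and the llm/tokenizer globals created in app.py below:

import spaces

@spaces.GPU
def generate_reply(prompt: str) -> str:
    # Runs with GPU access on a ZeroGPU Space; llm and tokenizer are the
    # globals initialized in app.py, generate_reply is a hypothetical name.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    output_ids = llm.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)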
app.py CHANGED
@@ -48,11 +48,28 @@ except Exception as e:
 # Initialize Llama
 try:
     logger.info("Initializing Llama model...")
+    llm_model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
+
+    # Initialize tokenizer first
+    tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
+    tokenizer.use_default_system_prompt = False
+
+    # Initialize model with proper device mapping
     if torch.cuda.is_available():
-        llm_model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
-        llm = AutoModelForCausalLM.from_pretrained(llm_model_id, torch_dtype=torch.float16, device_map="auto")
-        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
-        tokenizer.use_default_system_prompt = False
+        logger.info("Loading Llama model on GPU...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            llm_model_id,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            load_in_8bit=True  # Use 8-bit quantization to reduce memory usage
+        )
+    else:
+        logger.info("Loading Llama model on CPU...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            llm_model_id,
+            device_map={"": "cpu"},
+            low_cpu_mem_usage=True
+        )
 except Exception as e:
     logger.error(f"Error initializing Llama model: {str(e)}")
     llm = None
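
One caveat on the new GPU branch: load_in_8bit=True relies on the bitsandbytes package, which this commit does not add to requirements.txt, and recent transformers releases prefer passing a BitsAndBytesConfig instead of the bare flag. A sketch of the equivalent load under those assumptions:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

llm_model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)

# Same 8-bit GPU load as in the diff, expressed via BitsAndBytesConfig;
# assumes bitsandbytes is installed alongside accelerate.
llm = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)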
requirements.txt CHANGED
@@ -6,4 +6,5 @@ Pillow>=10.0.0
 huggingface-hub>=0.20.0
 torch>=2.0.0
 transformers>=4.36.0
-spaces>=0.19.3
+spaces>=0.19.3
+accelerate>=0.27.0
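
The accelerate pin is what backs device_map="auto": transformers hands weight placement to accelerate and records the result on the model. A quick sanity check, assuming the llm object from app.py:

# Populated by transformers/accelerate when device_map is used; the exact
# mapping depends on the hardware, e.g. {"": 0} on a single GPU.
print(llm.hf_device_map)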