null and void committed on
Commit 9012900 · verified · 1 Parent(s): 212c5a9

Update app.py

Files changed (1)
  1. app.py +21 -15
app.py CHANGED
@@ -2,7 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
+import bitsandbytes as bnb
 
+print(f"bitsandbytes version: {bnb.__version__}")
 print(f"CUDA is available: {torch.cuda.is_available()}")
 print(f"CUDA device count: {torch.cuda.device_count()}")
 if torch.cuda.is_available():
@@ -23,27 +25,31 @@ class ConversationManager:
         if not model_name:
             print("Error: Empty model name provided")
             return None
-
+
         if model_name in self.models:
             return self.models[model_name]
 
         try:
             print(f"Attempting to load model: {model_name}")
             tokenizer = AutoTokenizer.from_pretrained(model_name)
-            try:
-                # Try to load the model with GPU support
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
-            except RuntimeError as e:
-                print(f"GPU loading failed, falling back to CPU: {e}")
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
-            self.models[model_name] = (model, tokenizer)
-            print(f"Successfully loaded model: {model_name}")
-            return self.models[model_name]
-        except Exception as e:
-            print(f"Failed to load model {model_name}: {e}")
-            print(f"Error type: {type(e).__name__}")
-            print(f"Error details: {str(e)}")
-            return None
+            try:
+                # Try to load the model with 8-bit quantization
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+            except RuntimeError as e:
+                print(f"8-bit quantization not available, falling back to full precision: {e}")
+                if torch.cuda.is_available():
+                    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+                else:
+                    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+            self.models[model_name] = (model, tokenizer)
+            print(f"Successfully loaded model: {model_name}")
+            return self.models[model_name]
+        except Exception as e:
+            print(f"Failed to load model {model_name}: {e}")
+            print(f"Error type: {type(e).__name__}")
+            print(f"Error details: {str(e)}")
+            return None
 
     def generate_response(self, model_name, prompt):
         model, tokenizer = self.load_model(model_name)
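
For quick local testing, the quantize-then-fall-back pattern from this commit can be reproduced outside the Space. A minimal sketch, assuming a recent transformers release (where 8-bit loading is requested via BitsAndBytesConfig rather than the bare load_in_8bit flag used in the commit); the model id is a placeholder, not the one this Space serves:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "gpt2"  # placeholder; any causal-LM repo id works

def load_with_fallback(model_name: str):
    """Try 8-bit quantized loading first; fall back to full precision."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    try:
        # 8-bit quantization requires bitsandbytes and a CUDA device;
        # device_map="auto" additionally needs the accelerate package.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        )
    except (ImportError, RuntimeError, ValueError) as e:
        print(f"8-bit quantization not available, falling back to full precision: {e}")
        if torch.cuda.is_available():
            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_with_fallback(MODEL_NAME)

Catching ImportError and ValueError alongside RuntimeError is deliberate: depending on the transformers/bitsandbytes versions installed, a missing or broken 8-bit path may surface as any of the three, and the commit's intent is to degrade to full precision rather than fail outright.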