Yadav122 committed
Commit 8115389 · verified · 1 Parent(s): 5b27bce

Upload model_config.py with huggingface_hub
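The commit message points to huggingface_hub as the upload tool. A minimal sketch of how such an upload can be done (the repo id below is a placeholder, not the actual repository; the token is taken from a prior huggingface-cli login or the HF_TOKEN environment variable):

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="model_config.py",
    path_in_repo="model_config.py",
    repo_id="<username>/<repo>",  # placeholder repo id, not the real one
    commit_message="Upload model_config.py with huggingface_hub",
)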

Files changed (1)
  1. model_config.py +208 -0
model_config.py ADDED
@@ -0,0 +1,208 @@
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import logging

logger = logging.getLogger(__name__)


class ModelConfig:
    """Configuration for different LLM models optimized for Hugging Face Spaces"""

    MODELS = {
        "dialogpt-medium": {
            "name": "microsoft/DialoGPT-medium",
            "description": "Conversational AI model, good for chat",
            "max_length": 512,
            "memory_usage": "medium",
            "recommended_for": "chat, conversation"
        },
        "dialogpt-small": {
            "name": "microsoft/DialoGPT-small",
            "description": "Smaller conversational model, faster inference",
            "max_length": 256,
            "memory_usage": "low",
            "recommended_for": "quick responses, limited resources"
        },
        "gpt2": {
            "name": "gpt2",
            "description": "General purpose text generation",
            "max_length": 1024,
            "memory_usage": "medium",
            "recommended_for": "text generation, creative writing"
        },
        "distilgpt2": {
            "name": "distilgpt2",
            "description": "Distilled GPT-2, faster and smaller",
            "max_length": 512,
            "memory_usage": "low",
            "recommended_for": "fast inference, resource constrained"
        },
        "flan-t5-small": {
            "name": "google/flan-t5-small",
            "description": "Instruction-tuned T5 model",
            "max_length": 512,
            "memory_usage": "low",
            "recommended_for": "instruction following, Q&A"
        }
    }

    @classmethod
    def get_model_info(cls, model_key: str = None):
        """Get information about available models"""
        if model_key:
            return cls.MODELS.get(model_key)
        return cls.MODELS

    @classmethod
    def get_recommended_model(cls, use_case: str = "general"):
        """Get recommended model based on use case"""
        recommendations = {
            "chat": "dialogpt-medium",
            "fast": "distilgpt2",
            "general": "gpt2",
            "qa": "flan-t5-small",
            "low_memory": "dialogpt-small"
        }
        return recommendations.get(use_case, "dialogpt-medium")


class ModelManager:
    """Manages model loading and inference"""

    def __init__(self, model_name: str = None):
        self.model_name = model_name or os.getenv("MODEL_NAME", "microsoft/DialoGPT-medium")
        self.model = None
        self.tokenizer = None
        self.pipeline = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.loaded = False

    def load_model(self):
        """Load the specified model"""
        try:
            logger.info(f"Loading model: {self.model_name}")
            logger.info(f"Using device: {self.device}")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                padding_side="left"
            )

            # Add padding token if it doesn't exist
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load model with optimizations
            model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
            }

            if self.device == "cuda":
                model_kwargs["device_map"] = "auto"

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )

            # Move to device if not using device_map
            if self.device == "cpu":
                self.model = self.model.to(self.device)

            # Create pipeline. When device_map="auto" has already placed the model
            # on the GPU, the pipeline must not receive an explicit device, so one
            # is only passed in the CPU case.
            pipeline_kwargs = {"device": -1} if self.device == "cpu" else {}
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                return_full_text=False,
                **pipeline_kwargs
            )

            self.loaded = True
            logger.info("Model loaded successfully!")

        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise e

    def generate_response(self,
                          prompt: str,
                          max_length: int = 100,
                          temperature: float = 0.7,
                          top_p: float = 0.9,
                          do_sample: bool = True) -> str:
        """Generate response using the loaded model"""

        if not self.loaded:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        try:
            # Generate response; max_length is passed as the number of new tokens
            outputs = self.pipeline(
                prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                truncation=True
            )

            # Extract generated text
            if outputs and len(outputs) > 0:
                generated_text = outputs[0]['generated_text']
                return generated_text.strip()
            else:
                return "Sorry, I couldn't generate a response."

        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            raise e

    def get_model_info(self):
        """Get information about the loaded model"""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "loaded": self.loaded,
            "tokenizer_vocab_size": len(self.tokenizer) if self.tokenizer else None,
            "model_parameters": sum(p.numel() for p in self.model.parameters()) if self.model else None
        }

    def unload_model(self):
        """Unload the model to free memory"""
        if self.model:
            del self.model
            self.model = None
        if self.tokenizer:
            del self.tokenizer
            self.tokenizer = None
        if self.pipeline:
            del self.pipeline
            self.pipeline = None

        # Clear CUDA cache if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.loaded = False
        logger.info("Model unloaded successfully")


# Global model manager instance
model_manager = None


def get_model_manager(model_name: str = None) -> ModelManager:
    """Get or create the global model manager instance"""
    global model_manager
    if model_manager is None:
        model_manager = ModelManager(model_name)
    return model_manager


def initialize_model(model_name: str = None):
    """Initialize and load the model"""
    manager = get_model_manager(model_name)
    if not manager.loaded:
        manager.load_model()
    return manager
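
For reference, a minimal usage sketch of the helpers defined above (the module name model_config is assumed to be importable from the Space's working directory; the first call downloads the model weights from the Hub):

from model_config import ModelConfig, initialize_model

# Resolve a model key for a use case, then look up its Hub name.
model_key = ModelConfig.get_recommended_model("low_memory")  # "dialogpt-small"
model_info = ModelConfig.get_model_info(model_key)

# Load the model once and generate a short reply.
manager = initialize_model(model_info["name"])
reply = manager.generate_response("Hello, how are you?", max_length=50)
print(reply)

# Inspect and free the model when done.
print(manager.get_model_info())
manager.unload_model()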