Uhhy committed on
Commit
f8a1e1e
verified
1 Parent(s): 530ea6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -79
app.py CHANGED
@@ -3,35 +3,38 @@ import uvicorn
3
  import requests
4
  import os
5
  import io
6
- import time
7
  import asyncio
8
  from typing import List, Dict, Any
9
  from tqdm import tqdm
10
  from llama_cpp import Llama
11
  import aiofiles
 
12
 
13
  app = FastAPI()
14
 
15
  # Configuración de los modelos
16
  model_configs = [
17
- {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
18
- {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
19
- {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
20
- {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
21
- {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
22
- {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
23
- {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
24
- {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
25
- {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
26
- {"repo_id": "Ffftdtd5dtft/starcoder2-15b-Q2_K-GGUF", "filename": "starcoder2-15b-q2_k.gguf", "name": "Starcoder2 15B"},
27
- {"repo_id": "Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF", "filename": "gemma-2-2b-it-q2_k.gguf", "name": "Gemma 2-2B IT"},
28
- {"repo_id": "Ffftdtd5dtft/sarvam-2b-v0.5-Q2_K-GGUF", "filename": "sarvam-2b-v0.5-q2_k.gguf", "name": "Sarvam 2B v0.5"},
29
- {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM 13B Uncensored"},
30
- {"repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-72b-instruct-q2_k.gguf", "name": "Qwen2 Math 72B Instruct"},
31
- {"repo_id": "Ffftdtd5dtft/WizardLM-7B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-7b-uncensored-q2_k.gguf", "name": "WizardLM 7B Uncensored"},
32
- {"repo_id": "Ffftdtd5dtft/Qwen2-Math-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-7b-instruct-q2_k.gguf", "name": "Qwen2 Math 7B Instruct"}
33
  ]
34
 
 
 
 
35
  class ModelManager:
36
  def __init__(self):
37
  self.models = {}
@@ -40,64 +43,66 @@ class ModelManager:
40
  self.index_lock = asyncio.Lock()
41
  self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
42
 
43
- async def download_model_to_memory(self, model_config):
44
- url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
45
- print(f"Descargando modelo desde {url}")
46
- try:
47
- start_time = time.time()
48
- response = requests.get(url)
49
- response.raise_for_status()
50
- end_time = time.time()
51
- download_duration = end_time - start_time
52
- print(f"Descarga completa para {model_config['name']} en {download_duration:.2f} segundos")
53
- return io.BytesIO(response.content)
54
- except requests.RequestException as e:
55
- raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}")
56
-
57
- async def save_model_to_temp_file(self, model_config):
58
- model_file = await self.download_model_to_memory(model_config)
59
- temp_filename = f"/tmp/{model_config['filename']}"
60
- print(f"Guardando el modelo en {temp_filename}")
61
- async with aiofiles.open(temp_filename, 'wb') as f:
62
- await f.write(model_file.getvalue())
63
- print(f"Modelo guardado en {temp_filename}")
64
- return temp_filename
65
-
66
- async def load_model(self, model_config):
67
- async with self.load_lock:
68
  try:
69
- temp_filename = await self.save_model_to_temp_file(model_config)
70
  start_time = time.time()
71
- print(f"Cargando modelo desde {temp_filename}")
72
-
73
- # Cambiar la forma en que se carga el modelo según la biblioteca que utilices
74
- llama = Llama.from_file(temp_filename)
75
-
 
 
 
 
76
  end_time = time.time()
77
- load_duration = end_time - start_time
78
- if load_duration > 0.5:
79
- print(f"Modelo {model_config['name']} tard贸 {load_duration:.2f} segundos en cargar, dividiendo autom谩ticamente")
80
- await self.handle_large_model(temp_filename, model_config)
81
- else:
82
- print(f"Modelo {model_config['name']} cargado correctamente en {load_duration:.2f} segundos")
83
-
84
- tokenizer = llama.tokenizer
85
- model_data = {
86
- 'model': llama,
87
- 'tokenizer': tokenizer,
88
- 'pad_token': tokenizer.pad_token,
89
- 'pad_token_id': tokenizer.pad_token_id,
90
- 'eos_token': tokenizer.eos_token,
91
- 'eos_token_id': tokenizer.eos_token_id,
92
- 'bos_token': tokenizer.bos_token,
93
- 'bos_token_id': tokenizer.bos_token_id,
94
- 'unk_token': tokenizer.unk_token,
95
- 'unk_token_id': tokenizer.unk_token_id
96
- }
97
-
98
- self.models[model_config['name']] = model_data
99
- except Exception as e:
100
- print(f"Error al cargar el modelo: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  async def handle_large_model(self, model_filename, model_config):
103
  total_size = os.path.getsize(model_filename)
@@ -116,7 +121,7 @@ class ModelManager:
116
  async with self.index_lock:
117
  part_name = f"part_{part_index}"
118
  print(f"Indexando parte {part_index}")
119
- temp_filename = f"/tmp/{part_name}.gguf"
120
  async with aiofiles.open(temp_filename, 'wb') as f:
121
  await f.write(model_part.getvalue())
122
  print(f"Parte {part_index} indexada y guardada")
@@ -156,16 +161,25 @@ async def generate(request: Request):
156
  raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")
157
 
158
  try:
159
- model_manager = ModelManager()
160
- tasks = [model_manager.load_model(config) for config in model_configs]
161
- await asyncio.gather(*tasks)
162
  responses = await model_manager.generate_response(user_input)
163
  return {"responses": responses}
164
  except Exception as e:
165
  raise HTTPException(status_code=500, detail=str(e))
166
 
167
- def start_uvicorn():
168
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
169
 
170
  if __name__ == "__main__":
171
- asyncio.run(start_uvicorn())
 
 
 
 
 
3
  import requests
4
  import os
5
  import io
 
6
  import asyncio
7
  from typing import List, Dict, Any
8
  from tqdm import tqdm
9
  from llama_cpp import Llama
10
  import aiofiles
11
+ import time
12
 
13
  app = FastAPI()
14
 
15
# Model catalogue, one entry per model: (repository name, display name).
# All repositories sit under the same Hugging Face account, and each one
# ships a single GGUF file named after the repository: lower-cased, with
# the trailing "-GGUF" replaced by a ".gguf" extension.
_MODEL_SPECS = (
    ("gpt2-xl-Q2_K-GGUF", "GPT-2 XL"),
    ("Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "Meta Llama 3.1-8B Instruct"),
    ("gemma-2-9b-it-Q2_K-GGUF", "Gemma 2-9B IT"),
    ("gemma-2-27b-Q2_K-GGUF", "Gemma 2-27B"),
    ("Phi-3-mini-128k-instruct-Q2_K-GGUF", "Phi-3 Mini 128K Instruct"),
    ("Meta-Llama-3.1-8B-Q2_K-GGUF", "Meta Llama 3.1-8B"),
    ("Qwen2-7B-Instruct-Q2_K-GGUF", "Qwen2 7B Instruct"),
    ("starcoder2-3b-Q2_K-GGUF", "Starcoder2 3B"),
    ("Qwen2-1.5B-Instruct-Q2_K-GGUF", "Qwen2 1.5B Instruct"),
    ("starcoder2-15b-Q2_K-GGUF", "Starcoder2 15B"),
    ("gemma-2-2b-it-Q2_K-GGUF", "Gemma 2-2B IT"),
    ("sarvam-2b-v0.5-Q2_K-GGUF", "Sarvam 2B v0.5"),
    ("WizardLM-13B-Uncensored-Q2_K-GGUF", "WizardLM 13B Uncensored"),
    ("Qwen2-Math-72B-Instruct-Q2_K-GGUF", "Qwen2 Math 72B Instruct"),
    ("WizardLM-7B-Uncensored-Q2_K-GGUF", "WizardLM 7B Uncensored"),
    ("Qwen2-Math-7B-Instruct-Q2_K-GGUF", "Qwen2 Math 7B Instruct"),
)

# Expanded configuration consumed by the rest of the module; each entry
# carries the keys "repo_id", "filename" and "name", in that order.
model_configs = [
    {
        "repo_id": "TheBloke/" + repo,
        "filename": repo[: -len("-GGUF")].lower() + ".gguf",
        "name": label,
    }
    for repo, label in _MODEL_SPECS
]

# Directory where downloaded model files are stored.
models_dir = "modelos"
37
+
38
  class ModelManager:
39
  def __init__(self):
40
  self.models = {}
 
43
  self.index_lock = asyncio.Lock()
44
  self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
45
 
46
async def download_model(self, model_config):
    """Make sure the model file is on disk and return its path.

    The file is looked up under ``models_dir``; when it is missing it is
    streamed from the Hugging Face ``resolve/main`` URL built from the
    configuration.

    Args:
        model_config: Mapping with the keys ``repo_id``, ``filename`` and
            ``name``.

    Returns:
        The path of the model file inside ``models_dir``.

    Raises:
        HTTPException: With status 500 when the HTTP request fails.
        OSError: When the target file cannot be written.
    """
    # The directory is otherwise only created by the ``__main__`` guard,
    # which does not run when the app is served by an external runner.
    os.makedirs(models_dir, exist_ok=True)

    model_path = os.path.join(models_dir, model_config['filename'])
    if os.path.exists(model_path):
        print(f"Modelo {model_config['filename']} ya descargado.")
        return model_path

    url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
    # Stream into a side file and rename only on success, so that an
    # interrupted transfer is never mistaken for a complete model by the
    # existence check above.
    partial_path = f"{model_path}.part"
    print(f"Descargando modelo desde {url}")

    def _fetch():
        """Blocking transfer of ``url`` into ``model_path``."""
        try:
            # (connect, read) timeouts so a stalled server cannot hang the
            # startup forever; values are a judgement call.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                # A missing header yields 0; give tqdm ``None`` so it shows
                # an open-ended counter instead of a 0-byte total.
                total_size = int(response.headers.get('content-length', 0)) or None
                with open(partial_path, 'wb') as f, tqdm(
                    total=total_size,
                    unit='B',
                    unit_scale=True,
                    desc=f"Descargando {model_config['filename']}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
            os.replace(partial_path, model_path)
        except BaseException:
            # Drop whatever was written before re-raising.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    start_time = time.time()
    try:
        # requests and file writes block; run them off the event loop.
        await asyncio.get_running_loop().run_in_executor(None, _fetch)
    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}") from e
    download_duration = time.time() - start_time
    print(f"Descarga completa para {model_config['name']} en {download_duration:.2f} segundos")
    return model_path
70
+
71
+ async def load_model(self, model_config):
72
+ async with self.load_lock:
73
+ if model_config['name'] not in self.models:
74
+ try:
75
+ model_path = await self.download_model(model_config)
76
+ start_time = time.time()
77
+ print(f"Cargando modelo desde {model_path}")
78
+
79
+ llama = Llama(model_path=model_path)
80
+
81
+ end_time = time.time()
82
+ load_duration = end_time - start_time
83
+ if load_duration > 0.5:
84
+ print(f"Modelo {model_config['name']} tard贸 {load_duration:.2f} segundos en cargar, dividiendo autom谩ticamente")
85
+ await self.handle_large_model(model_path, model_config)
86
+ else:
87
+ print(f"Modelo {model_config['name']} cargado correctamente en {load_duration:.2f} segundos")
88
+
89
+ tokenizer = llama.tokenizer
90
+ model_data = {
91
+ 'model': llama,
92
+ 'tokenizer': tokenizer,
93
+ 'pad_token': tokenizer.pad_token,
94
+ 'pad_token_id': tokenizer.pad_token_id,
95
+ 'eos_token': tokenizer.eos_token,
96
+ 'eos_token_id': tokenizer.eos_token_id,
97
+ 'bos_token': tokenizer.bos_token,
98
+ 'bos_token_id': tokenizer.bos_token_id,
99
+ 'unk_token': tokenizer.unk_token,
100
+ 'unk_token_id': tokenizer.unk_token_id
101
+ }
102
+
103
+ self.models[model_config['name']] = model_data
104
+ except Exception as e:
105
+ print(f"Error al cargar el modelo: {e}")
106
 
107
  async def handle_large_model(self, model_filename, model_config):
108
  total_size = os.path.getsize(model_filename)
 
121
  async with self.index_lock:
122
  part_name = f"part_{part_index}"
123
  print(f"Indexando parte {part_index}")
124
+ temp_filename = os.path.join(models_dir, f"{part_name}.gguf")
125
  async with aiofiles.open(temp_filename, 'wb') as f:
126
  await f.write(model_part.getvalue())
127
  print(f"Parte {part_index} indexada y guardada")
 
161
  raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")
162
 
163
  try:
 
 
 
164
  responses = await model_manager.generate_response(user_input)
165
  return {"responses": responses}
166
  except Exception as e:
167
  raise HTTPException(status_code=500, detail=str(e))
168
 
169
async def load_models_on_startup():
    """Load every entry of ``model_configs`` through the shared manager.

    One ``load_model`` coroutine is created per configuration and all of
    them are awaited together.
    """
    pending = (model_manager.load_model(cfg) for cfg in model_configs)
    await asyncio.gather(*pending)
172
+
173
@app.on_event("startup")
async def startup_event():
    """Create the global model manager and load all models at startup.

    The model directory is created here as well, because the
    ``__main__`` guard that creates it does not run when the application
    is started by an external ASGI runner.
    """
    global model_manager
    os.makedirs(models_dir, exist_ok=True)
    model_manager = ModelManager()
    await load_models_on_startup()
    print("Modelos cargados correctamente. API lista.")
179
 
180
if __name__ == "__main__":
    # Create the model directory if it is missing; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(models_dir, exist_ok=True)

    uvicorn.run(app, host="0.0.0.0", port=7860)