ciyidogan committed on
Commit
8687d10
·
verified ·
1 Parent(s): 6f0b2c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -9
app.py CHANGED
@@ -6,7 +6,7 @@ import traceback
6
  from datetime import datetime
7
  from fastapi import FastAPI, HTTPException
8
  from pydantic import BaseModel
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
  from contextlib import asynccontextmanager
11
 
12
  # === Ortam değişkenleri
@@ -33,13 +33,18 @@ async def lifespan(app: FastAPI):
33
  tokenizer = AutoTokenizer.from_pretrained(model_name)
34
  tokenizer.pad_token = tokenizer.eos_token
35
 
 
 
 
 
 
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_name,
38
  device_map="auto",
39
- load_in_8bit=True # ✅ 8-bit quantization
40
  )
41
 
42
- log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized).")
43
  yield # Uygulama burada çalışır
44
 
45
  except Exception as e:
@@ -57,19 +62,22 @@ class UserInputRequest(BaseModel):
57
  @app.post("/generate")
58
  def generate(req: UserInputRequest):
59
  try:
60
- start_time = time.time()
61
  log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
62
 
 
 
63
  messages = [
64
  {"role": "system", "content": req.system_prompt},
65
  {"role": "user", "content": req.user_input}
66
  ]
67
-
68
  chat_input = tokenizer.apply_chat_template(
69
  messages,
70
  add_generation_prompt=True,
71
  return_tensors="pt"
72
  ).to(model.device)
 
 
73
 
74
  input_len = chat_input.shape[-1]
75
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
@@ -77,23 +85,30 @@ def generate(req: UserInputRequest):
77
 
78
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
79
 
 
 
80
  terminators = [
81
  tokenizer.eos_token_id,
82
  tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
83
  ]
84
-
85
  outputs = model.generate(
86
  input_ids=chat_input,
87
  max_new_tokens=max_new_tokens,
88
  eos_token_id=terminators
89
  )
 
 
90
 
 
 
91
  response = outputs[0][input_len:]
92
  answer = tokenizer.decode(response, skip_special_tokens=True)
 
 
93
 
94
- end_time = time.time()
95
- elapsed = end_time - start_time
96
- log(f"✅ Yanıt süresi: {elapsed:.2f} saniye")
97
 
98
  return {"response": answer}
99
 
 
6
  from datetime import datetime
7
  from fastapi import FastAPI, HTTPException
8
  from pydantic import BaseModel
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  from contextlib import asynccontextmanager
11
 
12
  # === Ortam değişkenleri
 
33
  tokenizer = AutoTokenizer.from_pretrained(model_name)
34
  tokenizer.pad_token = tokenizer.eos_token
35
 
36
+ quant_config = BitsAndBytesConfig(
37
+ load_in_8bit=True, # ✅ 8-bit quantization (modern yöntem)
38
+ llm_int8_threshold=6.0
39
+ )
40
+
41
  model = AutoModelForCausalLM.from_pretrained(
42
  model_name,
43
  device_map="auto",
44
+ quantization_config=quant_config
45
  )
46
 
47
+ log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized, BitsAndBytesConfig).")
48
  yield # Uygulama burada çalışır
49
 
50
  except Exception as e:
 
62
  @app.post("/generate")
63
  def generate(req: UserInputRequest):
64
  try:
65
+ overall_start = time.time()
66
  log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
67
 
68
+ # === Apply chat template
69
+ t0 = time.time()
70
  messages = [
71
  {"role": "system", "content": req.system_prompt},
72
  {"role": "user", "content": req.user_input}
73
  ]
 
74
  chat_input = tokenizer.apply_chat_template(
75
  messages,
76
  add_generation_prompt=True,
77
  return_tensors="pt"
78
  ).to(model.device)
79
+ t1 = time.time()
80
+ log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
81
 
82
  input_len = chat_input.shape[-1]
83
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
 
85
 
86
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
87
 
88
+ # === Generate
89
+ t2 = time.time()
90
  terminators = [
91
  tokenizer.eos_token_id,
92
  tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
93
  ]
 
94
  outputs = model.generate(
95
  input_ids=chat_input,
96
  max_new_tokens=max_new_tokens,
97
  eos_token_id=terminators
98
  )
99
+ t3 = time.time()
100
+ log(f"⏱️ generate süresi: {t3 - t2:.2f} saniye")
101
 
102
+ # === Decode
103
+ t4 = time.time()
104
  response = outputs[0][input_len:]
105
  answer = tokenizer.decode(response, skip_special_tokens=True)
106
+ t5 = time.time()
107
+ log(f"⏱️ decode süresi: {t5 - t4:.2f} saniye")
108
 
109
+ overall_end = time.time()
110
+ overall_elapsed = overall_end - overall_start
111
+ log(f"✅ Toplam yanıt süresi: {overall_elapsed:.2f} saniye")
112
 
113
  return {"response": answer}
114