ciyidogan committed on
Commit
e68dc63
·
verified ·
1 Parent(s): 8687d10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -13
app.py CHANGED
@@ -34,7 +34,7 @@ async def lifespan(app: FastAPI):
34
  tokenizer.pad_token = tokenizer.eos_token
35
 
36
  quant_config = BitsAndBytesConfig(
37
- load_in_8bit=True, # ✅ 8-bit quantization (modern yöntem)
38
  llm_int8_threshold=6.0
39
  )
40
 
@@ -45,7 +45,7 @@ async def lifespan(app: FastAPI):
45
  )
46
 
47
  log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized, BitsAndBytesConfig).")
48
- yield # Uygulama burada çalışır
49
 
50
  except Exception as e:
51
  log(f"❌ Model yükleme hatası: {e}")
@@ -71,40 +71,53 @@ def generate(req: UserInputRequest):
71
  {"role": "system", "content": req.system_prompt},
72
  {"role": "user", "content": req.user_input}
73
  ]
74
- chat_input = tokenizer.apply_chat_template(
75
  messages,
76
  add_generation_prompt=True,
77
- return_tensors="pt"
78
- ).to(model.device)
79
  t1 = time.time()
80
  log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
81
 
82
- input_len = chat_input.shape[-1]
 
 
 
 
 
 
 
 
 
 
 
 
83
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
84
  max_new_tokens = min(512, max(1, total_ctx - input_len))
85
 
86
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
87
 
88
  # === Generate
89
- t2 = time.time()
90
  terminators = [
91
  tokenizer.eos_token_id,
92
  tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
93
  ]
94
  outputs = model.generate(
95
- input_ids=chat_input,
 
96
  max_new_tokens=max_new_tokens,
97
  eos_token_id=terminators
98
  )
99
- t3 = time.time()
100
- log(f"⏱️ generate süresi: {t3 - t2:.2f} saniye")
101
 
102
  # === Decode
103
- t4 = time.time()
104
  response = outputs[0][input_len:]
105
  answer = tokenizer.decode(response, skip_special_tokens=True)
106
- t5 = time.time()
107
- log(f"⏱️ decode süresi: {t5 - t4:.2f} saniye")
108
 
109
  overall_end = time.time()
110
  overall_elapsed = overall_end - overall_start
 
34
  tokenizer.pad_token = tokenizer.eos_token
35
 
36
  quant_config = BitsAndBytesConfig(
37
+ load_in_8bit=True, # ✅ 8-bit quantization (modern BitsAndBytesConfig)
38
  llm_int8_threshold=6.0
39
  )
40
 
 
45
  )
46
 
47
  log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized, BitsAndBytesConfig).")
48
+ yield
49
 
50
  except Exception as e:
51
  log(f"❌ Model yükleme hatası: {e}")
 
71
  {"role": "system", "content": req.system_prompt},
72
  {"role": "user", "content": req.user_input}
73
  ]
74
+ chat_template_str = tokenizer.apply_chat_template(
75
  messages,
76
  add_generation_prompt=True,
77
+ return_tensors=None
78
+ )
79
  t1 = time.time()
80
  log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
81
 
82
+ # === Tokenizer ile input_ids + attention_mask hazırla
83
+ t2 = time.time()
84
+ tokenized_inputs = tokenizer(
85
+ chat_template_str,
86
+ return_tensors="pt",
87
+ padding=True
88
+ ).to(model.device)
89
+ input_ids = tokenized_inputs['input_ids']
90
+ attention_mask = tokenized_inputs['attention_mask']
91
+ t3 = time.time()
92
+ log(f"⏱️ tokenize süresi: {t3 - t2:.2f} saniye")
93
+
94
+ input_len = input_ids.shape[-1]
95
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
96
  max_new_tokens = min(512, max(1, total_ctx - input_len))
97
 
98
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
99
 
100
  # === Generate
101
+ t4 = time.time()
102
  terminators = [
103
  tokenizer.eos_token_id,
104
  tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
105
  ]
106
  outputs = model.generate(
107
+ input_ids=input_ids,
108
+ attention_mask=attention_mask,
109
  max_new_tokens=max_new_tokens,
110
  eos_token_id=terminators
111
  )
112
+ t5 = time.time()
113
+ log(f"⏱️ generate süresi: {t5 - t4:.2f} saniye")
114
 
115
  # === Decode
116
+ t6 = time.time()
117
  response = outputs[0][input_len:]
118
  answer = tokenizer.decode(response, skip_special_tokens=True)
119
+ t7 = time.time()
120
+ log(f"⏱️ decode süresi: {t7 - t6:.2f} saniye")
121
 
122
  overall_end = time.time()
123
  overall_elapsed = overall_end - overall_start