Reality123b committed
Commit a04b12b · verified · 1 Parent(s): fbf5fda

Update app.py

Files changed (1)
app.py +75 -32
app.py CHANGED
@@ -1,3 +1,4 @@
+# server.py
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
@@ -30,6 +31,14 @@ app.add_middleware(
 BASE_MODEL_PATH = "HuggingFaceTB/SmolLM2-135M-Instruct"
 ADAPTER_PATH = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
 
+def format_prompt(instruction):
+    """Format the prompt according to the model's expected format."""
+    return f"""### Instruction:
+{instruction}
+
+### Response:
+"""
+
 def load_model_and_tokenizer():
     """Load the model, tokenizer, and adapter weights."""
     try:
@@ -38,11 +47,28 @@ def load_model_and_tokenizer():
             BASE_MODEL_PATH,
             torch_dtype=torch.float16,
             trust_remote_code=True,
-            device_map="auto"
+            device_map="auto",
+            use_cache=True
         )
 
         logger.info("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
+        tokenizer = AutoTokenizer.from_pretrained(
+            BASE_MODEL_PATH,
+            padding_side="left",
+            truncation_side="left"
+        )
+
+        # Ensure the tokenizer has the necessary special tokens
+        special_tokens = {
+            "pad_token": "<|padding|>",
+            "eos_token": "</s>",
+            "bos_token": "<s>",
+            "unk_token": "<|unknown|>"
+        }
+        tokenizer.add_special_tokens(special_tokens)
+
+        # Resize the model embeddings to match the new tokenizer size
+        model.resize_token_embeddings(len(tokenizer))
 
         logger.info("Downloading adapter weights...")
         adapter_path_local = snapshot_download(repo_id=ADAPTER_PATH)
@@ -71,41 +97,58 @@ except Exception as e:
 def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
     """Generate a response from the model based on an instruction."""
     try:
-        logger.info(f"Generating response for instruction: {instruction[:100]}...")
+        # Format the prompt
+        formatted_prompt = format_prompt(instruction)
+        logger.info(f"Formatted prompt: {formatted_prompt}")
 
         # Encode input with truncation
-        inputs = tokenizer.encode(
-            instruction,
+        inputs = tokenizer(
+            formatted_prompt,
             return_tensors="pt",
             truncation=True,
-            max_length=tokenizer.model_max_length
+            max_length=tokenizer.model_max_length,
+            padding=True,
+            add_special_tokens=True
         ).to(model.device)
 
-        logger.info(f"Input shape: {inputs.shape}")
+        logger.info(f"Input shape: {inputs.input_ids.shape}")
 
-        # Create attention mask
-        attention_mask = torch.ones(inputs.shape, device=model.device)
-
         # Generate response
-        outputs = model.generate(
-            inputs,
-            attention_mask=attention_mask,
-            max_new_tokens=max_new_tokens,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
+        with torch.inference_mode():
+            outputs = model.generate(
+                input_ids=inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                max_new_tokens=max_new_tokens,
+                temperature=0.7,
+                top_p=0.9,
+                top_k=50,
+                do_sample=True,
+                num_return_sequences=1,
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.1,
+                length_penalty=1.0,
+                no_repeat_ngram_size=3
+            )
 
         logger.info(f"Output shape: {outputs.shape}")
 
-        # Decode and strip input prompt from response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        generated_text = response[len(instruction):].strip()
+        # Decode the response
+        response = tokenizer.decode(
+            outputs[0, inputs.input_ids.shape[1]:],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
+        )
 
-        logger.info(f"Generated text length: {len(generated_text)}")
-        return generated_text
+        response = response.strip()
+        logger.info(f"Generated text length: {len(response)}")
+        logger.info(f"Generated text preview: {response[:100]}...")
+
+        if not response:
+            logger.warning("Empty response generated")
+            raise ValueError("Model generated an empty response")
+
+        return response
     except Exception as e:
         logger.error(f"Error generating response: {e}", exc_info=True)
         raise ValueError(f"Error generating response: {e}")
@@ -127,11 +170,6 @@ async def generate_text(input: ModelInput, request: Request):
             max_new_tokens=input.max_new_tokens
         )
 
-        if not response:
-            logger.warning("Generated empty response")
-            return {"generated_text": "", "warning": "Empty response generated"}
-
-        logger.info(f"Generated response length: {len(response)}")
         return {"generated_text": response}
     except Exception as e:
         logger.error(f"Error in generate_text endpoint: {e}", exc_info=True)
@@ -148,5 +186,10 @@ async def health_check():
     return {
         "status": "healthy",
        "model_loaded": model is not None and tokenizer is not None,
-        "model_device": str(next(model.parameters()).device) if model else None
-    }
+        "model_device": str(next(model.parameters()).device) if model else None,
+        "tokenizer_vocab_size": len(tokenizer) if tokenizer else None
+    }
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
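
For reference, a minimal client sketch against the service this commit updates. It assumes the generate_text endpoint is mounted at /generate and that ModelInput exposes an instruction field alongside max_new_tokens; neither the route path nor that field name is visible in this diff, which only shows max_new_tokens and the {"generated_text": ...} response shape. The port matches the uvicorn.run call added at the end of app.py.

# Hypothetical client call; the "/generate" path and the "instruction" field
# are assumptions not confirmed by this diff.
import requests

payload = {
    "instruction": "Explain what a LoRA adapter is in one sentence.",
    "max_new_tokens": 256,
}

resp = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])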