Reality123b committed on
Commit 8faa1c2 · verified · 1 Parent(s): 37153e4

Update app.py

Files changed (1)
  1. app.py +16 -25
app.py CHANGED

@@ -12,25 +12,25 @@ class ModelInput(BaseModel):
 app = FastAPI()
 
 # Define model paths
-base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
-adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
+BASE_MODEL_PATH = "HuggingFaceTB/SmolLM2-135M-Instruct"
+ADAPTER_PATH = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
 
-# Load the model and tokenizer
 def load_model_and_tokenizer():
+    """Load the model, tokenizer, and adapter weights."""
     try:
         print("Loading base model...")
         model = AutoModelForCausalLM.from_pretrained(
-            base_model_path,
+            BASE_MODEL_PATH,
             torch_dtype=torch.float16,
             trust_remote_code=True,
             device_map="auto"
         )
 
         print("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
 
         print("Downloading adapter weights...")
-        adapter_path_local = snapshot_download(repo_id=adapter_path)
+        adapter_path_local = snapshot_download(repo_id=ADAPTER_PATH)
 
         print("Loading adapter weights...")
         adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
@@ -38,7 +38,6 @@ def load_model_and_tokenizer():
 
         print("Applying adapter weights...")
         model.load_state_dict(state_dict, strict=False)
-
         print("Model and adapter loaded successfully!")
 
         return model, tokenizer
@@ -46,12 +45,13 @@ def load_model_and_tokenizer():
         print(f"Error during model loading: {e}")
         raise
 
+# Load model and tokenizer at startup
 model, tokenizer = load_model_and_tokenizer()
 
 def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
     """Generate a response from the model based on an instruction."""
     try:
-        # Encode input with truncation and create an attention mask
+        # Encode input with truncation
         inputs = tokenizer.encode(
             instruction,
             return_tensors="pt",
@@ -59,55 +59,46 @@ def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
             max_length=tokenizer.model_max_length
         ).to(model.device)
 
-        # Create attention mask (1 for real tokens, 0 for padding tokens)
+        # Create attention mask
        attention_mask = torch.ones(inputs.shape, device=model.device)
 
-        print(f"Model input tokens: {inputs}")  # Debugging line
-        print(f"Attention mask: {attention_mask}")  # Debugging line
-
         # Generate response
         outputs = model.generate(
             inputs,
-            attention_mask=attention_mask,  # Pass the attention mask here
+            attention_mask=attention_mask,
             max_new_tokens=max_new_tokens,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
         )
 
-        print(f"Model output tokens: {outputs}")  # Debugging line
-
         # Decode and strip input prompt from response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         generated_text = response[len(instruction):].strip()
 
-        print(f"Instruction: {instruction}")  # Debugging line
-        print(f"Generated Response: {generated_text}")  # Debugging line
-
         return generated_text
-
     except Exception as e:
         print(f"Error generating response: {e}")
         raise ValueError(f"Error generating response: {e}")
 
-
 @app.post("/generate")
 async def generate_text(input: ModelInput):
+    """Generate text based on the input prompt."""
     try:
-        print(f"Received prompt: {input.prompt}")  # Log the prompt received
+        print(f"Received prompt: {input.prompt}")
         response = generate_response(
             model=model,
             tokenizer=tokenizer,
             instruction=input.prompt,
             max_new_tokens=input.max_new_tokens
         )
-        print(f"Generated response: {response}")  # Log the generated response
+        print(f"Generated response: {response}")
         return {"generated_text": response}
-
     except Exception as e:
-        print(f"Error: {str(e)}")  # Log the error
+        print(f"Error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the Model API!"}
+    """Root endpoint that returns a welcome message."""
+    return {"message": "Welcome to the Model API!"}
 
 