api-smollm135m

Sleeping

App Files Files Community

Reality123b committed on Jan 21

Commit

8faa1c2

verified ·

1 Parent(s): 37153e4

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -25

app.py CHANGED Viewed

@@ -12,25 +12,25 @@ class ModelInput(BaseModel):
 app = FastAPI()
 # Define model paths
-base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
-adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
-# Load the model and tokenizer
 def load_model_and_tokenizer():
     try:
         print("Loading base model...")
         model = AutoModelForCausalLM.from_pretrained(
-            base_model_path,
             torch_dtype=torch.float16,
             trust_remote_code=True,
             device_map="auto"
         )
         print("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(base_model_path)
         print("Downloading adapter weights...")
-        adapter_path_local = snapshot_download(repo_id=adapter_path)
         print("Loading adapter weights...")
         adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
@@ -38,7 +38,6 @@ def load_model_and_tokenizer():
         print("Applying adapter weights...")
         model.load_state_dict(state_dict, strict=False)
         print("Model and adapter loaded successfully!")
         return model, tokenizer
@@ -46,12 +45,13 @@ def load_model_and_tokenizer():
         print(f"Error during model loading: {e}")
         raise
 model, tokenizer = load_model_and_tokenizer()
 def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
     """Generate a response from the model based on an instruction."""
     try:
-        # Encode input with truncation and create an attention mask
         inputs = tokenizer.encode(
             instruction,
             return_tensors="pt",
@@ -59,55 +59,46 @@ def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
             max_length=tokenizer.model_max_length
         ).to(model.device)
-        # Create attention mask (1 for real tokens, 0 for padding tokens)
         attention_mask = torch.ones(inputs.shape, device=model.device)
-        print(f"Model input tokens: {inputs}")  # Debugging line
-        print(f"Attention mask: {attention_mask}")  # Debugging line
         # Generate response
         outputs = model.generate(
             inputs,
-            attention_mask=attention_mask,  # Pass the attention mask here
             max_new_tokens=max_new_tokens,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
         )
-        print(f"Model output tokens: {outputs}")  # Debugging line
         # Decode and strip input prompt from response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         generated_text = response[len(instruction):].strip()
-        print(f"Instruction: {instruction}")  # Debugging line
-        print(f"Generated Response: {generated_text}")  # Debugging line
         return generated_text
     except Exception as e:
         print(f"Error generating response: {e}")
         raise ValueError(f"Error generating response: {e}")
 @app.post("/generate")
 async def generate_text(input: ModelInput):
     try:
-        print(f"Received prompt: {input.prompt}")  # Log the prompt received
         response = generate_response(
             model=model,
             tokenizer=tokenizer,
             instruction=input.prompt,
             max_new_tokens=input.max_new_tokens
         )
-        print(f"Generated response: {response}")  # Log the generated response
         return {"generated_text": response}
     except Exception as e:
-        print(f"Error: {str(e)}")  # Log the error
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the Model API!"}

 # Application object; the @app.post / @app.get decorators further down
 # register their handlers on it.
 app = FastAPI()
 # Define model paths
 # BASE_MODEL_PATH is handed to from_pretrained() for both the model and the
 # tokenizer; ADAPTER_PATH is the repo_id given to snapshot_download().
+BASE_MODEL_PATH = "HuggingFaceTB/SmolLM2-135M-Instruct"
+ADAPTER_PATH = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
 def load_model_and_tokenizer():
+    """Load the model, tokenizer, and adapter weights."""
     # NOTE(review): this listing is a diff view and the lines that fall between
     # hunks are not shown. In particular the statement that defines
     # `state_dict` and the `except` clause that binds `e` are not visible
     # here - confirm both against the complete file.
     try:
         print("Loading base model...")
         # Requests float16 weights and leaves device placement to
         # device_map="auto".
         model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             torch_dtype=torch.float16,
             trust_remote_code=True,
             device_map="auto"
         )
         print("Loading tokenizer...")
         # The tokenizer comes from the base model repo, not the adapter repo.
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
         print("Downloading adapter weights...")
+        adapter_path_local = snapshot_download(repo_id=ADAPTER_PATH)
         print("Loading adapter weights...")
         # Path of the safetensors file inside the downloaded snapshot.
         adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
         print("Applying adapter weights...")
         # strict=False: keys that are missing from, or unexpected in,
         # `state_dict` do not raise.
         # NOTE(review): if the adapter file's key names do not match the base
         # model's parameter names, nothing is applied and no error is shown -
         # verify the adapter actually takes effect.
         model.load_state_dict(state_dict, strict=False)
         print("Model and adapter loaded successfully!")
         return model, tokenizer
         # NOTE(review): the two lines below presumably sit inside an elided
         # `except Exception as e:` handler that logs and re-raises - confirm.
         print(f"Error during model loading: {e}")
         raise
+# Load model and tokenizer once at import time; the /generate handler passes these module-level objects to generate_response.
 model, tokenizer = load_model_and_tokenizer()
 def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
     """Generate a response from the model based on an instruction.

     Args:
         model: Causal language model exposing ``device`` and ``generate``.
         tokenizer: Tokenizer exposing ``encode``, ``decode`` and
             ``model_max_length``.
         instruction: Prompt text to complete.
         max_new_tokens: Upper bound on the number of tokens to generate.

     Returns:
         The decoded continuation with surrounding whitespace stripped. The
         prompt itself is never part of the returned text.

     Raises:
         ValueError: If encoding, generation or decoding fails. The original
             exception is chained as the cause.
     """
     try:
         # Truncation has to be requested explicitly; max_length on its own
         # does not shorten an over-long prompt.
         inputs = tokenizer.encode(
             instruction,
             return_tensors="pt",
             truncation=True,
             max_length=tokenizer.model_max_length,
         ).to(model.device)

         # A single unpadded sequence: every position is a real token.
         attention_mask = torch.ones_like(inputs)

         outputs = model.generate(
             inputs,
             attention_mask=attention_mask,
             max_new_tokens=max_new_tokens,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
         )

         # Remove the prompt by token count rather than by character count:
         # the decoded prompt can differ from the raw instruction (truncation,
         # tokenizer normalisation), which makes a character slice cut in the
         # wrong place.
         prompt_length = inputs.shape[-1]
         new_tokens = outputs[0][prompt_length:]
         return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
     except Exception as e:
         print(f"Error generating response: {e}")
         raise ValueError(f"Error generating response: {e}") from e
 @app.post("/generate")
 async def generate_text(input: ModelInput):
     """Run the model on the request's prompt and return the generated text.

     Responds with ``{"generated_text": ...}``. Any exception raised along the
     way is logged and reported to the client as an HTTP 500 whose detail is
     the error message.
     """
     try:
         prompt = input.prompt
         print(f"Received prompt: {prompt}")
         generated = generate_response(
             model, tokenizer, prompt, input.max_new_tokens
         )
         print(f"Generated response: {generated}")
         return {"generated_text": generated}
     except Exception as exc:
         message = str(exc)
         print(f"Error: {message}")
         raise HTTPException(status_code=500, detail=message)
 @app.get("/")
 async def root():
     """Serve the welcome message at the API root."""
     welcome = {"message": "Welcome to the Model API!"}
     return welcome