mjschock committed on
Commit 518aafe · unverified · 1 Parent(s): b21080c

Add serve.py for model deployment and API integration, update requirements.txt for smolagents with vllm support, and enhance .gitignore to exclude memory snapshot files. Additionally, implement testing configuration in config.yaml and modify train.py for memory tracking and model saving in VLLM format.

Files changed (5)
  1. .gitignore +1 -0
  2. conf/config.yaml +9 -0
  3. requirements.txt +1 -1
  4. serve.py +365 -0
  5. train.py +149 -7
.gitignore CHANGED
@@ -1,6 +1,7 @@
 .env
 logs
 lora_model
+memory_snapshot.pickle
 outputs
 __pycache__
 .pytest_cache
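The new ignore entry covers the CUDA memory snapshots that train.py now writes via torch.cuda.memory._dump_snapshot. A minimal sketch of inspecting such a snapshot offline; the snapshot is a pickled dict, but the exact key names ("segments", "total_size") are assumptions about the snapshot format, and the file can also be loaded into the interactive viewer at https://pytorch.org/memory_viz:

import pickle

# Load a snapshot written by torch.cuda.memory._dump_snapshot("memory_snapshot.pickle").
with open("memory_snapshot.pickle", "rb") as f:
    snapshot = pickle.load(f)

# Summarize allocator segments (key names assumed, not guaranteed by the API).
segments = snapshot.get("segments", [])
total_bytes = sum(seg.get("total_size", 0) for seg in segments)
print(f"{len(segments)} segments, ~{total_bytes / 1024**2:.1f} MB reserved")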
conf/config.yaml CHANGED
@@ -70,3 +70,12 @@ output:
 
 # Training control
 train: false
+
+# Testing configuration
+test: true # Whether to run testing after training
+test_dataset:
+  name: "gaia-benchmark/GAIA"
+  config: "2023_level1" # Use level 1 questions for testing
+  split: "test" # Use test split for testing
+  max_samples: 10 # Number of samples to test on
+  max_length: 2048 # Maximum sequence length for testing
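A minimal sketch of how this new testing block might be read in train.py through Hydra/OmegaConf; the decorator arguments (config_path="conf", config_name="config") mirror the repo layout but are assumptions:

import hydra
from omegaconf import DictConfig


@hydra.main(config_path="conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    # Keys added by this commit.
    if cfg.test:
        print(cfg.test_dataset.name)         # "gaia-benchmark/GAIA"
        print(cfg.test_dataset.config)       # "2023_level1"
        print(cfg.test_dataset.split)        # "test"
        print(cfg.test_dataset.max_samples)  # 10


if __name__ == "__main__":
    main()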
requirements.txt CHANGED
@@ -27,7 +27,7 @@ pytest-cov>=6.1.1
 python-dotenv>=1.0.0
 requests>=2.32.3
 sentence-transformers>=4.1.0
-smolagents[litellm,telemetry]>=1.14.0
+smolagents[litellm,telemetry,vllm]>=1.14.0
 tensorboardX>=2.6.2.2
 trl>=0.17.0
 typing-extensions>=4.5.0
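The added vllm extra is what lets train.py import smolagents' VLLMModel. A minimal usage sketch; the model id is a placeholder and in this repo would be cfg.model.name or the merged checkpoint in cfg.output.dir:

from smolagents import CodeAgent, VLLMModel

# Placeholder model id (assumption), standing in for cfg.model.name.
model = VLLMModel(model_id="Qwen/Qwen2.5-0.5B-Instruct")
agent = CodeAgent(model=model, tools=[])
print(agent.run("What is 2 + 2?"))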
serve.py ADDED
@@ -0,0 +1,365 @@
+import asyncio
+import logging
+import os
+import time
+from pprint import pprint
+from threading import Thread
+from typing import Any, Dict, List
+
+from fastapi import FastAPI, Request
+from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+from openai.types.chat.chat_completion_chunk import Choice as ChatCompletionChunkChoice
+from openai.types.chat.chat_completion_chunk import ChoiceDelta
+from openai.types.chat.chat_completion_message import ChatCompletionMessage
+from openai.types.chat.completion_create_params import CompletionCreateParams
+from pydantic import TypeAdapter
+from ray import serve
+from sse_starlette import EventSourceResponse
+from starlette.responses import JSONResponse
+from transformers.generation.streamers import AsyncTextIteratorStreamer
+from transformers.image_utils import load_image
+from unsloth import FastVisionModel
+
+dtype = (
+    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+)
+load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+max_seq_length = 2048  # Supports RoPE Scaling internally, so choose any!
+# max_seq_length = 4096  # Choose any! We auto support RoPE Scaling internally!
+
+
+logger = logging.getLogger("ray.serve")
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+app = FastAPI()
+
+# middlewares = [
+#     middleware
+#     for middleware in ConnexionMiddleware.default_middlewares
+#     if middleware is not SecurityMiddleware
+# ]
+
+# connexion_app = AsyncApp(import_name=__name__, middlewares=middlewares)
+
+# connexion_app.add_api(
+#     # "api/openai/v1/openapi/openapi.yaml",
+#     "api/v1/openapi/openapi.yaml",
+#     # base_path="/openai/v1",
+#     base_path="/v1",
+#     pythonic_params=True,
+#     resolver_error=501,
+# )
+
+# # fastapi_app.mount("/api", ConnexionMiddleware(app=connexion_app, import_name=__name__))
+# # app.mount("/api", ConnexionMiddleware(app=connexion_app, import_name=__name__))
+# app.mount(
+#     "/",
+#     ConnexionMiddleware(
+#         app=connexion_app,
+#         import_name=__name__,
+#         # middlewares=middlewares,
+#     ),
+# )
+
+
+@serve.deployment(
+    autoscaling_config={
+        # https://docs.ray.io/en/latest/serve/advanced-guides/advanced-autoscaling.html#required-define-upper-and-lower-autoscaling-limits
+        "max_replicas": 1,
+        "min_replicas": 1,  # TODO: set to 0
+        "target_ongoing_requests": 2,  # https://docs.ray.io/en/latest/serve/advanced-guides/advanced-autoscaling.html#target-ongoing-requests-default-2
+    },
+    max_ongoing_requests=5,  # https://docs.ray.io/en/latest/serve/advanced-guides/advanced-autoscaling.html#max-ongoing-requests-default-5
+    ray_actor_options={"num_gpus": 1},
+)
+@serve.ingress(app)
+class ModelDeployment:
+    def __init__(
+        self,
+        model_name: str,
+    ):
+        self.model_name = model_name
+
+        model, processor = FastVisionModel.from_pretrained(
+            load_in_4bit=load_in_4bit,
+            max_seq_length=max_seq_length,
+            model_name=self.model_name,
+        )
+
+        with open("chat_template.txt", "r") as f:
+            processor.chat_template = f.read()
+        processor.tokenizer.chat_template = processor.chat_template
+
+        FastVisionModel.for_inference(model)  # Enable native 2x faster inference
+
+        self.model = model
+        self.processor = processor
+
+    def reconfigure(self, config: Dict[str, Any]):
+        print("=== reconfigure ===")
+        print("config:")
+        print(config)
+        # https://docs.ray.io/en/latest/serve/production-guide/config.html#dynamically-change-parameters-without-restarting-replicas-user-config
+
+    @app.post("/v1/chat/completions")
+    async def create_chat_completion(self, body: dict, raw_request: Request):
+        """Creates a model response for the given chat conversation. Learn more in the [text generation](/docs/guides/text-generation), [vision](/docs/guides/vision), and [audio](/docs/guides/audio) guides. Parameter support can differ depending on the model used to generate the response, particularly for newer reasoning models. Parameters that are only supported for reasoning models are noted below. For the current state of unsupported parameters in reasoning models, [refer to the reasoning guide](/docs/guides/reasoning).
+
+        # noqa: E501
+
+        :param create_chat_completion_request:
+        :type create_chat_completion_request: dict | bytes
+
+        :rtype: Union[CreateChatCompletionResponse, Tuple[CreateChatCompletionResponse, int], Tuple[CreateChatCompletionResponse, int, Dict[str, str]]
+        """
+        print("=== create_chat_completion ===")
+
+        print("body:")
+        pprint(body)
+
+        ta = TypeAdapter(CompletionCreateParams)
+
+        print("ta.validate_python...")
+        pprint(ta.validate_python(body))
+
+        max_new_tokens = body.get("max_completion_tokens", body.get("max_tokens"))
+        messages = body.get("messages")
+        model_name = body.get("model")
+        stream = body.get("stream", False)
+        temperature = body.get("temperature")
+        tools = body.get("tools")
+
+        images = []
+
+        for message in messages:
+            for content in message["content"]:
+                if "type" in content and content["type"] == "image_url":
+                    image_url = content["image_url"]["url"]
+                    image = load_image(image_url)
+                    images.append(image)
+
+                    content["type"] = "image"
+                    del content["image_url"]
+
+        images = images if images else None
+
+        if model_name != self.model_name:
+            # adapter_path = model_name
+            # self.model.load_adapter(adapter_path)
+
+            return JSONResponse(content={"error": "Model not found"}, status_code=404)
+
+        prompt = self.processor.apply_chat_template(
+            add_generation_prompt=True,
+            conversation=messages,
+            # documents=documents,
+            tools=tools,
+        )
+
+        print("prompt:")
+        print(prompt)
+
+        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+        inputs = inputs.to(self.model.device)
+        input_ids = inputs.input_ids
+
+        class GeneratorThread(Thread):
+            """Thread to generate completions in the background."""
+
+            def __init__(self, model, **generation_kwargs):
+                super().__init__()
+
+                self.chat_completion = None
+                self.generation_kwargs = generation_kwargs
+                self.model = model
+
+            def run(self):
+                import torch
+                import torch._dynamo.config
+
+                try:
+                    try:
+                        self.generated_ids = self.model.generate(
+                            **self.generation_kwargs
+                        )
+
+                    except torch._dynamo.exc.BackendCompilerFailed as e:
+                        print(e)
+                        print("Disabling dynamo...")
+
+                        torch._dynamo.config.disable = True
+
+                        self.generated_ids = self.model.generate(
+                            **self.generation_kwargs
+                        )
+
+                except Exception as e:
+                    print(e)
+                    print("Warning: Exception in GeneratorThread")
+                    self.generated_ids = []
+
+            def join(self, timeout=None):
+                super().join()
+
+                return self.generated_ids
+
+        decode_kwargs = dict(skip_special_tokens=True)
+
+        streamer = (
+            AsyncTextIteratorStreamer(
+                self.processor,
+                skip_prompt=True,
+                **decode_kwargs,
+            )
+            if stream
+            else None
+        )
+
+        generation_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            streamer=streamer,
+            temperature=temperature,
+            use_cache=True,
+        )
+
+        thread = GeneratorThread(self.model, **generation_kwargs)
+        thread.start()
+
+        if stream:
+
+            async def event_publisher():
+                i = 0
+
+                try:
+                    async for new_text in streamer:
+                        print("new_text:")
+                        print(new_text)
+
+                        choices: List[ChatCompletionChunkChoice] = [
+                            ChatCompletionChunkChoice(
+                                _request_id=None,
+                                delta=ChoiceDelta(
+                                    _request_id=None,
+                                    content=new_text,
+                                    function_call=None,
+                                    refusal=None,
+                                    role="assistant",
+                                    tool_calls=None,
+                                ),
+                                finish_reason=None,
+                                index=0,
+                                logprobs=None,
+                            )
+                        ]
+
+                        chat_completion_chunk = ChatCompletionChunk(
+                            _request_id=None,
+                            choices=choices,
+                            created=int(time.time()),
+                            id=str(i),
+                            model=model_name,
+                            object="chat.completion.chunk",
+                            service_tier=None,
+                            system_fingerprint=None,
+                            usage=None,
+                        )
+
+                        yield chat_completion_chunk.model_dump_json()
+
+                        i += 1
+
+                except asyncio.CancelledError as e:
+                    print("Disconnected from client (via refresh/close)")
+                    raise e
+
+                except Exception as e:
+                    print(f"Exception: {e}")
+                    raise e
+
+            return EventSourceResponse(event_publisher())
+
+        generated_ids = thread.join()
+        input_length = input_ids.shape[1]
+
+        batch_decoded_outputs = self.processor.batch_decode(
+            generated_ids[:, input_length:],
+            skip_special_tokens=True,
+        )
+
+        choices: List[ChatCompletionChoice] = []
+
+        for i, response in enumerate(batch_decoded_outputs):
+            print("response:")
+            print(response)
+
+            # try:
+            #     response = json.loads(response)
+
+            #     finish_reason: str = response.get("finish_reason")
+            #     tool_calls_json = response.get("tool_calls")
+            #     tool_calls: List[ToolCall] = []
+
+            #     for tool_call_json in tool_calls_json:
+            #         tool_call = ToolCall(
+            #             function=FunctionToolCallArguments(
+            #                 arguments=tool_call_json.get("arguments"),
+            #                 name=tool_call_json.get("name"),
+            #             ),
+            #             id=tool_call_json.get("id"),
+            #             type="function",
+            #         )
+
+            #         tool_calls.append(tool_call)
+
+            #     message: ChatMessage = ChatMessage(
+            #         role="assistant",
+            #         tool_calls=tool_calls,
+            #     )
+
+            # except json.JSONDecodeError:
+            #     finish_reason: str = "stop"
+            #     message: ChatMessage = ChatMessage(
+            #         role="assistant",
+            #         content=response,
+            #     )
+
+            message = ChatCompletionMessage(
+                audio=None,
+                content=response,
+                refusal=None,
+                role="assistant",
+                tool_calls=None,
+            )
+
+            choices.append(
+                ChatCompletionChoice(
+                    index=i,
+                    finish_reason="stop",
+                    logprobs=None,
+                    message=message,
+                )
+            )
+
+        chat_completion = ChatCompletion(
+            choices=choices,
+            created=int(time.time()),
+            id="1",
+            model=model_name,
+            object="chat.completion",
+            service_tier=None,
+            system_fingerprint=None,
+            usage=None,
+        )
+
+        return chat_completion.model_dump(mode="json")
+
+
+def build_app(cli_args: Dict[str, str]) -> serve.Application:
+    """Builds the Serve app based on CLI arguments."""
+    return ModelDeployment.options().bind(
+        cli_args.get("model_name"),
+    )
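A minimal sketch of calling the deployment's /v1/chat/completions route with the OpenAI Python client once the Serve app built by build_app is running; the base URL assumes Ray Serve's default local HTTP address on port 8000, and the model must match the model_name bound in build_app or the handler returns 404:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="<model_name bound in build_app>",  # placeholder; must equal the deployment's model_name
    messages=[{"role": "user", "content": [{"type": "text", "text": "Hello, who are you?"}]}],
    max_tokens=64,
)
print(response.choices[0].message.content)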
train.py CHANGED
@@ -40,8 +40,16 @@ from transformers import (
     DataCollatorForLanguageModeling,
     Trainer,
     TrainingArguments,
+    AutoModelForCausalLM,
 )
 from trl import SFTTrainer
+from peft import PeftModel
+from smolagents import CodeAgent, Model, TransformersModel, VLLMModel
+from tools.smart_search.tool import SmartSearchTool
+from smolagents.monitoring import LogLevel
+import torch
+import os
+
 
 # Setup logging
 def setup_logging():
@@ -130,7 +138,9 @@ def load_and_format_dataset(
     logger.info(f"Dataset loaded successfully. Size: {len(dataset)} examples")
 
     # Split into train and validation sets
-    dataset = dataset.train_test_split(test_size=cfg.dataset.validation_split, seed=cfg.dataset.seed)
+    dataset = dataset.train_test_split(
+        test_size=cfg.dataset.validation_split, seed=cfg.dataset.seed
+    )
     logger.info(
         f"Dataset split into train ({len(dataset['train'])} examples) and validation ({len(dataset['test'])} examples) sets"
     )
@@ -188,10 +198,12 @@ def create_trainer(
     # Create TrainingArguments from config
     training_args_dict = OmegaConf.to_container(cfg.training.args, resolve=True)
     # Add dynamic precision settings
-    training_args_dict.update({
-        "fp16": not is_bfloat16_supported(),
-        "bf16": is_bfloat16_supported(),
-    })
+    training_args_dict.update(
+        {
+            "fp16": not is_bfloat16_supported(),
+            "bf16": is_bfloat16_supported(),
+        }
+    )
     training_args = TrainingArguments(**training_args_dict)
 
     # Create data collator from config
@@ -202,7 +214,7 @@ def create_trainer(
 
     # Create SFT config without data_collator to avoid duplication
     sft_config = OmegaConf.to_container(cfg.training.sft, resolve=True)
-    sft_config.pop('data_collator', None)  # Remove data_collator from config
+    sft_config.pop("data_collator", None)  # Remove data_collator from config
 
     trainer = SFTTrainer(
         model=model,
@@ -247,15 +259,145 @@ def main(cfg: DictConfig) -> None:
             # Save model
             logger.info(f"Saving final model to {cfg.output.dir}...")
             trainer.save_model(cfg.output.dir)
+
+            # Save model in VLLM format
+            logger.info("Saving model in VLLM format...")
+            model.save_pretrained_merged(
+                cfg.output.dir,
+                tokenizer,
+                save_method="merged_16bit"
+            )
 
             # Print final metrics
             final_metrics = trainer.state.log_history[-1]
             logger.info("\nTraining completed!")
             logger.info(f"Final training loss: {final_metrics.get('loss', 'N/A')}")
-            logger.info(f"Final validation loss: {final_metrics.get('eval_loss', 'N/A')}")
+            logger.info(
+                f"Final validation loss: {final_metrics.get('eval_loss', 'N/A')}"
+            )
         else:
             logger.info("Training skipped as train=False")
 
+        # Test if requested
+        if cfg.test:
+            logger.info("\nStarting testing...")
+            try:
+                # Enable memory history tracking
+                torch.cuda.memory._record_memory_history()
+
+                # Set memory allocation configuration
+                os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'
+
+                # Load test dataset
+                test_dataset = load_dataset(
+                    cfg.test_dataset.name,
+                    cfg.test_dataset.config,
+                    split=cfg.test_dataset.split,
+                    trust_remote_code=True,
+                )
+                logger.info(f"Loaded test dataset with {len(test_dataset)} examples")
+                logger.info(f"Dataset features: {test_dataset.features}")
+
+                # Clear CUDA cache before loading model
+                torch.cuda.empty_cache()
+
+                # Initialize model
+                model: Model = Model(
+                    model_id=cfg.model.name,
+                    # model_id=cfg.output.dir,
+                )
+
+                # model: Model = TransformersModel(
+                #     model_id=cfg.model.name,
+                #     # model_id=cfg.output.dir,
+                # )
+
+                # model: Model = VLLMModel(
+                #     model_id=cfg.model.name,
+                #     # model_id=cfg.output.dir,
+                # )
+
+                # Create CodeAgent with SmartSearchTool
+                agent = CodeAgent(
+                    model=model,
+                    tools=[SmartSearchTool()],
+                    verbosity_level=LogLevel.ERROR,
+                )
+
+                # Format task to get succinct answer
+                def format_task(question):
+                    return f"""Please provide two answers to the following question:
+
+1. A succinct answer that follows these rules:
+- Contains ONLY the answer, nothing else
+- Does not repeat the question
+- Does not include explanations, reasoning, or context
+- Does not include source attribution or references
+- Does not use phrases like "The answer is" or "I found that"
+- Does not include formatting, bullet points, or line breaks
+- If the answer is a number, return only the number
+- If the answer requires multiple items, separate them with commas
+- If the answer requires ordering, maintain the specified order
+- Uses the most direct and succinct form possible
+
+2. A verbose answer that includes:
+- The complete answer with all relevant details
+- Explanations and reasoning
+- Context and background information
+- Source attribution where appropriate
+
+Question: {question}
+
+Please format your response as a JSON object with two keys:
+- "succinct_answer": The concise answer following the rules above
+- "verbose_answer": The detailed explanation with context"""
+
+                # Run inference on test samples
+                logger.info("Running inference on test samples...")
+                for i, example in enumerate(test_dataset):
+                    try:
+                        # Clear CUDA cache before each sample
+                        torch.cuda.empty_cache()
+
+                        # Format the task
+                        task = format_task(example['Question'])
+
+                        # Run the agent
+                        result = agent.run(
+                            task=task,
+                            max_steps=3,
+                            reset=True,
+                            stream=False,
+                        )
+
+                        # Parse the result
+                        import json
+                        json_str = result[result.find("{"):result.rfind("}")+1]
+                        parsed_result = json.loads(json_str)
+                        answer = parsed_result["succinct_answer"]
+
+                        logger.info(f"\nTest Sample {i+1}:")
+                        logger.info(f"Question: {example['Question']}")
+                        logger.info(f"Model Response: {answer}")
+                        logger.info("-" * 80)
+
+                        # Log memory usage after each sample
+                        logger.info(f"Memory usage after sample {i+1}:")
+                        logger.info(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+                        logger.info(f"Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
+
+                    except Exception as e:
+                        logger.error(f"Error processing test sample {i+1}: {str(e)}")
+                        continue
+
+                # Dump memory snapshot for analysis
+                torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
+                logger.info("Memory snapshot saved to memory_snapshot.pickle")
+
+            except Exception as e:
+                logger.error(f"Error during testing: {e}")
+                raise
+
     except Exception as e:
         logger.error(f"Error in main training process: {e}")
         raise
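Since the trainer now exports a merged 16-bit checkpoint via save_pretrained_merged, that directory can be loaded directly by vLLM. A minimal sketch, assuming cfg.output.dir resolves to "outputs"; the path and prompt are placeholders, not values from this repo:

from vllm import LLM, SamplingParams

# Hypothetical path: the merged checkpoint written by model.save_pretrained_merged(cfg.output.dir, ...).
llm = LLM(model="outputs")
outputs = llm.generate(
    ["Question: What is 2 + 2?\nAnswer:"],
    SamplingParams(max_tokens=16),
)
print(outputs[0].outputs[0].text)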