AkashDataScience committed on
Commit dfae4e0 · 1 Parent(s): 23690d0

First commit
app.py ADDED
@@ -0,0 +1,59 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from peft import PeftModel
+ import torch
+ import clip
+ from PIL import Image
+ import torch.nn as nn
+ from model import Projections
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import gradio as gr
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ projections = Projections(512, 3072)
+ projections.load_state_dict(torch.load('checkpoint_dir/checkpoint-6000/projection_layer/pytorch_model.bin', map_location=device), strict=False)
+ projections = projections.to(device)
+ projections = projections.to(torch.bfloat16)
+
+ checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
+ model_kwargs = dict(
+     use_cache=False,
+     trust_remote_code=True,
+     attn_implementation='eager',
+     torch_dtype=torch.bfloat16,
+     device_map=None
+ )
+ base_model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
+
+ new_model = "checkpoint_dir/checkpoint-6000/phi_model"  # change to the path where your model is saved
+
+ model = PeftModel.from_pretrained(base_model, new_model)
+ model = model.merge_and_unload()
+ model = model.to(device)
+
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)
+ tokenizer.model_max_length = 2048
+ tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
+ tokenizer.padding_side = 'right'
+ tokenizer.chat_template = "{% for message in messages %}{% if message['from'] == 'system' %}{{'<|system|>' + message['value'] + '<|end|>'}}{% elif message['from'] ==\
+ 'human' %}{{'<|user|>' + message['value'] + '<|end|>'}}{% elif message['from'] == 'gpt' %}{{'<|assistant|>' + message['value'] +\
+ '<|end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}"
+
+ clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
+
+ # Load Whisper model and processor
+ whisper_model_name = "openai/whisper-small"
+ whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+
+ def infer(message, history):
+     return message.keys()
+
+ examples = [{'text': "I am planning to buy a dog and a cat. Suggest some breeds that get along with each other"},
+             {'text': "Explain biased coin flip"},
+             {'text': "I want to buy a house. Suggest some factors to consider while making final decision"}]
+
+ gr.ChatInterface(infer, chatbot=gr.Chatbot(height=600),
+                  textbox=gr.Textbox(placeholder="How can I help you today", container=False,
+                                     scale=7), theme="soft", examples=examples, undo_btn=None,
+                  title="Phi-3 Multimodal Assistant").launch()
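Note that the `infer` handler in this first commit is a stub: it echoes the keys of the incoming message instead of running the models the script loads (CLIP plus the projection layer for images, Whisper for audio, the merged Phi-3 for generation). As a rough illustration of how those pieces could be wired together, here is a minimal sketch of a text/audio handler. It assumes the interface is created with multimodal=True so `message` arrives as a dict with 'text' and 'files' keys, assumes librosa is available for audio decoding, and the name `infer_sketch` and the generation settings are illustrative, not the repository's actual implementation:

def infer_sketch(message, history):
    # Hedged sketch, not the repo's implementation: handles text and audio only;
    # the image path (CLIP embedding -> projection layer -> Phi-3) is omitted here.
    text = message.get('text', '')
    for path in message.get('files', []):
        if path.endswith(('.wav', '.mp3', '.flac')):
            import librosa  # assumed dependency for audio decoding
            audio, _ = librosa.load(path, sr=16000)
            features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features
            transcript = whisper_processor.batch_decode(whisper_model.generate(features), skip_special_tokens=True)[0]
            text = transcript + "\n" + text  # prepend the transcription to the user prompt
    # Format with the custom chat template defined above ('from'/'value' message schema)
    chat = [{'from': 'human', 'value': text}]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)
    # Strip the prompt tokens and return only the newly generated reply
    return tokenizer.decode(output[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)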
checkpoint_dir/ERA_V2_S30.ipynb ADDED
@@ -0,0 +1,974 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "C_YSfsRILGPG",
+ "tags": [],
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "107ed765-da2b-4d6e-e562-43c5573d8566"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "fatal: destination path 'multi_model_phi_3' already exists and is not an empty directory.\n"
+ ]
+ }
+ ],
+ "source": [
+ "!git clone https://github.com/AkashDataScience/multi_model_phi_3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CBVAhJBULs5R",
+ "tags": [],
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "3843bc68-eac9-45aa-a061-b284ae3ddefd"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/multi_model_phi_3\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd multi_model_phi_3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "75koL8tzLxKS",
+ "tags": [],
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "7c5217f4-e70b-4d6d-bda3-65e94878b0e5"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting clip@ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 (from -r requirements.txt (line 2))\n",
+ " Using cached clip-1.0-py3-none-any.whl\n",
+ "Requirement already satisfied: bitsandbytes==0.43.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (0.43.3)\n",
+ "Requirement already satisfied: colorama==0.4.6 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (0.4.6)\n",
+ "Requirement already satisfied: datasets==3.0.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (3.0.0)\n",
+ "Requirement already satisfied: dill==0.3.8 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (0.3.8)\n",
+ "Requirement already satisfied: multiprocess==0.70.16 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (0.70.16)\n",
+ "Requirement already satisfied: numpy==1.26.4 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (1.26.4)\n",
+ "Requirement already satisfied: pandas==2.2.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 8)) (2.2.2)\n",
+ "Requirement already satisfied: peft==0.12.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 9)) (0.12.0)\n",
+ "Requirement already satisfied: shtab==1.7.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 10)) (1.7.1)\n",
+ "Requirement already satisfied: tokenizers==0.19.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 11)) (0.19.1)\n",
+ "Requirement already satisfied: torch==2.4.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 12)) (2.4.1+cu121)\n",
+ "Requirement already satisfied: torchvision==0.19.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 13)) (0.19.1+cu121)\n",
+ "Requirement already satisfied: tqdm==4.66.5 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 14)) (4.66.5)\n",
+ "Requirement already satisfied: transformers==4.44.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 15)) (4.44.2)\n",
+ "Requirement already satisfied: treelib==1.7.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 16)) (1.7.0)\n",
+ "Requirement already satisfied: trl==0.10.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 17)) (0.10.1)\n",
+ "Requirement already satisfied: typing_extensions==4.12.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 18)) (4.12.2)\n",
+ "Requirement already satisfied: tyro==0.8.10 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 19)) (0.8.10)\n",
+ "Requirement already satisfied: tzdata==2024.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 20)) (2024.1)\n",
+ "Requirement already satisfied: urllib3==2.2.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 21)) (2.2.3)\n",
+ "Requirement already satisfied: wcwidth==0.2.13 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 22)) (0.2.13)\n",
+ "Requirement already satisfied: xxhash==3.5.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 23)) (3.5.0)\n",
+ "Requirement already satisfied: yarl==1.11.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 24)) (1.11.1)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (3.16.1)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (16.1.0)\n",
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (2.32.3)\n",
+ "Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==3.0.0->-r requirements.txt (line 4)) (2024.6.1)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (3.10.5)\n",
+ "Requirement already satisfied: huggingface-hub>=0.22.0 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (0.24.7)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (6.0.2)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.2->-r requirements.txt (line 8)) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.2->-r requirements.txt (line 8)) (2024.2)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (5.9.5)\n",
+ "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (0.34.2)\n",
+ "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (0.4.5)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (1.13.3)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (3.3)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (3.1.4)\n",
+ "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision==0.19.1->-r requirements.txt (line 13)) (10.4.0)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.44.2->-r requirements.txt (line 15)) (2024.9.11)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from treelib==1.7.0->-r requirements.txt (line 16)) (1.16.0)\n",
+ "Requirement already satisfied: docstring-parser>=0.16 in /usr/local/lib/python3.10/dist-packages (from tyro==0.8.10->-r requirements.txt (line 19)) (0.16)\n",
+ "Requirement already satisfied: rich>=11.1.0 in /usr/local/lib/python3.10/dist-packages (from tyro==0.8.10->-r requirements.txt (line 19)) (13.9.1)\n",
+ "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.10/dist-packages (from yarl==1.11.1->-r requirements.txt (line 24)) (3.10)\n",
+ "Requirement already satisfied: multidict>=4.0 in /usr/local/lib/python3.10/dist-packages (from yarl==1.11.1->-r requirements.txt (line 24)) (6.1.0)\n",
+ "Requirement already satisfied: ftfy in /usr/local/lib/python3.10/dist-packages (from clip@ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1->-r requirements.txt (line 2)) (6.2.3)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (1.4.1)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (4.0.3)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets==3.0.0->-r requirements.txt (line 4)) (3.3.2)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets==3.0.0->-r requirements.txt (line 4)) (2024.8.30)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (3.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (2.18.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch==2.4.1->-r requirements.txt (line 12)) (2.1.5)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch==2.4.1->-r requirements.txt (line 12)) (1.3.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (0.1.2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QauI2fQjWWTg",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "fa8e0f93-d988-4108-93f7-15a7281a1b21"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/multi_model_phi_3/image_finetuning/finetuning\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd image_finetuning/finetuning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!wget -c https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json"
+ ],
+ "metadata": {
+ "id": "koXJ8mCciYYn",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "ad30c91c-59f7-48c3-c6c9-7cd824dbdaaf"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "--2024-10-10 15:16:24-- https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json\n",
+ "Resolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.59, 3.165.160.11, ...\n",
+ "Connecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\n",
+ "HTTP request sent, awaiting response... 302 Found\n",
+ "Location: https://cdn-lfs.hf.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1728832584&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODgzMjU4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=hSNSHtz4qcHoAKL%7EBQFjBgq04GmcG2H-ajjYJrixr%7EHufuWwWQMy5AcuLKkDmolFgE8M82AnKQ08idN5ZvzJcgcoyt4QLWmrwLFRMnkORPQNFAoZk9FKvkthxfpdIdLtTZoPb6BqMg5l4SeggvOSC5q8%7EtfC5ASQMw%7ExqIqSGPTo9yIb-CfLXyE3Ceef8E7MIfW8s796ZpgilPx1zhl4cx8s2DyieL84KckvhYxf2Lc5MRBZnUdl0sUuvHBlC7SCr5lB2v-W1veTiqwur9fSpQ4uawD1BApft-zlSA84DnjssFWqhBa-T49X5-P2fGLmwAPcyVlUT17%7EvhHc-reAJg__&Key-Pair-Id=K3RPWS32NSSJCE [following]\n",
+ "--2024-10-10 15:16:25-- https://cdn-lfs.hf.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1728832584&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODgzMjU4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=hSNSHtz4qcHoAKL%7EBQFjBgq04GmcG2H-ajjYJrixr%7EHufuWwWQMy5AcuLKkDmolFgE8M82AnKQ08idN5ZvzJcgcoyt4QLWmrwLFRMnkORPQNFAoZk9FKvkthxfpdIdLtTZoPb6BqMg5l4SeggvOSC5q8%7EtfC5ASQMw%7ExqIqSGPTo9yIb-CfLXyE3Ceef8E7MIfW8s796ZpgilPx1zhl4cx8s2DyieL84KckvhYxf2Lc5MRBZnUdl0sUuvHBlC7SCr5lB2v-W1veTiqwur9fSpQ4uawD1BApft-zlSA84DnjssFWqhBa-T49X5-P2fGLmwAPcyVlUT17%7EvhHc-reAJg__&Key-Pair-Id=K3RPWS32NSSJCE\n",
+ "Resolving cdn-lfs.hf.co (cdn-lfs.hf.co)... 18.172.170.21, 18.172.170.29, 18.172.170.5, ...\n",
+ "Connecting to cdn-lfs.hf.co (cdn-lfs.hf.co)|18.172.170.21|:443... connected.\n",
+ "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
+ "\n",
+ " The file is already fully retrieved; nothing to do.\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "B325mAHNtJCB",
+ "tags": [],
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000,
+ "referenced_widgets": [
+ "f8108b8c120d4f49ac2b0fa38d6213c3",
+ "868fd8ed1fb5432ab6d6761b4e4ce17d",
+ "f75b8c1cb15a4dc9871b28a286dd3b82",
+ "a08098910db44feabc38e65bf4a55379",
+ "13be5acd97ab44babec61cedcf5b2a3a",
+ "239ab0a871684811ae7a3e16daa8991a",
+ "4ca7774f57454d74bd1b7c9445030038",
+ "557ecfce51574b8db9236bdc8d0bd555",
+ "bc0e09ab397f42879b8c874eb10e6a2b",
+ "8dec270eb8c649649f6b95ddde159a0f",
+ "f9d65533a8fd4310b1466713c22d8255"
+ ]
+ },
+ "outputId": "1a2eb33f-62b2-4f96-d3f5-365869a0aa0b"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False\n",
+ "INFO:__main__:Training/evaluation parameters TrainingArguments(\n",
+ "_n_gpu=1,\n",
+ "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
+ "adafactor=False,\n",
+ "adam_beta1=0.9,\n",
+ "adam_beta2=0.999,\n",
+ "adam_epsilon=1e-08,\n",
+ "auto_find_batch_size=False,\n",
+ "batch_eval_metrics=False,\n",
+ "bf16=True,\n",
+ "bf16_full_eval=False,\n",
+ "data_seed=None,\n",
+ "dataloader_drop_last=False,\n",
+ "dataloader_num_workers=0,\n",
+ "dataloader_persistent_workers=False,\n",
+ "dataloader_pin_memory=True,\n",
+ "dataloader_prefetch_factor=None,\n",
+ "ddp_backend=None,\n",
+ "ddp_broadcast_buffers=None,\n",
+ "ddp_bucket_cap_mb=None,\n",
+ "ddp_find_unused_parameters=None,\n",
+ "ddp_timeout=1800,\n",
+ "debug=[],\n",
+ "deepspeed=None,\n",
+ "disable_tqdm=False,\n",
+ "dispatch_batches=None,\n",
+ "do_eval=False,\n",
+ "do_predict=False,\n",
+ "do_train=False,\n",
+ "eval_accumulation_steps=None,\n",
+ "eval_delay=0,\n",
+ "eval_do_concat_batches=True,\n",
+ "eval_on_start=False,\n",
+ "eval_steps=None,\n",
+ "eval_strategy=no,\n",
+ "eval_use_gather_object=False,\n",
+ "evaluation_strategy=None,\n",
+ "fp16=False,\n",
+ "fp16_backend=auto,\n",
+ "fp16_full_eval=False,\n",
+ "fp16_opt_level=O1,\n",
+ "fsdp=[],\n",
+ "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
+ "fsdp_min_num_params=0,\n",
+ "fsdp_transformer_layer_cls_to_wrap=None,\n",
+ "full_determinism=False,\n",
+ "gradient_accumulation_steps=1,\n",
+ "gradient_checkpointing=True,\n",
+ "gradient_checkpointing_kwargs={'use_reentrant': False},\n",
+ "greater_is_better=None,\n",
+ "group_by_length=False,\n",
+ "half_precision_backend=auto,\n",
+ "hub_always_push=False,\n",
+ "hub_model_id=None,\n",
+ "hub_private_repo=False,\n",
+ "hub_strategy=every_save,\n",
+ "hub_token=<HUB_TOKEN>,\n",
+ "ignore_data_skip=False,\n",
+ "include_inputs_for_metrics=False,\n",
+ "include_num_input_tokens_seen=False,\n",
+ "include_tokens_per_second=False,\n",
+ "jit_mode_eval=False,\n",
+ "label_names=None,\n",
+ "label_smoothing_factor=0.0,\n",
+ "learning_rate=5e-06,\n",
+ "length_column_name=length,\n",
+ "load_best_model_at_end=False,\n",
+ "local_rank=0,\n",
+ "log_level=info,\n",
+ "log_level_replica=warning,\n",
+ "log_on_each_node=True,\n",
+ "logging_dir=./checkpoint_dir/runs/Oct10_15-16-37_33ba61f47fc9,\n",
+ "logging_first_step=False,\n",
+ "logging_nan_inf_filter=True,\n",
+ "logging_steps=20,\n",
+ "logging_strategy=steps,\n",
+ "lr_scheduler_kwargs={},\n",
+ "lr_scheduler_type=cosine,\n",
+ "max_grad_norm=1.0,\n",
+ "max_steps=60,\n",
+ "metric_for_best_model=None,\n",
+ "mp_parameters=,\n",
+ "neftune_noise_alpha=None,\n",
+ "no_cuda=False,\n",
+ "num_train_epochs=1,\n",
+ "optim=adamw_torch,\n",
+ "optim_args=None,\n",
+ "optim_target_modules=None,\n",
+ "output_dir=./checkpoint_dir,\n",
+ "overwrite_output_dir=True,\n",
+ "past_index=-1,\n",
+ "per_device_eval_batch_size=4,\n",
+ "per_device_train_batch_size=4,\n",
+ "prediction_loss_only=False,\n",
+ "push_to_hub=False,\n",
+ "push_to_hub_model_id=None,\n",
+ "push_to_hub_organization=None,\n",
+ "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+ "ray_scope=last,\n",
+ "remove_unused_columns=False,\n",
+ "report_to=['tensorboard'],\n",
+ "restore_callback_states_from_checkpoint=False,\n",
+ "resume_from_checkpoint=None,\n",
+ "run_name=./checkpoint_dir,\n",
+ "save_on_each_node=False,\n",
+ "save_only_model=False,\n",
+ "save_safetensors=True,\n",
+ "save_steps=60,\n",
+ "save_strategy=steps,\n",
+ "save_total_limit=1,\n",
+ "seed=0,\n",
+ "skip_memory_metrics=True,\n",
+ "split_batches=None,\n",
+ "tf32=None,\n",
+ "torch_compile=False,\n",
+ "torch_compile_backend=None,\n",
+ "torch_compile_mode=None,\n",
+ "torch_empty_cache_steps=None,\n",
+ "torchdynamo=None,\n",
+ "tpu_metrics_debug=False,\n",
+ "tpu_num_cores=None,\n",
+ "use_cpu=False,\n",
+ "use_ipex=False,\n",
+ "use_legacy_prediction_loop=False,\n",
+ "use_mps_device=False,\n",
+ "warmup_ratio=0.2,\n",
+ "warmup_steps=0,\n",
+ "weight_decay=0.0,\n",
+ ")\n",
+ "INFO:__main__:PEFT parameters LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules={'qkv_proj', 'o_proj'}, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))\n",
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+ "You will be able to reuse this secret in all of your notebooks.\n",
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+ " warnings.warn(\n",
+ "[INFO|configuration_utils.py:733] 2024-10-10 15:16:39,682 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json\n",
+ "[INFO|configuration_utils.py:733] 2024-10-10 15:16:39,856 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json\n",
+ "[INFO|configuration_utils.py:800] 2024-10-10 15:16:39,858 >> Model config Phi3Config {\n",
+ " \"_name_or_path\": \"microsoft/phi-3-mini-4k-instruct\",\n",
+ " \"architectures\": [\n",
+ " \"Phi3ForCausalLM\"\n",
+ " ],\n",
+ " \"attention_bias\": false,\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"auto_map\": {\n",
+ " \"AutoConfig\": \"microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config\",\n",
+ " \"AutoModelForCausalLM\": \"microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM\"\n",
+ " },\n",
+ " \"bos_token_id\": 1,\n",
+ " \"embd_pdrop\": 0.0,\n",
+ " \"eos_token_id\": 32000,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 3072,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 8192,\n",
+ " \"max_position_embeddings\": 4096,\n",
+ " \"model_type\": \"phi3\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_hidden_layers\": 32,\n",
+ " \"num_key_value_heads\": 32,\n",
+ " \"original_max_position_embeddings\": 4096,\n",
+ " \"pad_token_id\": 32000,\n",
+ " \"resid_pdrop\": 0.0,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_scaling\": null,\n",
+ " \"rope_theta\": 10000.0,\n",
+ " \"sliding_window\": 2047,\n",
+ " \"tie_word_embeddings\": false,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.44.2\",\n",
+ " \"use_cache\": false,\n",
+ " \"vocab_size\": 32064\n",
+ "}\n",
+ "\n",
+ "WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.\n",
+ "WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.\n",
+ "[INFO|modeling_utils.py:3678] 2024-10-10 15:16:40,221 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/model.safetensors.index.json\n",
+ "[INFO|modeling_utils.py:1606] 2024-10-10 15:16:40,225 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.\n",
+ "[INFO|configuration_utils.py:1038] 2024-10-10 15:16:40,228 >> Generate config GenerationConfig {\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 32000,\n",
+ " \"pad_token_id\": 32000,\n",
+ " \"use_cache\": false\n",
+ "}\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "f8108b8c120d4f49ac2b0fa38d6213c3"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[INFO|modeling_utils.py:4507] 2024-10-10 15:17:11,062 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.\n",
+ "\n",
+ "[INFO|modeling_utils.py:4515] 2024-10-10 15:17:11,070 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/phi-3-mini-4k-instruct.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.\n",
+ "[INFO|configuration_utils.py:993] 2024-10-10 15:17:11,251 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json\n",
+ "[INFO|configuration_utils.py:1038] 2024-10-10 15:17:11,253 >> Generate config GenerationConfig {\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": [\n",
+ " 32000,\n",
+ " 32001,\n",
+ " 32007\n",
+ " ],\n",
+ " \"pad_token_id\": 32000\n",
+ "}\n",
+ "\n",
+ "[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,768 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.model\n",
+ "[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,769 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.json\n",
+ "[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,771 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/added_tokens.json\n",
+ "[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,772 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/special_tokens_map.json\n",
+ "[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,775 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer_config.json\n",
+ "[INFO|tokenization_utils_base.py:2513] 2024-10-10 15:17:11,857 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+ "/content/multi_model_phi_3/image_finetuning/finetuning/model.py:39: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " self.projections.load_state_dict(torch.load(projection_path, map_location=device), strict=False)\n",
+ "Using custom data configuration default-559b28e319de0343\n",
+ "INFO:datasets.builder:Using custom data configuration default-559b28e319de0343\n",
+ "Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json\n",
+ "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json\n",
+ "Overwrite dataset info from restored data version if exists.\n",
+ "INFO:datasets.builder:Overwrite dataset info from restored data version if exists.\n",
+ "Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
+ "INFO:datasets.info:Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
+ "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)\n",
+ "INFO:datasets.builder:Found cached dataset json (/root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)\n",
+ "Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
+ "INFO:datasets.info:Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
+ "Process #0 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00000_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #0 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00000_of_00010.arrow\n",
+ "Process #1 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00001_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #1 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00001_of_00010.arrow\n",
+ "Process #2 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00002_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #2 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00002_of_00010.arrow\n",
+ "Process #3 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00003_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #3 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00003_of_00010.arrow\n",
+ "Process #4 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00004_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #4 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00004_of_00010.arrow\n",
+ "Process #5 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00005_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #5 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00005_of_00010.arrow\n",
+ "Process #6 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00006_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #6 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00006_of_00010.arrow\n",
+ "Process #7 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00007_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #7 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00007_of_00010.arrow\n",
+ "Process #8 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00008_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #8 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00008_of_00010.arrow\n",
+ "Process #9 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00009_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Process #9 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00009_of_00010.arrow\n",
+ "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_*_of_00010.arrow\n",
+ "INFO:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_*_of_00010.arrow\n",
+ "Concatenating 10 shards\n",
+ "INFO:datasets.arrow_dataset:Concatenating 10 shards\n",
+ "/content/multi_model_phi_3/image_finetuning/finetuning/dataset.py:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " self.image_embeddings = torch.load('clip_embeddings.pt')\n",
+ "[WARNING|trainer.py:598] 2024-10-10 15:17:18,521 >> max_steps is given, it will override any value given in num_train_epochs\n",
+ "[INFO|trainer.py:648] 2024-10-10 15:17:18,522 >> Using auto half precision backend\n",
+ "[INFO|trainer.py:2134] 2024-10-10 15:17:19,563 >> ***** Running training *****\n",
+ "[INFO|trainer.py:2135] 2024-10-10 15:17:19,565 >> Num examples = 141,941\n",
+ "[INFO|trainer.py:2136] 2024-10-10 15:17:19,570 >> Num Epochs = 1\n",
+ "[INFO|trainer.py:2137] 2024-10-10 15:17:19,571 >> Instantaneous batch size per device = 4\n",
+ "[INFO|trainer.py:2140] 2024-10-10 15:17:19,573 >> Total train batch size (w. parallel, distributed & accumulation) = 4\n",
+ "[INFO|trainer.py:2141] 2024-10-10 15:17:19,574 >> Gradient Accumulation steps = 1\n",
+ "[INFO|trainer.py:2142] 2024-10-10 15:17:19,576 >> Total optimization steps = 60\n",
+ "[INFO|trainer.py:2143] 2024-10-10 15:17:19,580 >> Number of trainable parameters = 124,302,336\n",
+ "WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:You are not running the flash-attention implementation, expect numerical differences.\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]\n",
+ "[WARNING|modeling_utils.py:1264] 2024-10-10 15:17:29,290 >> Could not estimate the number of tokens of the input, floating-point operations will not be computed\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ],
+ "text/html": [
+ "\n",
+ " <div>\n",
+ " \n",
+ " <progress value='60' max='60' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [60/60 15:44, Epoch 0/1]\n",
+ " </div>\n",
+ " <table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: left;\">\n",
+ " <th>Step</th>\n",
+ " <th>Training Loss</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>9.531900</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>40</td>\n",
+ " <td>10.267400</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>60</td>\n",
+ " <td>9.545700</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table><p>"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[INFO|trainer.py:3503] 2024-10-10 15:32:51,845 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-60\n",
+ "[INFO|configuration_utils.py:472] 2024-10-10 15:32:51,849 >> Configuration saved in ./checkpoint_dir/checkpoint-60/config.json\n",
+ "[INFO|modeling_utils.py:2799] 2024-10-10 15:33:11,817 >> Model weights saved in ./checkpoint_dir/checkpoint-60/model.safetensors\n",
+ "[INFO|tokenization_utils_base.py:2684] 2024-10-10 15:33:11,827 >> tokenizer config file saved in ./checkpoint_dir/checkpoint-60/tokenizer_config.json\n",
+ "[INFO|tokenization_utils_base.py:2693] 2024-10-10 15:33:11,830 >> Special tokens file saved in ./checkpoint_dir/checkpoint-60/special_tokens_map.json\n",
+ "[INFO|trainer.py:2394] 2024-10-10 15:33:13,911 >> \n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n",
+ "[INFO|trainer.py:3819] 2024-10-10 15:33:13,928 >> \n",
+ "***** Running Evaluation *****\n",
+ "[INFO|trainer.py:3821] 2024-10-10 15:33:13,931 >> Num examples = 15771\n",
+ "[INFO|trainer.py:3824] 2024-10-10 15:33:13,933 >> Batch size = 4\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "***** train metrics *****\n",
+ " epoch = 0.0017\n",
+ " total_flos = 0GF\n",
+ " train_loss = 9.7817\n",
+ " train_runtime = 0:15:54.33\n",
+ " train_samples_per_second = 0.251\n",
+ " train_steps_per_second = 0.063\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ],
+ "text/html": [
+ "\n",
+ " <div>\n",
+ " \n",
+ " <progress value='1158' max='3943' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [1158/3943 1:48:54 < 4:22:10, 0.18 it/s]\n",
+ " </div>\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ }
+ ],
+ "source": [
+ "%run finetune.py"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "f8108b8c120d4f49ac2b0fa38d6213c3": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_868fd8ed1fb5432ab6d6761b4e4ce17d",
+ "IPY_MODEL_f75b8c1cb15a4dc9871b28a286dd3b82",
+ "IPY_MODEL_a08098910db44feabc38e65bf4a55379"
+ ],
+ "layout": "IPY_MODEL_13be5acd97ab44babec61cedcf5b2a3a"
+ }
+ },
+ "868fd8ed1fb5432ab6d6761b4e4ce17d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_239ab0a871684811ae7a3e16daa8991a",
+ "placeholder": "​",
+ "style": "IPY_MODEL_4ca7774f57454d74bd1b7c9445030038",
+ "value": "Loading checkpoint shards: 100%"
+ }
+ },
+ "f75b8c1cb15a4dc9871b28a286dd3b82": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_557ecfce51574b8db9236bdc8d0bd555",
+ "max": 2,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_bc0e09ab397f42879b8c874eb10e6a2b",
+ "value": 2
+ }
+ },
+ "a08098910db44feabc38e65bf4a55379": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_8dec270eb8c649649f6b95ddde159a0f",
+ "placeholder": "​",
+ "style": "IPY_MODEL_f9d65533a8fd4310b1466713c22d8255",
+ "value": " 2/2 [00:30&lt;00:00, 14.75s/it]"
+ }
+ },
+ "13be5acd97ab44babec61cedcf5b2a3a": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "239ab0a871684811ae7a3e16daa8991a": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "4ca7774f57454d74bd1b7c9445030038": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "557ecfce51574b8db9236bdc8d0bd555": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "bc0e09ab397f42879b8c874eb10e6a2b": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "8dec270eb8c649649f6b95ddde159a0f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f9d65533a8fd4310b1466713c22d8255": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ }
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
checkpoint_dir/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: microsoft/phi-3-mini-4k-instruct
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.12.0
checkpoint_dir/adapter_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "qkv_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
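
The adapter config above maps one-to-one onto a `peft.LoraConfig`. A minimal sketch of the equivalent Python declaration follows; it is a reconstruction for readers, not the repo's actual training script, which is not part of this commit:

```python
from peft import LoraConfig, TaskType

# Mirrors checkpoint_dir/adapter_config.json: rank-16 LoRA with alpha 32 and
# 5% dropout, applied to Phi-3's attention projections (qkv_proj, o_proj).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["o_proj", "qkv_proj"],
    task_type=TaskType.CAUSAL_LM,
)
```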
checkpoint_dir/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72ecb3d5d3e593ecf340dd7c5904c74c644cd04902f6705454c0f5223a399ddf
+ size 37766064
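
These three-line files are Git LFS pointers, not the weights themselves; only the SHA-256 and byte size are checked into the diff. One way to materialize such a file locally is via `huggingface_hub` (a sketch: the Space's repo id is not shown in this commit, so the id below is a placeholder):

```python
from huggingface_hub import hf_hub_download

# Hypothetical repo id -- replace <user>/<space> with the actual Space.
path = hf_hub_download(
    repo_id="<user>/<space>",
    filename="checkpoint_dir/adapter_model.safetensors",
    repo_type="space",
)
print(path)  # local cache path of the resolved LFS file
```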
checkpoint_dir/all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "epoch": 0.6762849413886384,
+ "eval_loss": 7.323873043060303,
+ "eval_runtime": 386.842,
+ "eval_samples": 3154,
+ "eval_samples_per_second": 8.153,
+ "eval_steps_per_second": 0.512,
+ "total_flos": 0.0,
+ "train_loss": 2.13714133199056,
+ "train_runtime": 37217.3738,
+ "train_samples_per_second": 2.579,
+ "train_steps_per_second": 0.161
+ }
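
Note the gap between the final training loss (~1.31 in the trainer logs below) and the eval loss of 7.32, which is worth sanity-checking when reproducing this run. Assuming, as is typical for Hugging Face `Trainer` runs, that these losses are mean token-level cross-entropy in nats, perplexity is just the exponential:

```python
import math

# Back-of-the-envelope check under the stated assumption.
eval_loss = 7.323873043060303
final_train_loss = 1.3081
print(math.exp(eval_loss))        # ~1516
print(math.exp(final_train_loss)) # ~3.7
```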
checkpoint_dir/checkpoint-6000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52b6c0d5af1fb0efa5ee1862caaee15f6ba008a7c7ba60d9d841995e490ff27f
+ size 535089466
checkpoint_dir/checkpoint-6000/phi_model/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: microsoft/phi-3-mini-4k-instruct
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.12.0
checkpoint_dir/checkpoint-6000/phi_model/adapter_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "qkv_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint_dir/checkpoint-6000/phi_model/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72ecb3d5d3e593ecf340dd7c5904c74c644cd04902f6705454c0f5223a399ddf
+ size 37766064
checkpoint_dir/checkpoint-6000/projection_layer/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16cce5f1f2b77da68f1e881a3cd1bff784aa48930991b1f0b52a3b81cc6f2923
+ size 229740738
checkpoint_dir/checkpoint-6000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50759f462f2cfd4149cdba36c5f0d942c8659cc7e9f4c6a09f5d75e2b1f5e160
+ size 14180
checkpoint_dir/checkpoint-6000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fca623b299b046cffde6dd6fc538bf592ad9cba4a26768597da5e62b84c1662
+ size 1064
checkpoint_dir/checkpoint-6000/trainer_state.json ADDED
@@ -0,0 +1,453 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.6762849413886384,
+ "eval_steps": 500,
+ "global_step": 6000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01127141568981064,
+ "grad_norm": 36.142295837402344,
+ "learning_rate": 4.1666666666666667e-07,
+ "loss": 12.6438,
+ "step": 100
+ },
+ {
+ "epoch": 0.02254283137962128,
+ "grad_norm": 30.892282485961914,
+ "learning_rate": 8.333333333333333e-07,
+ "loss": 12.4413,
+ "step": 200
+ },
+ {
+ "epoch": 0.03381424706943192,
+ "grad_norm": 45.94392395019531,
+ "learning_rate": 1.25e-06,
+ "loss": 11.4952,
+ "step": 300
+ },
+ {
+ "epoch": 0.04508566275924256,
+ "grad_norm": 19.69997787475586,
+ "learning_rate": 1.6666666666666667e-06,
+ "loss": 7.0421,
+ "step": 400
+ },
+ {
+ "epoch": 0.0563570784490532,
+ "grad_norm": 3.0256764888763428,
+ "learning_rate": 2.0833333333333334e-06,
+ "loss": 3.1071,
+ "step": 500
+ },
+ {
+ "epoch": 0.06762849413886383,
+ "grad_norm": 1.9536222219467163,
+ "learning_rate": 2.5e-06,
+ "loss": 2.7468,
+ "step": 600
+ },
+ {
+ "epoch": 0.07889990982867448,
+ "grad_norm": 3.0367178916931152,
+ "learning_rate": 2.916666666666667e-06,
+ "loss": 2.5461,
+ "step": 700
+ },
+ {
+ "epoch": 0.09017132551848513,
+ "grad_norm": 1.1410305500030518,
+ "learning_rate": 3.3333333333333333e-06,
+ "loss": 2.3206,
+ "step": 800
+ },
+ {
+ "epoch": 0.10144274120829576,
+ "grad_norm": 1.2108758687973022,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 2.036,
+ "step": 900
+ },
+ {
+ "epoch": 0.1127141568981064,
+ "grad_norm": 1.0124415159225464,
+ "learning_rate": 4.166666666666667e-06,
+ "loss": 1.9927,
+ "step": 1000
+ },
+ {
+ "epoch": 0.12398557258791704,
+ "grad_norm": 1.2103397846221924,
+ "learning_rate": 4.583333333333333e-06,
+ "loss": 1.8255,
+ "step": 1100
+ },
+ {
+ "epoch": 0.13525698827772767,
+ "grad_norm": 1.2905486822128296,
+ "learning_rate": 5e-06,
+ "loss": 1.6822,
+ "step": 1200
+ },
+ {
+ "epoch": 0.14652840396753833,
+ "grad_norm": 0.9415493607521057,
+ "learning_rate": 4.994647308096509e-06,
+ "loss": 1.6783,
+ "step": 1300
+ },
+ {
+ "epoch": 0.15779981965734896,
+ "grad_norm": 0.9576979279518127,
+ "learning_rate": 4.978612153434527e-06,
+ "loss": 1.5967,
+ "step": 1400
+ },
+ {
+ "epoch": 0.1690712353471596,
+ "grad_norm": 1.068982481956482,
+ "learning_rate": 4.9519632010080765e-06,
+ "loss": 1.5821,
+ "step": 1500
+ },
+ {
+ "epoch": 0.18034265103697025,
+ "grad_norm": 0.7473943829536438,
+ "learning_rate": 4.914814565722671e-06,
+ "loss": 1.5587,
+ "step": 1600
+ },
+ {
+ "epoch": 0.19161406672678089,
+ "grad_norm": 0.9740411043167114,
+ "learning_rate": 4.867325323737765e-06,
+ "loss": 1.525,
+ "step": 1700
+ },
+ {
+ "epoch": 0.20288548241659152,
+ "grad_norm": 0.6997997164726257,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 1.49,
+ "step": 1800
+ },
+ {
+ "epoch": 0.21415689810640218,
+ "grad_norm": 0.6754641532897949,
+ "learning_rate": 4.742181853831721e-06,
+ "loss": 1.4597,
+ "step": 1900
+ },
+ {
+ "epoch": 0.2254283137962128,
+ "grad_norm": 1.0468783378601074,
+ "learning_rate": 4.665063509461098e-06,
+ "loss": 1.4539,
+ "step": 2000
+ },
+ {
+ "epoch": 0.23669972948602344,
+ "grad_norm": 0.9208471179008484,
+ "learning_rate": 4.578674030756364e-06,
+ "loss": 1.4459,
+ "step": 2100
+ },
+ {
+ "epoch": 0.24797114517583407,
+ "grad_norm": 0.9443785548210144,
+ "learning_rate": 4.4833833507280884e-06,
+ "loss": 1.4437,
+ "step": 2200
+ },
+ {
+ "epoch": 0.25924256086564473,
+ "grad_norm": 1.0956292152404785,
+ "learning_rate": 4.379599518697444e-06,
+ "loss": 1.417,
+ "step": 2300
+ },
+ {
+ "epoch": 0.27051397655545534,
+ "grad_norm": 0.8564383387565613,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 1.4138,
+ "step": 2400
+ },
+ {
+ "epoch": 0.281785392245266,
+ "grad_norm": 0.8340147733688354,
+ "learning_rate": 4.1483645377501726e-06,
+ "loss": 1.3637,
+ "step": 2500
+ },
+ {
+ "epoch": 0.29305680793507666,
+ "grad_norm": 1.1379494667053223,
+ "learning_rate": 4.021903572521802e-06,
+ "loss": 1.4079,
+ "step": 2600
+ },
+ {
+ "epoch": 0.30432822362488726,
+ "grad_norm": 0.8148013353347778,
+ "learning_rate": 3.888925582549006e-06,
+ "loss": 1.3729,
+ "step": 2700
+ },
+ {
+ "epoch": 0.3155996393146979,
+ "grad_norm": 0.9854215383529663,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 1.3824,
+ "step": 2800
+ },
+ {
+ "epoch": 0.3268710550045086,
+ "grad_norm": 0.9190597534179688,
+ "learning_rate": 3.6057217255475034e-06,
+ "loss": 1.3879,
+ "step": 2900
+ },
+ {
+ "epoch": 0.3381424706943192,
+ "grad_norm": 1.1362121105194092,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 1.3939,
+ "step": 3000
+ },
+ {
+ "epoch": 0.34941388638412985,
+ "grad_norm": 1.2068192958831787,
+ "learning_rate": 3.303598663257904e-06,
+ "loss": 1.3463,
+ "step": 3100
+ },
+ {
+ "epoch": 0.3606853020739405,
+ "grad_norm": 0.952643632888794,
+ "learning_rate": 3.147047612756302e-06,
+ "loss": 1.3741,
+ "step": 3200
+ },
+ {
+ "epoch": 0.3719567177637511,
+ "grad_norm": 0.8026754260063171,
+ "learning_rate": 2.9877258050403214e-06,
+ "loss": 1.3704,
+ "step": 3300
+ },
+ {
+ "epoch": 0.38322813345356177,
+ "grad_norm": 0.8540117144584656,
+ "learning_rate": 2.82631548055013e-06,
+ "loss": 1.3904,
+ "step": 3400
+ },
+ {
+ "epoch": 0.39449954914337243,
+ "grad_norm": 0.9906865954399109,
+ "learning_rate": 2.663507823075358e-06,
+ "loss": 1.3523,
+ "step": 3500
+ },
+ {
+ "epoch": 0.40577096483318303,
+ "grad_norm": 0.8289706707000732,
+ "learning_rate": 2.5e-06,
+ "loss": 1.3519,
+ "step": 3600
+ },
+ {
+ "epoch": 0.4170423805229937,
+ "grad_norm": 1.0827723741531372,
+ "learning_rate": 2.3364921769246423e-06,
+ "loss": 1.3475,
+ "step": 3700
+ },
+ {
+ "epoch": 0.42831379621280435,
+ "grad_norm": 1.220688819885254,
+ "learning_rate": 2.173684519449872e-06,
+ "loss": 1.3243,
+ "step": 3800
+ },
+ {
+ "epoch": 0.43958521190261496,
+ "grad_norm": 1.0109795331954956,
+ "learning_rate": 2.01227419495968e-06,
+ "loss": 1.3444,
+ "step": 3900
+ },
+ {
+ "epoch": 0.4508566275924256,
+ "grad_norm": 1.041104793548584,
+ "learning_rate": 1.852952387243698e-06,
+ "loss": 1.3436,
+ "step": 4000
+ },
+ {
+ "epoch": 0.4621280432822362,
+ "grad_norm": 0.7376370429992676,
+ "learning_rate": 1.6964013367420967e-06,
+ "loss": 1.3002,
+ "step": 4100
+ },
+ {
+ "epoch": 0.4733994589720469,
+ "grad_norm": 0.8842127919197083,
+ "learning_rate": 1.5432914190872757e-06,
+ "loss": 1.3454,
+ "step": 4200
+ },
+ {
+ "epoch": 0.48467087466185754,
+ "grad_norm": 1.0636272430419922,
+ "learning_rate": 1.3942782744524974e-06,
+ "loss": 1.3396,
+ "step": 4300
+ },
+ {
+ "epoch": 0.49594229035166815,
+ "grad_norm": 1.2041317224502563,
+ "learning_rate": 1.2500000000000007e-06,
+ "loss": 1.335,
+ "step": 4400
+ },
+ {
+ "epoch": 0.5072137060414789,
+ "grad_norm": 0.9379550218582153,
+ "learning_rate": 1.1110744174509952e-06,
+ "loss": 1.3213,
+ "step": 4500
+ },
+ {
+ "epoch": 0.5184851217312895,
+ "grad_norm": 0.7874147891998291,
+ "learning_rate": 9.780964274781984e-07,
+ "loss": 1.3198,
+ "step": 4600
+ },
+ {
+ "epoch": 0.5297565374211001,
+ "grad_norm": 0.7258532047271729,
+ "learning_rate": 8.516354622498279e-07,
+ "loss": 1.2681,
+ "step": 4700
+ },
+ {
+ "epoch": 0.5410279531109107,
+ "grad_norm": 1.1035155057907104,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 1.3219,
+ "step": 4800
+ },
+ {
+ "epoch": 0.5522993688007214,
+ "grad_norm": 0.7815728187561035,
+ "learning_rate": 6.204004813025569e-07,
+ "loss": 1.3381,
+ "step": 4900
+ },
+ {
+ "epoch": 0.563570784490532,
+ "grad_norm": 1.0812482833862305,
+ "learning_rate": 5.166166492719124e-07,
+ "loss": 1.3322,
+ "step": 5000
+ },
+ {
+ "epoch": 0.5748422001803426,
+ "grad_norm": 0.9642081260681152,
+ "learning_rate": 4.2132596924363666e-07,
+ "loss": 1.3218,
+ "step": 5100
+ },
+ {
+ "epoch": 0.5861136158701533,
+ "grad_norm": 0.8039354085922241,
+ "learning_rate": 3.3493649053890325e-07,
+ "loss": 1.3216,
+ "step": 5200
+ },
+ {
+ "epoch": 0.5973850315599639,
+ "grad_norm": 0.8643052577972412,
+ "learning_rate": 2.5781814616827936e-07,
+ "loss": 1.3333,
+ "step": 5300
+ },
+ {
+ "epoch": 0.6086564472497745,
+ "grad_norm": 1.4110788106918335,
+ "learning_rate": 1.9030116872178317e-07,
+ "loss": 1.3412,
+ "step": 5400
+ },
+ {
+ "epoch": 0.6199278629395852,
+ "grad_norm": 0.7792391180992126,
+ "learning_rate": 1.3267467626223606e-07,
+ "loss": 1.3069,
+ "step": 5500
+ },
+ {
+ "epoch": 0.6311992786293958,
+ "grad_norm": 1.0475589036941528,
+ "learning_rate": 8.518543427732951e-08,
+ "loss": 1.3187,
+ "step": 5600
+ },
+ {
+ "epoch": 0.6424706943192064,
+ "grad_norm": 0.9902795553207397,
+ "learning_rate": 4.8036798991923925e-08,
+ "loss": 1.328,
+ "step": 5700
+ },
+ {
+ "epoch": 0.6537421100090172,
+ "grad_norm": 0.7977623343467712,
+ "learning_rate": 2.1387846565474047e-08,
+ "loss": 1.3175,
+ "step": 5800
+ },
+ {
+ "epoch": 0.6650135256988278,
+ "grad_norm": 0.872138261795044,
+ "learning_rate": 5.352691903491303e-09,
+ "loss": 1.3116,
+ "step": 5900
+ },
+ {
+ "epoch": 0.6762849413886384,
+ "grad_norm": 0.8640491366386414,
+ "learning_rate": 0.0,
+ "loss": 1.3081,
+ "step": 6000
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 6000,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
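
The `log_history` list above records a clean warmup-then-decay schedule (linear warmup to 5e-06 at step 1200, cosine-like decay to 0 at step 6000) and a loss that falls from 12.6 to ~1.31. A minimal sketch for visualizing it; `matplotlib` is not pinned in requirements.txt, so it is assumed to be installed separately:

```python
import json
import matplotlib.pyplot as plt

# Assumes the file layout shown above: a "log_history" list of dicts,
# each with "step" and (for training entries) "loss".
with open("checkpoint_dir/checkpoint-6000/trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("Phi-3 multimodal fine-tune, steps 100-6000")
plt.show()
```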
checkpoint_dir/config.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+ "architectures": [
+ "Phi3ForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
+ "AutoModelForCausalLM": "microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
+ },
+ "bos_token_id": 1,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 32000,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 4096,
+ "model_type": "phi3",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "original_max_position_embeddings": 4096,
+ "pad_token_id": 32000,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "uint8",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "resid_pdrop": 0.0,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "sliding_window": 2047,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.2",
+ "use_cache": false,
+ "vocab_size": 32064
+ }
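
The `quantization_config` block above shows the model was trained with 4-bit NF4 bitsandbytes quantization (double quantization on, bfloat16 compute). A sketch of the equivalent `transformers` loading code; this is a reconstruction from the config, not necessarily the repo's own script:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the "quantization_config" recorded in checkpoint_dir/config.json.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
```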
checkpoint_dir/eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 0.6762849413886384,
+ "eval_loss": 7.323873043060303,
+ "eval_runtime": 386.842,
+ "eval_samples": 3154,
+ "eval_samples_per_second": 8.153,
+ "eval_steps_per_second": 0.512
+ }
checkpoint_dir/image_projector.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aecea00b5c6bae9364ea91f91668bdd520b9aa8596318d8fad8cbdc846442502
+ size 229740802
checkpoint_dir/train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 0.6762849413886384,
+ "total_flos": 0.0,
+ "train_loss": 2.13714133199056,
+ "train_runtime": 37217.3738,
+ "train_samples_per_second": 2.579,
+ "train_steps_per_second": 0.161
+ }
checkpoint_dir/trainer_state.json ADDED
@@ -0,0 +1,462 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.6762849413886384,
+ "eval_steps": 500,
+ "global_step": 6000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01127141568981064,
+ "grad_norm": 36.142295837402344,
+ "learning_rate": 4.1666666666666667e-07,
+ "loss": 12.6438,
+ "step": 100
+ },
+ {
+ "epoch": 0.02254283137962128,
+ "grad_norm": 30.892282485961914,
+ "learning_rate": 8.333333333333333e-07,
+ "loss": 12.4413,
+ "step": 200
+ },
+ {
+ "epoch": 0.03381424706943192,
+ "grad_norm": 45.94392395019531,
+ "learning_rate": 1.25e-06,
+ "loss": 11.4952,
+ "step": 300
+ },
+ {
+ "epoch": 0.04508566275924256,
+ "grad_norm": 19.69997787475586,
+ "learning_rate": 1.6666666666666667e-06,
+ "loss": 7.0421,
+ "step": 400
+ },
+ {
+ "epoch": 0.0563570784490532,
+ "grad_norm": 3.0256764888763428,
+ "learning_rate": 2.0833333333333334e-06,
+ "loss": 3.1071,
+ "step": 500
+ },
+ {
+ "epoch": 0.06762849413886383,
+ "grad_norm": 1.9536222219467163,
+ "learning_rate": 2.5e-06,
+ "loss": 2.7468,
+ "step": 600
+ },
+ {
+ "epoch": 0.07889990982867448,
+ "grad_norm": 3.0367178916931152,
+ "learning_rate": 2.916666666666667e-06,
+ "loss": 2.5461,
+ "step": 700
+ },
+ {
+ "epoch": 0.09017132551848513,
+ "grad_norm": 1.1410305500030518,
+ "learning_rate": 3.3333333333333333e-06,
+ "loss": 2.3206,
+ "step": 800
+ },
+ {
+ "epoch": 0.10144274120829576,
+ "grad_norm": 1.2108758687973022,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 2.036,
+ "step": 900
+ },
+ {
+ "epoch": 0.1127141568981064,
+ "grad_norm": 1.0124415159225464,
+ "learning_rate": 4.166666666666667e-06,
+ "loss": 1.9927,
+ "step": 1000
+ },
+ {
+ "epoch": 0.12398557258791704,
+ "grad_norm": 1.2103397846221924,
+ "learning_rate": 4.583333333333333e-06,
+ "loss": 1.8255,
+ "step": 1100
+ },
+ {
+ "epoch": 0.13525698827772767,
+ "grad_norm": 1.2905486822128296,
+ "learning_rate": 5e-06,
+ "loss": 1.6822,
+ "step": 1200
+ },
+ {
+ "epoch": 0.14652840396753833,
+ "grad_norm": 0.9415493607521057,
+ "learning_rate": 4.994647308096509e-06,
+ "loss": 1.6783,
+ "step": 1300
+ },
+ {
+ "epoch": 0.15779981965734896,
+ "grad_norm": 0.9576979279518127,
+ "learning_rate": 4.978612153434527e-06,
+ "loss": 1.5967,
+ "step": 1400
+ },
+ {
+ "epoch": 0.1690712353471596,
+ "grad_norm": 1.068982481956482,
+ "learning_rate": 4.9519632010080765e-06,
+ "loss": 1.5821,
+ "step": 1500
+ },
+ {
+ "epoch": 0.18034265103697025,
+ "grad_norm": 0.7473943829536438,
+ "learning_rate": 4.914814565722671e-06,
+ "loss": 1.5587,
+ "step": 1600
+ },
+ {
+ "epoch": 0.19161406672678089,
+ "grad_norm": 0.9740411043167114,
+ "learning_rate": 4.867325323737765e-06,
+ "loss": 1.525,
+ "step": 1700
+ },
+ {
+ "epoch": 0.20288548241659152,
+ "grad_norm": 0.6997997164726257,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 1.49,
+ "step": 1800
+ },
+ {
+ "epoch": 0.21415689810640218,
+ "grad_norm": 0.6754641532897949,
+ "learning_rate": 4.742181853831721e-06,
+ "loss": 1.4597,
+ "step": 1900
+ },
+ {
+ "epoch": 0.2254283137962128,
+ "grad_norm": 1.0468783378601074,
+ "learning_rate": 4.665063509461098e-06,
+ "loss": 1.4539,
+ "step": 2000
+ },
+ {
+ "epoch": 0.23669972948602344,
+ "grad_norm": 0.9208471179008484,
+ "learning_rate": 4.578674030756364e-06,
+ "loss": 1.4459,
+ "step": 2100
+ },
+ {
+ "epoch": 0.24797114517583407,
+ "grad_norm": 0.9443785548210144,
+ "learning_rate": 4.4833833507280884e-06,
+ "loss": 1.4437,
+ "step": 2200
+ },
+ {
+ "epoch": 0.25924256086564473,
+ "grad_norm": 1.0956292152404785,
+ "learning_rate": 4.379599518697444e-06,
+ "loss": 1.417,
+ "step": 2300
+ },
+ {
+ "epoch": 0.27051397655545534,
+ "grad_norm": 0.8564383387565613,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 1.4138,
+ "step": 2400
+ },
+ {
+ "epoch": 0.281785392245266,
+ "grad_norm": 0.8340147733688354,
+ "learning_rate": 4.1483645377501726e-06,
+ "loss": 1.3637,
+ "step": 2500
+ },
+ {
+ "epoch": 0.29305680793507666,
+ "grad_norm": 1.1379494667053223,
+ "learning_rate": 4.021903572521802e-06,
+ "loss": 1.4079,
+ "step": 2600
+ },
+ {
+ "epoch": 0.30432822362488726,
+ "grad_norm": 0.8148013353347778,
+ "learning_rate": 3.888925582549006e-06,
+ "loss": 1.3729,
+ "step": 2700
+ },
+ {
+ "epoch": 0.3155996393146979,
+ "grad_norm": 0.9854215383529663,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 1.3824,
+ "step": 2800
+ },
+ {
+ "epoch": 0.3268710550045086,
+ "grad_norm": 0.9190597534179688,
+ "learning_rate": 3.6057217255475034e-06,
+ "loss": 1.3879,
+ "step": 2900
+ },
+ {
+ "epoch": 0.3381424706943192,
+ "grad_norm": 1.1362121105194092,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 1.3939,
+ "step": 3000
+ },
+ {
+ "epoch": 0.34941388638412985,
+ "grad_norm": 1.2068192958831787,
+ "learning_rate": 3.303598663257904e-06,
+ "loss": 1.3463,
+ "step": 3100
+ },
+ {
+ "epoch": 0.3606853020739405,
+ "grad_norm": 0.952643632888794,
+ "learning_rate": 3.147047612756302e-06,
+ "loss": 1.3741,
+ "step": 3200
+ },
+ {
+ "epoch": 0.3719567177637511,
+ "grad_norm": 0.8026754260063171,
+ "learning_rate": 2.9877258050403214e-06,
+ "loss": 1.3704,
+ "step": 3300
+ },
+ {
+ "epoch": 0.38322813345356177,
+ "grad_norm": 0.8540117144584656,
+ "learning_rate": 2.82631548055013e-06,
+ "loss": 1.3904,
+ "step": 3400
+ },
+ {
+ "epoch": 0.39449954914337243,
+ "grad_norm": 0.9906865954399109,
+ "learning_rate": 2.663507823075358e-06,
+ "loss": 1.3523,
+ "step": 3500
+ },
+ {
+ "epoch": 0.40577096483318303,
+ "grad_norm": 0.8289706707000732,
+ "learning_rate": 2.5e-06,
+ "loss": 1.3519,
+ "step": 3600
+ },
+ {
+ "epoch": 0.4170423805229937,
+ "grad_norm": 1.0827723741531372,
+ "learning_rate": 2.3364921769246423e-06,
+ "loss": 1.3475,
+ "step": 3700
+ },
+ {
+ "epoch": 0.42831379621280435,
+ "grad_norm": 1.220688819885254,
+ "learning_rate": 2.173684519449872e-06,
+ "loss": 1.3243,
+ "step": 3800
+ },
+ {
+ "epoch": 0.43958521190261496,
+ "grad_norm": 1.0109795331954956,
+ "learning_rate": 2.01227419495968e-06,
+ "loss": 1.3444,
+ "step": 3900
+ },
+ {
+ "epoch": 0.4508566275924256,
+ "grad_norm": 1.041104793548584,
+ "learning_rate": 1.852952387243698e-06,
+ "loss": 1.3436,
+ "step": 4000
+ },
+ {
+ "epoch": 0.4621280432822362,
+ "grad_norm": 0.7376370429992676,
+ "learning_rate": 1.6964013367420967e-06,
+ "loss": 1.3002,
+ "step": 4100
+ },
+ {
+ "epoch": 0.4733994589720469,
+ "grad_norm": 0.8842127919197083,
+ "learning_rate": 1.5432914190872757e-06,
+ "loss": 1.3454,
+ "step": 4200
+ },
+ {
+ "epoch": 0.48467087466185754,
+ "grad_norm": 1.0636272430419922,
+ "learning_rate": 1.3942782744524974e-06,
+ "loss": 1.3396,
+ "step": 4300
+ },
+ {
+ "epoch": 0.49594229035166815,
+ "grad_norm": 1.2041317224502563,
+ "learning_rate": 1.2500000000000007e-06,
+ "loss": 1.335,
+ "step": 4400
+ },
+ {
+ "epoch": 0.5072137060414789,
+ "grad_norm": 0.9379550218582153,
+ "learning_rate": 1.1110744174509952e-06,
+ "loss": 1.3213,
+ "step": 4500
+ },
+ {
+ "epoch": 0.5184851217312895,
+ "grad_norm": 0.7874147891998291,
+ "learning_rate": 9.780964274781984e-07,
+ "loss": 1.3198,
+ "step": 4600
+ },
+ {
+ "epoch": 0.5297565374211001,
+ "grad_norm": 0.7258532047271729,
+ "learning_rate": 8.516354622498279e-07,
+ "loss": 1.2681,
+ "step": 4700
+ },
+ {
+ "epoch": 0.5410279531109107,
+ "grad_norm": 1.1035155057907104,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 1.3219,
+ "step": 4800
+ },
+ {
+ "epoch": 0.5522993688007214,
+ "grad_norm": 0.7815728187561035,
+ "learning_rate": 6.204004813025569e-07,
+ "loss": 1.3381,
+ "step": 4900
+ },
+ {
+ "epoch": 0.563570784490532,
+ "grad_norm": 1.0812482833862305,
+ "learning_rate": 5.166166492719124e-07,
+ "loss": 1.3322,
+ "step": 5000
+ },
+ {
+ "epoch": 0.5748422001803426,
+ "grad_norm": 0.9642081260681152,
+ "learning_rate": 4.2132596924363666e-07,
+ "loss": 1.3218,
+ "step": 5100
+ },
+ {
+ "epoch": 0.5861136158701533,
+ "grad_norm": 0.8039354085922241,
+ "learning_rate": 3.3493649053890325e-07,
+ "loss": 1.3216,
+ "step": 5200
+ },
+ {
+ "epoch": 0.5973850315599639,
+ "grad_norm": 0.8643052577972412,
+ "learning_rate": 2.5781814616827936e-07,
+ "loss": 1.3333,
+ "step": 5300
+ },
+ {
+ "epoch": 0.6086564472497745,
+ "grad_norm": 1.4110788106918335,
+ "learning_rate": 1.9030116872178317e-07,
+ "loss": 1.3412,
+ "step": 5400
+ },
+ {
+ "epoch": 0.6199278629395852,
+ "grad_norm": 0.7792391180992126,
+ "learning_rate": 1.3267467626223606e-07,
+ "loss": 1.3069,
+ "step": 5500
+ },
+ {
+ "epoch": 0.6311992786293958,
+ "grad_norm": 1.0475589036941528,
+ "learning_rate": 8.518543427732951e-08,
+ "loss": 1.3187,
+ "step": 5600
+ },
+ {
+ "epoch": 0.6424706943192064,
+ "grad_norm": 0.9902795553207397,
+ "learning_rate": 4.8036798991923925e-08,
+ "loss": 1.328,
+ "step": 5700
+ },
+ {
+ "epoch": 0.6537421100090172,
+ "grad_norm": 0.7977623343467712,
+ "learning_rate": 2.1387846565474047e-08,
+ "loss": 1.3175,
+ "step": 5800
+ },
+ {
+ "epoch": 0.6650135256988278,
+ "grad_norm": 0.872138261795044,
+ "learning_rate": 5.352691903491303e-09,
+ "loss": 1.3116,
+ "step": 5900
+ },
+ {
+ "epoch": 0.6762849413886384,
+ "grad_norm": 0.8640491366386414,
+ "learning_rate": 0.0,
+ "loss": 1.3081,
+ "step": 6000
+ },
+ {
+ "epoch": 0.6762849413886384,
+ "step": 6000,
+ "total_flos": 0.0,
+ "train_loss": 2.13714133199056,
+ "train_runtime": 37217.3738,
+ "train_samples_per_second": 2.579,
+ "train_steps_per_second": 0.161
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 6000,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
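
The bookkeeping above also pins down the approximate training-set size: 6000 optimizer steps covered 0.676 of an epoch at `train_batch_size` 16. A rough derivation, assuming a single device and no gradient accumulation (neither is recorded in this state file, so both are assumptions):

```python
global_step = 6000
epoch = 0.6762849413886384
train_batch_size = 16

steps_per_epoch = global_step / epoch                    # ~8872 steps per full epoch
approx_train_samples = steps_per_epoch * train_batch_size
print(round(approx_train_samples))                       # ~141952 samples, under the stated assumptions
```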
model.py ADDED
@@ -0,0 +1,27 @@
+ import torch.nn as nn
+
+ class Projections(nn.Module):
+     """Projects CLIP image embeddings into the Phi-3 embedding space."""
+     def __init__(self, clip_embed, phi_embed, num_projection_layers=6):
+         super().__init__()
+
+         # Linear map from the CLIP width to the Phi-3 hidden size, then LayerNorm.
+         self.output = nn.Linear(clip_embed, phi_embed)
+         self.norm = nn.LayerNorm(phi_embed)
+         # Residual two-layer GELU MLP blocks that refine the projected embedding.
+         self.projection_layers = nn.ModuleList(
+             [
+                 nn.Sequential(
+                     nn.Linear(phi_embed, phi_embed),
+                     nn.GELU(),
+                     nn.Linear(phi_embed, phi_embed),
+                 )
+                 for _ in range(num_projection_layers)
+             ]
+         )
+
+     def forward(self, x):
+         x = self.output(x)
+         x = self.norm(x)
+         for layer in self.projection_layers:
+             residual = x
+             x = layer(x) + residual
+
+         return x
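
A quick shape-only smoke test of the projector; CLIP ViT-B/32 produces 512-dimensional image embeddings and Phi-3 mini's hidden size is 3072 (see checkpoint_dir/config.json), which matches how the module is instantiated in this repo:

```python
import torch
from model import Projections

# Random weights -- this only verifies shapes, not trained behavior.
projector = Projections(clip_embed=512, phi_embed=3072)
dummy_clip_embedding = torch.randn(1, 512)
out = projector(dummy_clip_embedding)
print(out.shape)  # torch.Size([1, 3072])
```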
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ bitsandbytes==0.43.3
+ clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
+ colorama==0.4.6
+ datasets==3.0.0
+ dill==0.3.8
+ multiprocess==0.70.16
+ numpy==1.26.4
+ pandas==2.2.2
+ peft==0.12.0
+ shtab==1.7.1
+ tokenizers==0.19.1
+ torch==2.4.1
+ torchvision==0.19.1
+ tqdm==4.66.5
+ transformers==4.44.2
+ treelib==1.7.0
+ trl==0.10.1
+ typing_extensions==4.12.2
+ tyro==0.8.10
+ tzdata==2024.1
+ urllib3==2.2.3
+ wcwidth==0.2.13
+ xxhash==3.5.0
+ yarl==1.11.1