File size: 5,869 Bytes
4f0d565 |
1 |
{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[{"file_id":"https://huggingface.co/datasets/codeShare/lora-training-data/blob/main/parquet_explorer.ipynb","timestamp":1754497857381},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754475181338},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754312448728},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754310418707},{"file_id":"https://huggingface.co/datasets/codeShare/lora-training-data/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1754223895158},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1747490904984},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1740037333374},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1736477078136},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1725365086834}]},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":12728776,"sourceType":"datasetVersion","datasetId":8022630}],"dockerImageVersionId":31089,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"Download a parquet file to your Google drive and load it from there into this notebook.\n\nParquet files: https://huggingface.co/datasets/codeShare/chroma_prompts/tree/main\n\nE621 JSON 
files: https://huggingface.co/datasets/lodestones/e621-captions/tree/main","metadata":{"id":"LeCfcqgiQvCP"}},
{"cell_type":"code","source":"%%capture\n# Install Unsloth and its dependencies. %%capture hides pip's output; the install lives in its\n# own cell so that the prints and training logs of the next cell stay visible.\nimport os\nif \"COLAB_\" not in \"\".join(os.environ.keys()):\n    !pip install unsloth\nelse:\n    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo\n    !pip install sentencepiece protobuf \"datasets>=3.4.1,<4.0.0\" \"huggingface_hub>=0.34.0\" hf_transfer\n    !pip install --no-deps unsloth","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"import os\n\n# Writable cache directory for datasets, to avoid read-only issues.\nCACHE_DIR = \"/kaggle/working/dataset_cache\"\n# Where checkpoints and the final adapter/processor are written.\nOUTPUT_DIR = \"/kaggle/working/continued_lora_model\"\n# Token budget per example; used for both tokenization and the trainer.\nMAX_SEQ_LENGTH = 512\n\n# `datasets` reads HF_DATASETS_CACHE once, when it is first imported, so the variable must be\n# set before `datasets` (or any package that imports it) is loaded.\nos.environ[\"HF_DATASETS_CACHE\"] = CACHE_DIR\nos.makedirs(CACHE_DIR, exist_ok=True)\n\nfrom unsloth import FastVisionModel, get_chat_template\nfrom peft import LoraConfig, get_peft_model\nfrom transformers import TrainingArguments, DataCollatorForLanguageModeling\nfrom trl import SFTTrainer\nfrom datasets import load_from_disk\nimport torch\n\n# Load model and processor\nmodel, processor = FastVisionModel.from_pretrained(\n    model_name=\"codeShare/flux_chroma_image_captioner\",\n    load_in_4bit=True,\n)\nprocessor = get_chat_template(processor, \"gemma-3\")\n\n# Load dataset\ndataset_path = '/kaggle/input/image-caption-dataset'\ndataset = load_from_disk(dataset_path)\n\n\n# Preprocess dataset\ndef preprocess_data(example):\n    \"\"\"Convert one record with \"image\" and \"text\" fields into padded model inputs.\n\n    Builds a two-turn conversation (user: image + fixed instruction, assistant: the\n    caption), renders it with the processor's chat template and runs the processor on\n    the image and the rendered text, padded/truncated to MAX_SEQ_LENGTH tokens.\n\n    Returns a dict holding every processor output with dimension 0 squeezed away.\n    \"\"\"\n    image = example[\"image\"]\n    caption = example[\"text\"]\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": [{\"type\": \"image\"}, {\"type\": \"text\", \"text\": \"Describe this image.\"}],\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": [{\"type\": \"text\", \"text\": caption}],\n        },\n    ]\n    # The assistant answer is already in `messages`; a generation prompt would append an\n    # extra, empty assistant header after the caption, so it is disabled for training.\n    input_text = processor.apply_chat_template(\n        messages,\n        tokenize=False,\n        add_generation_prompt=False,\n    )\n    processed = processor(\n        image,\n        input_text,\n        add_special_tokens=False,\n        return_tensors=\"pt\",\n        padding=\"max_length\",\n        truncation=True,\n        max_length=MAX_SEQ_LENGTH,\n    )\n    return {key: val.squeeze(0) for key, val in processed.items()}\n\n\n# Apply preprocessing with explicit cache file in writable directory\ntokenized_dataset = dataset.map(\n    preprocess_data,\n    batched=False,\n    remove_columns=dataset.column_names,\n    load_from_cache_file=False,\n    cache_file_name=os.path.join(CACHE_DIR, \"tokenized_cache.arrow\"),\n)\n\n# Debug: summarise the first example. Rows read back from the dataset are plain Python\n# lists rather than tensors, so report each field's length instead of dumping raw values.\nfirst_example = tokenized_dataset[0]\nprint({key: len(val) if hasattr(val, \"__len__\") else val for key, val in first_example.items()})\n\n# Configure training\ntraining_args = TrainingArguments(\n    output_dir=OUTPUT_DIR,\n    per_device_train_batch_size=4,\n    num_train_epochs=20,\n    save_steps=500,\n    logging_steps=100,\n    learning_rate=1e-4,\n    fp16=True,\n    save_total_limit=2,\n    push_to_hub=False,\n    report_to=\"none\",\n)\n\n# Configure data collator.\n# The collator pads via `tokenizer.pad` and masks `tokenizer.pad_token_id` in the labels;\n# hand it the processor's inner text tokenizer when there is one.\ndata_collator = DataCollatorForLanguageModeling(\n    tokenizer=getattr(processor, \"tokenizer\", processor),\n    mlm=False,\n    pad_to_multiple_of=8,\n)\n\n# Initialize trainer\n# NOTE(review): newer TRL releases may expect `processing_class=` and a length setting on\n# SFTConfig instead of `tokenizer=` / `max_seq_length=` - confirm against the installed version.\ntrainer = SFTTrainer(\n    model=model,\n    args=training_args,\n    train_dataset=tokenized_dataset,\n    tokenizer=processor,\n    max_seq_length=MAX_SEQ_LENGTH,\n    data_collator=data_collator,\n)\n\n# Train\ntrainer.train()\n\n# Save\nmodel.save_pretrained(OUTPUT_DIR)\nprocessor.save_pretrained(OUTPUT_DIR)\n# Optional: Push to Hugging Face\n# from huggingface_hub import login\n# login() # Enter your Hugging Face token\n# model.push_to_hub(\"your_username/new_flux_chroma_image_captioner\")\n# processor.push_to_hub(\"your_username/new_flux_chroma_image_captioner\")","metadata":{"trusted":true},"outputs":[],"execution_count":null}]} |