File size: 5,869 Bytes
4f0d565
1
{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[{"file_id":"https://huggingface.co/datasets/codeShare/lora-training-data/blob/main/parquet_explorer.ipynb","timestamp":1754497857381},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754475181338},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754312448728},{"file_id":"https://huggingface.co/datasets/codeShare/chroma_prompts/blob/main/parquet_explorer.ipynb","timestamp":1754310418707},{"file_id":"https://huggingface.co/datasets/codeShare/lora-training-data/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1754223895158},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1747490904984},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1740037333374},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1736477078136},{"file_id":"https://huggingface.co/codeShare/JupyterNotebooks/blob/main/YT-playlist-to-mp3.ipynb","timestamp":1725365086834}]},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":12728776,"sourceType":"datasetVersion","datasetId":8022630}],"dockerImageVersionId":31089,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"Download a parquet file to your Google drive and load it from there into this notebook.\n\nParquet files: https://huggingface.co/datasets/codeShare/chroma_prompts/tree/main\n\nE621 JSON 
files: https://huggingface.co/datasets/lodestones/e621-captions/tree/main","metadata":{"id":"LeCfcqgiQvCP"}},{"cell_type":"markdown","source":"## Install dependencies","metadata":{}},{"cell_type":"code","source":"%%capture\n# Keep the installs in their own cell so %%capture only hides pip output, not training logs.\nimport os\nif \"COLAB_\" not in \"\".join(os.environ.keys()):\n    !pip install unsloth\nelse:\n    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo\n    !pip install sentencepiece protobuf \"datasets>=3.4.1,<4.0.0\" \"huggingface_hub>=0.34.0\" hf_transfer\n    !pip install --no-deps unsloth","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Imports and configuration","metadata":{}},{"cell_type":"code","source":"import os\n\n# Paths and sizes a reader might tune.\nDATASET_PATH = \"/kaggle/input/image-caption-dataset\"\nCACHE_DIR = \"/kaggle/working/dataset_cache\"\nOUTPUT_DIR = \"/kaggle/working/continued_lora_model\"\nMAX_SEQ_LENGTH = 512\n\n# Use a writable cache directory for datasets to avoid read-only issues.\n# This must run BEFORE datasets is imported (directly or through unsloth/trl):\n# the library reads HF_DATASETS_CACHE once, at import time.\nos.makedirs(CACHE_DIR, exist_ok=True)\nos.environ[\"HF_DATASETS_CACHE\"] = CACHE_DIR\n\nfrom unsloth import FastVisionModel, get_chat_template\nfrom peft import LoraConfig, get_peft_model\nfrom transformers import TrainingArguments, DataCollatorForLanguageModeling\nfrom trl import SFTTrainer\nfrom datasets import load_from_disk\nimport torch","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Load model and dataset","metadata":{}},{"cell_type":"code","source":"# Load the model in 4-bit together with its processor, then apply the Gemma-3 chat template.\n# NOTE(review): the notebook never calls FastVisionModel.for_training(model) and never uses the\n# imported LoraConfig/get_peft_model - confirm the checkpoint already carries trainable adapters.\nmodel, processor = FastVisionModel.from_pretrained(\n    model_name=\"codeShare/flux_chroma_image_captioner\",\n    load_in_4bit=True,\n)\nprocessor = get_chat_template(processor, \"gemma-3\")\n\ndataset = load_from_disk(DATASET_PATH)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Preprocess","metadata":{}},{"cell_type":"code","source":"def preprocess_data(example):\n    \"\"\"Convert one dataset record into padded model inputs.\n\n    Reads example[\"image\"] and example[\"text\"], wraps them in a two-turn\n    conversation (user: image plus a fixed instruction, assistant: the\n    caption), renders it with the chat template and runs the processor with\n    padding/truncation to MAX_SEQ_LENGTH.\n\n    Returns a dict holding every processor output with its leading batch\n    dimension squeezed away.\n    \"\"\"\n    image = example[\"image\"]\n    caption = example[\"text\"]\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": [{\"type\": \"image\"}, {\"type\": \"text\", \"text\": \"Describe this image.\"}],\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": [{\"type\": \"text\", \"text\": caption}],\n        },\n    ]\n    # The conversation already ends with the assistant's caption, so no generation\n    # prompt is appended: it would add an empty trailing model turn to every\n    # training example.\n    input_text = processor.apply_chat_template(\n        messages,\n        tokenize=False,\n        add_generation_prompt=False,\n    )\n    processed = processor(\n        image,\n        input_text,\n        add_special_tokens=False,\n        return_tensors=\"pt\",\n        padding=\"max_length\",\n        truncation=True,\n        max_length=MAX_SEQ_LENGTH,\n    )\n    return {key: val.squeeze(0) for key, val in processed.items()}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Apply preprocessing with an explicit cache file in the writable directory.\ntokenized_dataset = dataset.map(\n    preprocess_data,\n    batched=False,\n    remove_columns=dataset.column_names,\n    load_from_cache_file=False,\n    cache_file_name=os.path.join(CACHE_DIR, \"tokenized_cache.arrow\"),\n)\n\n# Debug: summarise the first example by field length only. Rows come back as plain\n# Python lists rather than tensors, so printing them whole would dump every pixel value.\nfirst_example = tokenized_dataset[0]\nprint({key: len(val) if hasattr(val, \"__len__\") else val for key, val in first_example.items()})","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Train","metadata":{}},{"cell_type":"code","source":"# Configure training\ntraining_args = TrainingArguments(\n    output_dir=OUTPUT_DIR,\n    per_device_train_batch_size=4,\n    num_train_epochs=20,\n    save_steps=500,\n    logging_steps=100,\n    learning_rate=1e-4,\n    fp16=True,\n    save_total_limit=2,\n    push_to_hub=False,\n    report_to=\"none\",\n)\n\n# DataCollatorForLanguageModeling is documented to take a tokenizer, so pass the\n# tokenizer wrapped by the processor when there is one.\ndata_collator = DataCollatorForLanguageModeling(\n    tokenizer=getattr(processor, \"tokenizer\", processor),\n    mlm=False,\n    pad_to_multiple_of=8,\n)\n\ntrainer = SFTTrainer(\n    model=model,\n    args=training_args,\n    train_dataset=tokenized_dataset,\n    tokenizer=processor,\n    max_seq_length=MAX_SEQ_LENGTH,\n    data_collator=data_collator,\n)\n\ntrainer.train()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"model.save_pretrained(OUTPUT_DIR)\nprocessor.save_pretrained(OUTPUT_DIR)\n# Optional: Push to Hugging Face\n# from huggingface_hub import login\n# login()  # Enter your Hugging Face token\n# model.push_to_hub(\"your_username/new_flux_chroma_image_captioner\")\n# processor.push_to_hub(\"your_username/new_flux_chroma_image_captioner\")","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}