901 changes: 901 additions & 0 deletions kaggle_T4x2/Advanced_Llama3_1_(3B)_GRPO_LoRA_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

3,429 changes: 3,429 additions & 0 deletions kaggle_T4x2/Advanced_Llama3_2_(3B)_GRPO_LoRA_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

7,075 changes: 7,075 additions & 0 deletions kaggle_T4x2/All_MiniLM_L6_v2_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

6,017 changes: 6,017 additions & 0 deletions kaggle_T4x2/BGE_M3_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

378 changes: 378 additions & 0 deletions kaggle_T4x2/CodeGemma_(7B)-Conversational_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

24,401 changes: 24,401 additions & 0 deletions kaggle_T4x2/DeepSeek_R1_0528_Qwen3_(8B)_GRPO_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

7,687 changes: 7,687 additions & 0 deletions kaggle_T4x2/Deepseek_OCR_(3B)-Eval_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

7,694 changes: 7,694 additions & 0 deletions kaggle_T4x2/Deepseek_OCR_(3B)-Evaluation_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

7,687 changes: 7,687 additions & 0 deletions kaggle_T4x2/Deepseek_OCR_(3B)_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

816 changes: 816 additions & 0 deletions kaggle_T4x2/Deepseek_OCR_2_(3B)_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

9,336 changes: 9,336 additions & 0 deletions kaggle_T4x2/ERNIE_4_5_21B_A3B_PT-Conversational_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

11,129 changes: 11,129 additions & 0 deletions kaggle_T4x2/ERNIE_4_5_VL_28B_A3B_PT_Vision_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

2,157 changes: 2,157 additions & 0 deletions kaggle_T4x2/EmbeddingGemma_(300M)_kaggle_T4x2.ipynb

Large diffs are not rendered by default.

186 changes: 186 additions & 0 deletions kaggle_T4x2/Falcon_H1-Alpaca_kaggle_T4x2.ipynb
@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import subprocess, sys\n\n# Fix torchvision/torch version mismatch (breaks unsloth on Kaggle)\nsubprocess.run([\n sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n \"--upgrade\", \"--no-cache-dir\", \"torchvision\", \"unsloth_zoo\",\n], check=False)\nprint(\"Dependencies OK\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import os, gc, torch\nimport torch.nn as nn\n\n# ── Kaggle T4x2 environment setup ─────────────────────────\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\nos.environ[\"TRANSFORMERS_ATTENTION_IMPLEMENTATION\"] = \"sdpa\"\ngc.collect()\ntorch.cuda.empty_cache()\n\n# Block DataParallel — force DDP path instead\nclass _BlockedDP(nn.Module):\n def __init__(self, module, **kwargs):\n raise RuntimeError(\"DataParallel blocked — DDP is used instead\")\n\nnn.DataParallel = _BlockedDP\ntorch.nn.DataParallel = _BlockedDP\n\nfor i in range(torch.cuda.device_count()):\n p = torch.cuda.get_device_properties(i)\n f, t = torch.cuda.mem_get_info(i)\n print(f\" GPU {i}: {p.name} | {round(f/1024**3,1)} GiB free / {round(t/1024**3,1)} GiB total\")\nprint(f\"Ready — {torch.cuda.device_count()} GPU(s)\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "# Unsloth Training for Falcon H1\n\nThis notebook was authored by the TII Falcon Team.\nFor more details on the Falcon H1 series of models:\n1. [Official page](https://tiiuae.github.io/Falcon-H1/)\n2. [Blog post](https://falcon-lm.github.io/blog/falcon-h1/)\n3. [Official GitHub page](https://github.com/tiiuae/Falcon-H1)\n4. [Hugging Face collection](https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "To run this, press \"*Run All*\" on a **free** Kaggle Tesla T4 x2 instance!\n<div class=\"align-center\">\n\n<a href=\"https://unsloth.ai/\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png\" width=\"115\"></a>\n<a href=\"https://discord.gg/unsloth\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/Discord button.png\" width=\"145\"></a>\n<a href=\"https://unsloth.ai/docs/\"><img src=\"https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true\" width=\"125\"></a> Join Discord if you need help + ⭐ <i>Star us on <a href=\"https://github.com/unslothai/unsloth\">GitHub</a></i> ⭐\n</div>\n\nTo install Unsloth on your own computer, follow the installation instructions on our GitHub page [here](https://unsloth.ai/docs/get-started/install).\n\nYou will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), and how to save it."
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### Installation"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Install Unsloth once to pull in its dependencies (Xformers / Flash Attention, etc.),\n# then uninstall it so the next cell can install the latest build from GitHub.\n!pip install -q unsloth\n!pip uninstall unsloth -y"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git@main\""
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!pip install -q --force-reinstall git+https://github.com/huggingface/transformers.git "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!pip install -q --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git@main"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!pip install -q --no-build-isolation git+https://github.com/state-spaces/mamba.git@main"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!pip install -q git+https://github.com/unslothai/unsloth-zoo.git"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import os\n# Disable Triton JIT optimizations — important for stability on T4.\n# Set this BEFORE importing Unsloth so it takes effect when kernels compile.\nos.environ[\"TRITON_JIT_DISABLE_OPT\"] = \"1\"\n\nimport unsloth\nfrom unsloth import FastLanguageModel\nimport torch\n\nmax_seq_length = 1024 # reduced for T4 VRAM; RoPE scaling is handled automatically\ndtype = None # None = auto-detect (float16 on Tesla T4/V100, bfloat16 on Ampere+)\nload_in_4bit = True # 4-bit quantization to reduce memory usage; set False for 16-bit\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n model_name = \"tiiuae/Falcon-H1-0.5B-Instruct\", # any model from https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df\n max_seq_length = max_seq_length,\n dtype = dtype,\n load_in_4bit = load_in_4bit,\n)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Configure PEFT (LoRA) model\nmodel = FastLanguageModel.get_peft_model(\n model,\n r = 64,\n # Mamba out_proj and conv1d layers must NOT be targeted here,\n # see https://github.com/huggingface/peft/pull/2562\n target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n \"gate_proj\", \"up_proj\", \"down_proj\"],\n lora_alpha = 32,\n lora_dropout = 0.1,\n use_gradient_checkpointing = False,\n random_state = 3407,\n)"
},
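{
"cell_type": "markdown",
"metadata": {},
"source": "As a quick sanity check (a small addition, not part of the original notebook flow), we can print how many parameters the LoRA adapters actually train. `print_trainable_parameters()` is provided by PEFT and is assumed to be available on the wrapped model."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Show trainable (LoRA) vs. total parameter counts — with r=64 on the\n# attention and MLP projections, only a small fraction should be trainable.\nmodel.print_trainable_parameters()"
},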
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "from datasets import load_dataset, load_from_disk\nimport os, time\n\nDATASET_CACHE = \"/kaggle/working/_dataset_cache\"\n\ndef load_with_retry(path, split=\"train\", retries=5, wait=15):\n if os.path.exists(DATASET_CACHE):\n print(f\"Loading from cache: {DATASET_CACHE}\")\n return load_from_disk(DATASET_CACHE)\n for i in range(retries):\n try:\n print(f\"Attempt {i+1}/{retries}: {path}\")\n ds = load_dataset(path, split=split)\n ds.save_to_disk(DATASET_CACHE)\n print(f\"Loaded OK — {len(ds)} rows\")\n return ds\n except Exception as e:\n print(f\" Failed: {e}\")\n if i < retries - 1:\n time.sleep(wait)\n raise RuntimeError(f\"Failed to load {path} after {retries} attempts\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}\"\"\"\n\nEOS_TOKEN = tokenizer.eos_token\ndef formatting_prompts_func(examples):\n instructions = examples[\"instruction\"]\n inputs = examples[\"input\"]\n outputs = examples[\"output\"]\n texts = []\n for instruction, input_text, output in zip(instructions, inputs, outputs):\n # Must add EOS_TOKEN, otherwise generation will go on forever!\n text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN\n texts.append(text)\n return { \"text\" : texts, }\n\n# Use the retry helper defined above — Kaggle's network can be flaky,\n# and the helper also caches the dataset to /kaggle/working.\ndataset = load_with_retry(\"yahma/alpaca-cleaned\", split = \"train\")\ndataset = dataset.map(formatting_prompts_func, batched = True)"
},
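{
"cell_type": "markdown",
"metadata": {},
"source": "Before training, it is worth previewing one formatted row (an optional check, not in the original notebook) to confirm the Alpaca template and EOS token were applied as expected."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Preview the first formatted training example (truncated for readability).\n# The text should end with the tokenizer's EOS token after the Response section.\nprint(dataset[0][\"text\"][:500])\nprint(\"...\" if len(dataset[0][\"text\"]) > 500 else \"\")\nprint(\"EOS token:\", repr(tokenizer.eos_token))"
},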
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "from trl import SFTTrainer\nfrom transformers import TrainingArguments\n\ntrainer = SFTTrainer(\n model = model,\n tokenizer = tokenizer,\n train_dataset = dataset,\n dataset_text_field = \"text\",\n max_seq_length = max_seq_length,\n packing = False, # set True to pack short sequences — can make training up to 5x faster\n args = TrainingArguments(\n per_device_train_batch_size = 1,\n gradient_accumulation_steps = 8,\n warmup_steps = 5,\n max_steps = 60,\n learning_rate = 2e-4,\n fp16 = not torch.cuda.is_bf16_supported(),\n bf16 = torch.cuda.is_bf16_supported(),\n logging_steps = 1,\n optim = \"adamw_8bit\",\n weight_decay = 0.001,\n lr_scheduler_type = \"linear\",\n seed = 3407,\n output_dir = \"outputs\",\n ),\n)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### Show current memory stats"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "\ngpu_stats = torch.cuda.get_device_properties(0)\nstart_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\nmax_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\nprint(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\nprint(f\"{start_gpu_memory} GB of memory reserved.\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "# Training"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import gc, torch\n\n# ── Patch trainer before training ─────────────────────────\ntry:\n trainer.args.per_device_train_batch_size = 1\n trainer.args.gradient_accumulation_steps = 32\n trainer.args.gradient_checkpointing = False\n trainer.args.fp16 = True\n trainer.args.bf16 = False\n trainer.args.dataloader_pin_memory = True\n trainer.args.dataloader_num_workers = 2\n trainer.args.ddp_find_unused_parameters = False\n trainer.args.max_grad_norm = 0.3\n eff = trainer.args.per_device_train_batch_size * trainer.args.gradient_accumulation_steps\n print(f\"TrainingArguments patched — effective batch: {eff}\")\nexcept Exception as e:\n print(f\"TrainingArguments skipped: {e}\")\n\ntry:\n _m = trainer.model\n while hasattr(_m, \"module\"):\n _m = _m.module\n if hasattr(_m, \"enable_input_require_grads\"):\n _m.enable_input_require_grads()\n print(\"enable_input_require_grads OK\")\nexcept Exception as e:\n print(f\"enable_input_require_grads skipped: {e}\")\n\ngc.collect()\ntorch.cuda.empty_cache()\nfor i in range(torch.cuda.device_count()):\n print(f\" GPU {i} free: {round(torch.cuda.mem_get_info(i)[0]/1024**3,2)} GiB\")\nprint(\"Ready to train\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import os\nos.environ[\"TRITON_DISABLE_LINE_INFO\"] = \"1\"\ntrainer_stats = trainer.train()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### Show final memory and time stats"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\nused_memory_for_lora = round(used_memory - start_gpu_memory, 3)\nused_percentage = round(used_memory / max_memory * 100, 3)\nlora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\nprint(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\nprint(\n f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\"\n)\nprint(f\"Peak reserved memory = {used_memory} GB.\")\nprint(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\nprint(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\nprint(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
},
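{
"cell_type": "markdown",
"metadata": {},
"source": "# Inference\n\nA minimal generation-and-save sketch using Unsloth's standard inference path (`FastLanguageModel.for_inference`). The prompt below is illustrative; saving writes only the LoRA adapters, not the full base model."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Enable Unsloth's faster inference mode, generate from the Alpaca template,\n# then save the LoRA adapters and tokenizer to /kaggle/working.\nFastLanguageModel.for_inference(model)\n\ninputs = tokenizer(\n [alpaca_prompt.format(\n \"Continue the Fibonacci sequence.\", # instruction (illustrative)\n \"1, 1, 2, 3, 5, 8\", # input\n \"\", # response — left blank for the model to fill in\n )],\n return_tensors = \"pt\",\n).to(\"cuda\")\n\noutputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)\nprint(tokenizer.batch_decode(outputs)[0])\n\n# Save only the LoRA adapters (small) plus the tokenizer.\nmodel.save_pretrained(\"lora_model\")\ntokenizer.save_pretrained(\"lora_model\")"
},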
{
"cell_type": "markdown",
"metadata": {},
"source": "This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme)."
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"kaggle": {
"accelerator": "nvidiaTeslaT4",
"acceleratorCount": 2,
"enableGpuQuickConnect": true,
"isInternetEnabled": true,
"language": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}