Uploaded model

  • Developed by: metascroy
  • License: apache-2.0
  • Finetuned from model: unsloth/Qwen3-4B

This Qwen3 model was trained 2x faster with Unsloth and Hugging Face's TRL library.

Finetune with unsloth and torchao

Below we show how to finetune Qwen3-4B using unsloth in a way that can be deployed with ExecuTorch. The example is based on the notebook here.

################################################################################
# We first load the model for QAT using the mobile-CPU-friendly int8-int4 scheme
################################################################################

from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
)
import torch

MODEL_ID = "unsloth/Qwen3-4B"
QAT_SCHEME = "int8-int4"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length = 2048,
    dtype = torch.bfloat16,
    load_in_4bit = False,
    full_finetuning = True,
    # ExecuTorch CPU quantization scheme
    # Quantize embeddings to 8 bits, and quantize linear layers to 4 bits
    # with 8-bit dynamically quantized activations
    qat_scheme = QAT_SCHEME,
)
tokenizer = get_chat_template(tokenizer, chat_template = "qwen3")
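
# The "int8-int4" QAT scheme above trains with fake quantization: weights stay in
# bf16, but the forward pass routes them through a quantize/dequantize round-trip
# so the model learns to tolerate the low-bit numerics it will see after export.
# A minimal illustration of the idea (conceptual only; this is not the torchao
# code unsloth uses under the hood):
def _fake_quantize_int4_per_channel(w):
    # Symmetric per-output-channel int4: representable integer values are [-8, 7]
    scale = w.abs().amax(dim = 1, keepdim = True) / 7.0
    q = torch.clamp(torch.round(w / scale), min = -8, max = 7)
    return q * scale  # dequantized weights used in the forward pass

_w = torch.randn(4, 16)
print((_w - _fake_quantize_int4_per_channel(_w)).abs().max())  # error QAT learns to absorb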


################################################################################
# Data prep
################################################################################

from datasets import load_dataset
reasoning_dataset = load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")
non_reasoning_dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

# Convert the dataset into a conversational format
def generate_conversation(examples):
    problems  = examples["problem"]
    solutions = examples["generated_solution"]
    conversations = []
    for problem, solution in zip(problems, solutions):
        conversations.append([
            {"role" : "user",      "content" : problem},
            {"role" : "assistant", "content" : solution},
        ])
    return { "conversations": conversations, }

reasoning_conversations = tokenizer.apply_chat_template(
    list(reasoning_dataset.map(generate_conversation, batched = True)["conversations"]),
    tokenize = False,
)

from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(non_reasoning_dataset)
non_reasoning_conversations = tokenizer.apply_chat_template(
    list(dataset["conversations"]),
    tokenize = False,
)

# Let's create a combined dataset that mixes 25% conversational vs. 75% reasoning
chat_percentage = 0.25
import pandas as pd
non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
    int(len(reasoning_conversations)*(chat_percentage/(1 - chat_percentage))),
    random_state=2407,
)
print(len(reasoning_conversations))
print(len(non_reasoning_subset))
print(len(non_reasoning_subset) / (len(non_reasoning_subset) + len(reasoning_conversations)))
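
# Quick check of the mixing math (hypothetical count, for illustration only):
# with 18_000 reasoning conversations we would sample about 18_000 * 0.25 / 0.75
# ≈ 6_000 chat conversations, so chat ends up at roughly 25% of the combined data.
_n_reasoning, _pct = 18_000, 0.25
_n_chat = int(_n_reasoning * (_pct / (1 - _pct)))
assert abs(_n_chat / (_n_chat + _n_reasoning) - _pct) < 1e-3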


data = pd.concat([
    pd.Series(reasoning_conversations),
    pd.Series(non_reasoning_subset)
])
data.name = "text"

from datasets import Dataset
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)


################################################################################
# Define trainer
################################################################################

from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-5,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use TrackIO/WandB etc
    ),
)
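
# Note on the effective batch size: with per_device_train_batch_size = 2 and
# gradient_accumulation_steps = 4, each optimizer step sees 2 * 4 = 8 sequences
# per device, so max_steps = 30 covers roughly 240 examples (a short demo run
# rather than a full epoch).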


################################################################################
# Do fine tuning
################################################################################
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)


################################################################################
# Inference
################################################################################
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)
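
# Note: add_generation_prompt appends the assistant header so the model starts a
# new reply; with enable_thinking = False the Qwen3 chat template should also
# insert an empty <think></think> block so the model answers directly instead of
# producing a chain of thought first.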

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non-thinking mode
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
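
# For thinking mode (enable_thinking = True), the Qwen3 model card instead
# suggests temperature = 0.6, top_p = 0.95, top_k = 20.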


################################################################################
# Convert model to torchao format and save
################################################################################

from unsloth.models._utils import _convert_torchao_model

# Convert the QAT-trained model into its quantized torchao representation
# (8-bit embeddings, 4-bit linear weights) so it can be saved and shared below
_convert_torchao_model(model)

model_name = MODEL_ID.split("/")[-1]
save_to = f"{model_name}-{QAT_SCHEME}-unsloth-v3"

# Save locally
# model.save_pretrained(save_to, safe_serialization=False)
# tokenizer.save_pretrained(save_to)

# Or save to hub
from huggingface_hub import get_token, whoami
def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username
username = _get_username()
model.push_to_hub(f"{username}/{save_to}", safe_serialization=False)
tokenizer.push_to_hub(f"{username}/{save_to}")

Export to ExecuTorch

After we've finetuned our model, we need to export it to an ExecuTorch *.pte file.

# 1. Install ExecuTorch
pip install executorch pytorch_tokenizers torchtune

# 2. Download the finetuned weights we uploaded to Hugging Face (or use the local directory we saved to)
HF_DIR=metascroy/Qwen3-4B-int8-int4-unsloth-v3
WEIGHT_DIR=$(hf download ${HF_DIR})

# 3. Convert the checkpoint's state dict keys to the format ExecuTorch expects
python -m executorch.examples.models.qwen3.convert_weights $WEIGHT_DIR pytorch_model_converted.bin

# 4. Download model config from ExecuTorch repo
curl -L -o 4b_config.json https://raw.githubusercontent.com/pytorch/executorch/main/examples/models/qwen3/config/4b_config.json

# 5. Export to ExecuTorch pte file
python -m executorch.examples.models.llama.export_llama \
  --model "qwen3_4b" \
  --checkpoint pytorch_model_converted.bin \
  --params 4b_config.json \
  --output_name qwen3_model.pte \
  -kv \
  --use_sdpa_with_kv_cache \
  -X \
  --xnnpack-extended-ops \
  --max_context_length 1024 \
  --max_seq_length 128 \
  --dtype fp32 \
  --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
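
# Flag summary (run export_llama --help for the authoritative descriptions):
#   -kv                        export with a KV cache
#   --use_sdpa_with_kv_cache   use the fused SDPA-with-KV-cache custom op
#   -X, --xnnpack-extended-ops delegate to XNNPACK, including the extra kernels
#                              used by the int8/int4 quantized layers
#   --max_context_length, --max_seq_length
#                              context-window and prompt-length bounds baked into
#                              the exported graph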

# 6. (optional) Upload pte file to HuggingFace
hf upload ${HF_DIR} qwen3_model.pte

Run on mobile device

Once we have qwen3_model.pte and tokenizer.json, we can run the model on a mobile device! To run on iOS, follow the instructions here. Below is a screenshot of the model in action:

[Screenshot of the model running on iOS]

(To build on Android, follow the instructions here.)
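
Before moving to a phone, you can optionally sanity-check that the exported file loads with the ExecuTorch Python runtime on your desktop. This is a minimal sketch, assuming the executorch pip package from step 1 is installed; it only verifies that the program loads and lists its methods, and does not run generation.

from executorch.runtime import Runtime

runtime = Runtime.get()
program = runtime.load_program("qwen3_model.pte")
print(program.method_names)  # expect something like {'forward'}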
