Uploaded model
- Developed by: metascroy
- License: apache-2.0
- Finetuned from model: unsloth/Qwen3-4B
This Qwen3 model was trained 2x faster with Unsloth and Hugging Face's TRL library.
Finetune with unsloth and torchao
Below we show how to finetune Qwen3-4B with Unsloth using quantization-aware training (QAT), so that the result can be deployed with ExecuTorch. The example is based on the notebook here.
################################################################################
# We first load the model for QAT using the mobile-CPU-friendly int8-int4 scheme
################################################################################
from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
)
import torch
MODEL_ID = "unsloth/Qwen3-4B"
QAT_SCHEME = "int8-int4"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length = 2048,
    dtype = torch.bfloat16,
    load_in_4bit = False,
    full_finetuning = True,
    # ExecuTorch CPU quantization scheme:
    # quantize embeddings to 8 bits, and quantize linear layers to 4 bits
    # with 8-bit dynamically quantized activations
    qat_scheme = QAT_SCHEME,
)
tokenizer = get_chat_template(tokenizer, chat_template = "qwen3")
################################################################################
# Data prep
################################################################################
from datasets import load_dataset
reasoning_dataset = load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")
non_reasoning_dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
# Convert the dataset into a conversational format
def generate_conversation(examples):
    problems = examples["problem"]
    solutions = examples["generated_solution"]
    conversations = []
    for problem, solution in zip(problems, solutions):
        conversations.append([
            {"role" : "user", "content" : problem},
            {"role" : "assistant", "content" : solution},
        ])
    return { "conversations": conversations, }
reasoning_conversations = tokenizer.apply_chat_template(
    list(reasoning_dataset.map(generate_conversation, batched = True)["conversations"]),
    tokenize = False,
)
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(non_reasoning_dataset)
non_reasoning_conversations = tokenizer.apply_chat_template(
    list(dataset["conversations"]),
    tokenize = False,
)
# Let's create a combined dataset that mixes 25% conversational vs. 75% reasoning
chat_percentage = 0.25
import pandas as pd
non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
    int(len(reasoning_conversations) * (chat_percentage / (1 - chat_percentage))),
    random_state = 2407,
)
print(len(reasoning_conversations))
print(len(non_reasoning_subset))
print(len(non_reasoning_subset) / (len(non_reasoning_subset) + len(reasoning_conversations)))
data = pd.concat([
    pd.Series(reasoning_conversations),
    pd.Series(non_reasoning_subset),
])
data.name = "text"
from datasets import Dataset
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)
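Before defining the trainer, it can help to spot-check one formatted example and confirm the Qwen3 chat template was applied as expected. This check is not part of the original notebook; it simply prints the start of the first training sample:
# Optional sanity check: print the start of one chat-templated training example.
sample = combined_dataset[0]["text"]
print(sample[:500])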
################################################################################
# Define trainer
################################################################################
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation (see the sketch below)!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-5,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use TrackIO/WandB etc.
    ),
)
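Evaluation is left disabled above. If you want it, one possible approach (our assumption, not part of the original recipe) is to hold out a small slice of the combined dataset and pass it to the trainer:
# Optional: hold out ~1% of the data for evaluation (the split size is arbitrary).
split = combined_dataset.train_test_split(test_size = 0.01, seed = 3407)
# Then pass train_dataset = split["train"] and eval_dataset = split["test"] to
# SFTTrainer, and set eval_strategy = "steps" and eval_steps in SFTConfig.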
################################################################################
# Do fine tuning
################################################################################
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
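To keep an eye on memory headroom during full finetuning, you can also report peak GPU memory after training. A small optional sketch using PyTorch's CUDA statistics:
# Optional: report peak reserved GPU memory after training.
peak_gb = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"{peak_gb} GB of GPU memory reserved at peak.")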
################################################################################
# Inference
################################################################################
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)
from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non-thinking mode
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
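The sampling values above follow the Qwen3 recommendations for non-thinking mode. To see the model's reasoning traces instead, you can re-run generation with thinking enabled; the Qwen3 model card suggests temperature 0.6, top_p 0.95, top_k 20 for thinking mode. A sketch:
# Optional: generate with thinking enabled (sampling values follow the Qwen3
# recommendations for thinking mode).
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
    enable_thinking = True, # Enable thinking
)
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024, # Thinking traces are longer
    temperature = 0.6, top_p = 0.95, top_k = 20,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)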
################################################################################
# Convert model to torchao format and save
################################################################################
from unsloth.models._utils import _convert_torchao_model
_convert_torchao_model(model)
model_name = MODEL_ID.split("/")[-1]
save_to = f"{model_name}-{QAT_SCHEME}-unsloth-v3"
# Save locally
# model.save_pretrained(save_to, safe_serialization=False)
# tokenizer.save_pretrained(save_to)
# Or save to hub
from huggingface_hub import get_token, whoami
def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username
username = _get_username()
model.push_to_hub(f"{username}/{save_to}", safe_serialization=False)
tokenizer.push_to_hub(f"{username}/{save_to}")
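Before exporting, you can optionally load the pushed checkpoint back with transformers as a quick sanity check. This assumes torchao is installed in the environment so the quantized weights can be deserialized:
# Optional sanity check (assumes torchao is installed so transformers can
# deserialize the torchao-quantized weights).
from transformers import AutoModelForCausalLM, AutoTokenizer
reloaded = AutoModelForCausalLM.from_pretrained(
    f"{username}/{save_to}",
    device_map = "auto",
)
reloaded_tokenizer = AutoTokenizer.from_pretrained(f"{username}/{save_to}")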
Export to ExecuTorch
After we've finetuned our model, we need to export it to an ExecuTorch *.pte file.
# 1. Install ExecuTorch
pip install executorch pytorch_tokenizers torchtune
# 2. Download the finetuned weights we uploaded to Hugging Face (or use the local directory we saved to)
HF_DIR=metascroy/Qwen3-4B-int8-int4-unsloth-v3
WEIGHT_DIR=$(hf download ${HF_DIR})
# 3. Convert the checkpoint's state-dict keys to the names ExecuTorch expects
python -m executorch.examples.models.qwen3.convert_weights $WEIGHT_DIR pytorch_model_converted.bin
# 4. Download model config from ExecuTorch repo
curl -L -o 4b_config.json https://raw.githubusercontent.com/pytorch/executorch/main/examples/models/qwen3/config/4b_config.json
# 5. Export to an ExecuTorch .pte file
python -m executorch.examples.models.llama.export_llama \
--model "qwen3_4b" \
--checkpoint pytorch_model_converted.bin \
--params 4b_config.json \
--output_name qwen3_model.pte \
-kv \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--max_context_length 1024 \
--max_seq_length 128 \
--dtype fp32 \
--metadata '{"get_bos_id":151643, "get_eos_ids":[151645,151643]}' # bos/eos token IDs from the Qwen3 tokenizer
# 6. (optional) Upload pte file to HuggingFace
hf upload ${HF_DIR} qwen3_model.pte
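Before moving the file to a device, it can be worth a quick load check on the host. A minimal sketch, assuming the executorch Python package installed above exposes the runtime API:
# Optional: verify the exported program loads and exposes a forward method.
from executorch.runtime import Runtime

runtime = Runtime.get()
program = runtime.load_program("qwen3_model.pte")
print(program.method_names)  # expect something like ['forward']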
Run on mobile device
Once we have qwen3_model.pte and the tokenizer.json, we can run the model on a mobile device. To build and run on iOS, follow the instructions here; to build on Android, follow the instructions here.
[Screenshot: the finetuned model running on device]