# Persian-BPE-Tokenizer / tokenizer_training.py
# amirhofo's picture
# Upload files
# 6534adc verified
from packages import *
def tokenizer_training_fn(
    dataset: pd.DataFrame,
    vocab_size=30_000,
    min_frequency=5,
    pre_tokenizer=True,
) -> Tokenizer:
    """Train a BPE tokenizer on ``dataset["text"]``, save it, and return it.

    The tokenizer is built around a BPE model with ``"[UNK]"`` as the unknown
    token. After training, a template post-processor is attached that places
    ``"<|endoftext|>"`` on both sides of every single sequence, and a
    ``BPEDecoder`` is set as the decoder. The result is written to
    ``Persian_BPE_Tokenizer_<vocab_size // 1000>K.json`` (a relative path) and
    progress messages are printed to stdout.

    Args:
        dataset: Object whose ``"text"`` entry is iterated to feed the trainer.
        vocab_size: Forwarded to ``BpeTrainer``; also used (integer-divided by
            1000) to build the output file name.
        min_frequency: Forwarded to ``BpeTrainer``.
        pre_tokenizer: When truthy, a ``Whitespace`` pre-tokenizer is attached
            before training; otherwise no pre-tokenizer is set.

    Returns:
        The trained tokenizer, with post-processor and decoder configured.
    """
    unk_token = "[UNK]"
    eos_token = "<|endoftext|>"

    bpe_tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    if pre_tokenizer:
        bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Both special tokens are registered with the trainer so that they are
    # part of the vocabulary and can be looked up by id afterwards.
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[unk_token, eos_token],
    )
    bpe_tokenizer.train_from_iterator(dataset["text"], bpe_trainer)

    banner = "--" * 10
    print(banner, " vocab size ", banner)
    print(bpe_tokenizer.get_vocab_size())

    # The template needs the numeric id of the end-of-text token.
    eos_id = bpe_tokenizer.token_to_id(eos_token)
    bpe_tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{eos_token} $A {eos_token}",
        special_tokens=[(eos_token, eos_id)],
    )

    # NOTE(review): the BPE model above is created without an end-of-word
    # suffix; BPEDecoder presumably relies on such a suffix to restore spaces,
    # so decoded text may come back without word boundaries -- confirm.
    bpe_tokenizer.decoder = decoders.BPEDecoder()

    size_in_thousands = vocab_size // 1000
    bpe_tokenizer.save(f"Persian_BPE_Tokenizer_{size_in_thousands}K.json")

    rule = "-" * 70
    print(rule)
    print("tokenizer training is complete and saved.")
    print(rule)
    return bpe_tokenizer