Fix missing whitespace when using stream_chat with the fast tokenizer

Files changed (3) hide show

configuration_internlm2.py CHANGED Viewed

@@ -148,4 +148,4 @@ class InternLM2Config(PretrainedConfig):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")

                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")

tokenization_internlm2.py CHANGED Viewed

@@ -233,4 +233,4 @@ class InternLM2Tokenizer(PreTrainedTokenizer):
         if token_ids_1 is None:
             return len(token_ids_0 + eos) * [0]
-        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

         if token_ids_1 is None:
             return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

tokenization_internlm2_fast.py CHANGED Viewed

@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
         return unk_id
     def decoder(self, replacement, add_prefix_space):
-        return decoders.Sequence(
-            [
-                decoders.Replace("▁", " "),
-                decoders.ByteFallback(),
-                decoders.Fuse(),
-                decoders.Strip(content=" ", left=1),
-            ]
-        )
     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type
@@ -211,4 +211,4 @@ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
-        return (out_vocab_file,)

         return unk_id
     def decoder(self, replacement, add_prefix_space):
+        decoders_sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+        if self.proto.normalizer_spec.add_dummy_prefix:
+            decoders_sequence.append(decoders.Strip(content=" ", left=1))
+        return decoders.Sequence(decoders_sequence)
     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+        return (out_vocab_file,)