Upload dan-chat-apertus.py with huggingface_hub
dan-chat-apertus.py ADDED (+237 -0)

@@ -0,0 +1,237 @@
"""Module containing the DanApertusPromptTokenizingStrategy and DanApertusPrompter class"""

import copy
import logging
from typing import Generator, Tuple

from axolotl.prompt_tokenizers import (
    PromptTokenizingStrategy,
    # parse_tokenized_to_result and tokenize_prompt_default are assumed to be
    # available from the base library if needed
)

LOG = logging.getLogger("axolotl")

IGNORE_TOKEN_ID = -100

# New special tokens and prefixes based on the Jinja2 template
turn_separator = ""  # The Jinja2 template relies on the tokens themselves for separation

system_prefix = "<|system_start|>"
system_suffix = "<|system_end|>"
developer_prefix = "<|developer_start|>"
developer_suffix = "<|developer_end|>"
user_prefix = "<|user_start|>"
user_suffix = "<|user_end|>"
assistant_prefix = "<|assistant_start|>"
assistant_suffix = "<|assistant_end|>"
tool_calls_prefix = "<|tools_prefix|>"
tool_calls_suffix = "<|tools_suffix|>"

# This is a simplification; a full Jinja2 renderer would be needed to be 100%
# accurate, so we use a fixed placeholder for the developer block.
DEVELOPER_BLOCK_PLACEHOLDER = f"{developer_prefix}Deliberation: disabled\nTool Capabilities: disabled{developer_suffix}"
DEFAULT_SYSTEM_PROMPT = "You are Dolphin, a helpful, unbiased, and uncensored AI assistant."  # the template's date field is omitted here

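# For reference, a single-turn prompt rendered with the tokens above looks
# roughly like this (line breaks added for readability only; this is a sketch,
# not the exact Jinja2 output):
#
#   <|system_start|>You are Dolphin, a helpful, unbiased, and uncensored AI assistant.<|system_end|>
#   <|developer_start|>Deliberation: disabled
#   Tool Capabilities: disabled<|developer_end|>
#   <|user_start|>Hello!<|user_end|>
#   <|assistant_start|>Hi! How can I help?<|assistant_end|>
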
class DanApertusPromptTokenizingStrategy(PromptTokenizingStrategy):
    def __init__(self, prompter, tokenizer, train_on_inputs, sequence_len, *args, **kwargs):
        super().__init__(prompter, tokenizer, *args, **kwargs)

        # Tokenize the assistant prefix once for use in calculating labels
        res = self._tokenize(assistant_prefix, add_eos_token=False, strip_bos_token=True)
        self.bot_prefix_token_ids = res["input_ids"]

        # The new format doesn't have a simple turn_separator token like "\n"
        self.turn_separator_token_ids = []

        self.train_on_inputs = train_on_inputs
        self.sequence_len = sequence_len

    def tokenize_prompt(self, prompt):
        # 1. Build prompt parts; a virtual 'context' part for the
        # system/developer block is prepended below.
        prompt_parts = list(self.prompter.build_prompt(prompt["conversations"]))
        tokenized_parts = []
        total_length = 0
        not_first_turn = False  # Unused by this format; kept for generic separator logic

        # 2. Add the initial system/developer block (simplified).
        # If the conversation starts with an explicit system message, use it;
        # otherwise fall back to the default system prompt.
        if prompt_parts and prompt_parts[0][0] == "system":
            _, system_msg, _, _ = prompt_parts.pop(0)
        else:
            system_msg = DEFAULT_SYSTEM_PROMPT

        full_context = f"{system_prefix}{system_msg}{system_suffix}{DEVELOPER_BLOCK_PLACEHOLDER}"

        res_context = self._tokenize(full_context, add_eos_token=False, strip_bos_token=False)
        initial_context_labels = [IGNORE_TOKEN_ID] * len(res_context["input_ids"])

        tokenized_parts.append({
            "input_ids": res_context["input_ids"],
            "attention_mask": res_context["attention_mask"],
            "labels": initial_context_labels,
            "role": "context",
            "loss": False,
        })
        total_length += len(res_context["input_ids"])

        # 3. Process conversation turns
        for role, message, loss, prefix in prompt_parts:
            if total_length >= self.sequence_len:
                break

            # If prefix is not defined, set it to an empty string
            if prefix is None:
                prefix = ""

            if role in ["system", "user", "human"]:
                # All user/human/system turns within the conversation use the user tokens
                role_prefix = user_prefix
                role_suffix = user_suffix

                full_text = role_prefix + prefix + message + role_suffix
                res = self._tokenize(full_text, add_eos_token=False, strip_bos_token=True)
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])

            elif role in ["model", "gpt"]:
                role_prefix = assistant_prefix
                role_suffix = assistant_suffix

                # The assistant turn contains the full response (including any tool
                # calls/thoughts from the Jinja template logic); 'message' is assumed
                # to be the full, pre-formatted assistant block.
                full_text = role_prefix + prefix + message + role_suffix
                res = self._tokenize(full_text, add_eos_token=True, strip_bos_token=True)

                if not loss:
                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                else:
                    # Treat the entire assistant block as ground truth, masking only
                    # the role prefix. This is an approximation: a more accurate
                    # approach would train only on the message *content* tokens,
                    # excluding suffix/tool tokens. strip_bos_token=True above, so
                    # only the role_prefix needs to be accounted for.
                    res_prefix = self._tokenize(role_prefix, add_eos_token=False, strip_bos_token=True)
                    prefix_len = len(res_prefix["input_ids"])

                    # Labels: IGNORE for the prefix, real token ids for the rest
                    labels = [IGNORE_TOKEN_ID] * prefix_len + copy.deepcopy(res["input_ids"])[prefix_len:]

            elif role == "tool":
                # Tool messages are tricky here: the template nests them inside the
                # assistant turn, so the Prompter should ideally not yield a separate
                # 'tool' role. For compatibility we wrap them minimally, which may
                # not match the template exactly.
                role_prefix = "["
                role_suffix = "]"

                full_text = role_prefix + prefix + message + role_suffix
                res = self._tokenize(full_text, add_eos_token=False, strip_bos_token=True)
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])  # Tool output is not trained on

            else:
                LOG.warning(f"unknown role in conversation: {role}")
                continue

            part_length = len(res["input_ids"])
            if total_length + part_length > self.sequence_len:
                break

            tokenized_parts.append({
                "input_ids": res["input_ids"],
                "attention_mask": res["attention_mask"],
                "labels": labels,
                "role": role,
                "loss": loss,
            })
            total_length += part_length
            not_first_turn = True

        result = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
        }

        # Drop trailing turns that are human/user/system/tool or carry no loss
        while tokenized_parts and (
            tokenized_parts[-1]["role"] in ["human", "user", "system", "tool"]
            or not tokenized_parts[-1]["loss"]
        ):
            tokenized_parts.pop()

        # Ensure we still have a full conversation (a user turn and a model turn)
        if not any(part["role"] in ["human", "user", "system"] for part in tokenized_parts):
            return result
        if not any(part["role"] in ["model", "gpt"] for part in tokenized_parts):
            return result

        # Concatenate the final result
        for part in tokenized_parts:
            result["input_ids"] += part["input_ids"]
            result["attention_mask"] += part["attention_mask"]
            result["labels"] += part["labels"]

        return result
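
    # Resulting label layout, roughly (exact token counts depend on the tokenizer):
    #   context block + user turns:  all IGNORE_TOKEN_ID (-100)
    #   assistant turns (loss=True): IGNORE for <|assistant_start|>, then real
    #                                token ids through <|assistant_end|> and EOS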

    # The helpers below are largely redundant now that explicit role
    # prefix/suffix tokens carry the structure, but they are kept for
    # compatibility with the base class.
    def _tokenize_with_turn(self, role_prefix, message, not_first_turn, add_eos_token=True):
        # Simplified to ignore the turn_separator and rely on the role prefixes
        full_message = role_prefix + message.strip()
        return self._tokenize(full_message, add_eos_token=add_eos_token, strip_bos_token=True)

    def _get_labels(self, res, loss, not_first_turn):
        # Redefined to mask out the pre-tokenized assistant_prefix
        if not loss:
            return [IGNORE_TOKEN_ID] * len(res["input_ids"])

        prefix_len = len(self.bot_prefix_token_ids)
        return [IGNORE_TOKEN_ID] * prefix_len + copy.deepcopy(res["input_ids"])[prefix_len:]


class DanApertusPrompter:
    """
    Prompter for the DanApertus format, yielding (role, message, loss, prefix)
    tuples; the complex formatting is handled by the TokenizingStrategy.
    """

    def __init__(self, *args, **kwargs):
        pass

    def build_prompt(self, source, *args, **kwargs) -> Generator[Tuple[str, str, bool, str], None, None]:
        for msg in source:
            from_value = msg["from"]
            # 'value' is assumed to hold the text content of the message
            message_value = msg["value"]

            # Train on model/gpt turns by default; a per-message "loss" field
            # overrides this.
            loss = msg.get("loss")
            if loss is None:
                loss = from_value in ["gpt", "model"]

            # Prefix defaults to an empty string if not present
            prefix = msg.get("prefix", "")

            yield from_value, message_value, loss, prefix

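# Expected input record, ShareGPT-style, as consumed by build_prompt above
# (illustrative sketch; "loss" and "prefix" are optional per-message fields):
#
#   {
#       "conversations": [
#           {"from": "system", "value": "You are Dolphin."},
#           {"from": "human", "value": "Hello!"},
#           {"from": "gpt", "value": "Hi! How can I help?"},
#       ]
#   }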

def load(tokenizer, cfg):
    # This remains the entry point
    return DanApertusPromptTokenizingStrategy(
        DanApertusPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
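

# Rough usage sketch (hypothetical values; in practice axolotl wires this up
# itself from the dataset config):
#
#   from types import SimpleNamespace
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("path/to/apertus-tokenizer")  # hypothetical path
#   cfg = SimpleNamespace(train_on_inputs=False, sequence_len=4096)
#   strategy = load(tokenizer, cfg)
#   sample = strategy.tokenize_prompt({"conversations": [
#       {"from": "human", "value": "Hello!"},
#       {"from": "gpt", "value": "Hi! How can I help?"},
#   ]})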