Source code for omnigenbench.src.tokenizer.single_nucleotide_tokenizer

# -*- coding: utf-8 -*-
# file: single_nucleotide_tokenizer.py
# time: 18:05 08/04/2024
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2025. All Rights Reserved.

import warnings

from transformers import AutoTokenizer

from ..abc.abstract_tokenizer import OmniTokenizer

warnings.filterwarnings("once")



[docs]
class OmniSingleNucleotideTokenizer(OmniTokenizer):
    """
    Tokenizer for single nucleotide tokenization in genomics.

    This tokenizer converts genomic sequences into individual nucleotide tokens,
    where each nucleotide (A, T, C, G, U) becomes a separate token. It's designed
    for genomic sequence processing where fine-grained nucleotide-level analysis
    is required.

    The tokenizer supports various preprocessing options including U/T conversion
    and whitespace addition between nucleotides. It also handles special tokens
    like BOS (beginning of sequence) and EOS (end of sequence) tokens.

    Attributes:
        u2t (bool): Whether to convert 'U' to 'T'.
        t2u (bool): Whether to convert 'T' to 'U'.
        add_whitespace (bool): Whether to add whitespace between nucleotides.
    """

    def __init__(self, base_tokenizer=None, **kwargs):
        """
        Set up the wrapper and record the concrete class name in the metadata.

        Args:
            base_tokenizer: Tokenizer object handed to the parent initializer
                (for example a Hugging Face tokenizer).
            **kwargs: Extra keyword arguments forwarded unchanged to the
                parent initializer.

        Example:
            >>> from transformers import AutoTokenizer
            >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
            >>> tokenizer = OmniSingleNucleotideTokenizer(base_tokenizer)
        """
        super().__init__(base_tokenizer, **kwargs)
        # NOTE(review): relies on the parent initializer having created
        # ``self.metadata`` as a mutable mapping -- confirm in OmniTokenizer.
        tokenizer_name = type(self).__name__
        self.metadata["tokenizer_name"] = tokenizer_name

    def __call__(self, sequence, **kwargs):
        """
        Tokenizes sequences using single nucleotide tokenization.

        This method converts genomic sequences into tokenized inputs suitable
        for model training and inference. It handles sequence preprocessing,
        tokenization, and padding/truncation.

        Args:
            sequence (str or list): A single sequence or list of sequences to tokenize.
            **kwargs: Additional arguments for tokenization:
                - max_length (int): Maximum sequence length.
                - padding (str): Padding strategy.
                - truncation (bool): Whether to truncate sequences.
                - warnings (bool): Whether to show warnings for unknown tokens.

        Returns:
            dict: A dictionary containing tokenized inputs:
                - input_ids: Token IDs for the sequences
                - attention_mask: Attention mask for the sequences

        Example:
            >>> # Tokenize a single sequence
            >>> inputs = tokenizer("ATCGATCG")
            >>> print(inputs['input_ids'].shape)  # torch.Size([1, seq_len])

            >>> # Tokenize multiple sequences
            >>> inputs = tokenizer(["ATCGATCG", "GCTAGCTA"])
            >>> print(inputs['input_ids'].shape)  # torch.Size([2, seq_len])
        """
        if self.u2t:
            sequence = "".join([seq.replace("U", "T").upper() for seq in sequence])
        if self.t2u:
            sequence = "".join([seq.replace("T", "U").upper() for seq in sequence])
        if self.add_whitespace:
            sequence = " ".join(list(sequence))
        sequence_tokens = self.tokenize(sequence)[
            : kwargs.get("max_length", self.max_length) - 2
        ]
        tokenized_inputs = {
            "input_ids": [],
            "attention_mask": [],
        }
        bos_id = (
            self.base_tokenizer.bos_token_id
            if self.base_tokenizer.bos_token_id is not None
            else self.base_tokenizer.cls_token_id
        )
        eos_id = (
            self.base_tokenizer.eos_token_id
            if self.base_tokenizer.eos_token_id is not None
            else self.base_tokenizer.sep_token_id
        )
        for tokens in sequence_tokens:
            tokenized_inputs["input_ids"].append(
                [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
            )
            tokenized_inputs["attention_mask"].append(
                [1] * len(tokenized_inputs["input_ids"][-1])
            )

        if kwargs.get("warnings", True):
            for i, ids in enumerate(tokenized_inputs["input_ids"]):
                if ids.count(self.base_tokenizer.unk_token_id) / len(ids) > 0.1:
                    warnings.warn(
                        f"Unknown tokens are more than "
                        f"{ids.count(self.base_tokenizer.unk_token_id) / len(ids)}% in the {i}-th sequence, "
                        f"please check the tokenization process."
                    )
        max_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
        tokenized_inputs = self.base_tokenizer.pad(
            tokenized_inputs,
            padding=kwargs.get("padding", "max_length"),
            max_length=min(max_length, kwargs.get("max_length", 512)),
            return_attention_mask=kwargs.get("return_attention_mask", True),
            return_tensors="pt",
        )
        return tokenized_inputs


[docs]
    @staticmethod
    def from_pretrained(config_or_model, **kwargs):
        """
        Build a single nucleotide tokenizer around a pre-trained tokenizer.

        The base tokenizer is obtained through
        ``AutoTokenizer.from_pretrained`` and then wrapped in an
        ``OmniSingleNucleotideTokenizer``.

        Args:
            config_or_model (str): The name or path of the pre-trained model.
            **kwargs: Keyword arguments forwarded to
                ``AutoTokenizer.from_pretrained``; they are not passed to the
                wrapper's own initializer.

        Returns:
            OmniSingleNucleotideTokenizer: The wrapping tokenizer instance.

        Example:
            >>> tokenizer = OmniSingleNucleotideTokenizer.from_pretrained("model_name")
        """
        hf_tokenizer = AutoTokenizer.from_pretrained(config_or_model, **kwargs)
        return OmniSingleNucleotideTokenizer(hf_tokenizer)



[docs]
    def tokenize(self, sequence, **kwargs):
        """
        Converts a sequence into a list of individual nucleotide tokens.

        Every character of a sequence becomes its own token. A single string
        is treated as a batch of one; any other input is treated as an
        iterable of sequences.

        Args:
            sequence (str or iterable of str): A single sequence or several
                sequences to tokenize.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            list: A list with one inner list per input sequence, each holding
                  that sequence's characters in order.

        Example:
            >>> # Tokenize a single sequence
            >>> tokens = tokenizer.tokenize("ATCGATCG")
            >>> print(tokens)  # [['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']]

            >>> # Tokenize multiple sequences
            >>> tokens = tokenizer.tokenize(["ATCGATCG", "GCTAGCTA"])
            >>> print(tokens)  # [['A', 'T', 'C', 'G', ...], ['G', 'C', 'T', 'A', ...]]
        """
        # A bare string is a batch of one; without this check it would be
        # iterated character by character as if each were a sequence.
        sequences = [sequence] if isinstance(sequence, str) else sequence
        return [list(seq) for seq in sequences]



[docs]
    def encode(self, sequence, **kwargs):
        """
        Encode a sequence into token IDs via the base tokenizer.

        The call is delegated unchanged to ``self.base_tokenizer.encode``, so
        the tokenization scheme and the resulting IDs are those of the base
        tokenizer.

        Args:
            sequence (str): The input sequence to encode.
            **kwargs: Keyword arguments forwarded to the base tokenizer.

        Returns:
            list: Whatever ``self.base_tokenizer.encode`` returns (a list of
                token IDs).

        Example:
            >>> token_ids = tokenizer.encode("ATCGATCG")
            >>> print(token_ids)  # e.g. [1, 2, 3, 4, 1, 2, 3, 4]
        """
        base = self.base_tokenizer
        token_ids = base.encode(sequence, **kwargs)
        return token_ids



[docs]
    def decode(self, sequence, **kwargs):
        """
        Decode token IDs back into a sequence via the base tokenizer.

        The call is delegated unchanged to ``self.base_tokenizer.decode``.

        Args:
            sequence (list): A list of token IDs.
            **kwargs: Keyword arguments forwarded to the base tokenizer.

        Returns:
            str: Whatever ``self.base_tokenizer.decode`` returns (the decoded
                sequence).

        Example:
            >>> sequence = tokenizer.decode([1, 2, 3, 4])
            >>> print(sequence)  # e.g. "ATCG"
        """
        base = self.base_tokenizer
        decoded = base.decode(sequence, **kwargs)
        return decoded



[docs]
    def encode_plus(self, sequence, **kwargs):
        """
        Encode a sequence and return the base tokenizer's full encoding.

        The call is delegated unchanged to
        ``self.base_tokenizer.encode_plus``; which fields the result contains
        (for example attention masks) is decided by the base tokenizer.

        Args:
            sequence (str): The input sequence to encode.
            **kwargs: Keyword arguments forwarded to the base tokenizer.

        Returns:
            dict: Whatever ``self.base_tokenizer.encode_plus`` returns.

        Example:
            >>> encoded = tokenizer.encode_plus("ATCGATCG")
            >>> print(encoded.keys())  # e.g. dict_keys(['input_ids', 'attention_mask'])
        """
        base = self.base_tokenizer
        encoding = base.encode_plus(sequence, **kwargs)
        return encoding