Source code for speeq.data.tokenizers

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Union

from speeq.constants import CHAR_TOKENIZER_TYPE, TOKENIZER_TYPE_KEY, WORD_TOKENIZER_TYPE
from speeq.interfaces import ITokenizer
from speeq.utils.utils import load_json, save_json

from .decorators import check_token

# Default string forms of the special tokens. Each constant is the default
# ``token`` argument of the matching ``BaseTokenizer.add_*_token`` method and
# is also the value handed to the ``check_token`` decorator on that method.
PAD = "<PAD>"
OOV = "<OOV>"
SOS = "<SOS>"
EOS = "<EOS>"
BLANK = "<BLANK>"


@dataclass
class _SpecialTokens:
    _pad: Tuple[str, int] = (None, None)
    _blank: Tuple[str, int] = (None, None)
    _sos: Tuple[str, int] = (None, None)
    _eos: Tuple[str, int] = (None, None)
    _oov: Tuple[str, int] = (None, None)

    @property
    def pad_id(self):
        return self._pad[1]

    @property
    def pad_token(self):
        return self._pad[0]

    @property
    def blank_id(self):
        return self._blank[1]

    @property
    def blank_token(self):
        return self._blank[0]

    @property
    def sos_id(self):
        return self._sos[1]

    @property
    def sos_token(self):
        return self._sos[0]

    @property
    def eos_id(self):
        return self._eos[1]

    @property
    def eos_token(self):
        return self._eos[0]

    @property
    def mask_id(self):
        return self._mask[1]

    @property
    def mask_token(self):
        return self._mask[0]

    @property
    def oov_id(self):
        return self._oov[1]

    @property
    def oov_token(self):
        return self._oov[0]


class BaseTokenizer(ITokenizer):
    """Vocabulary bookkeeping shared by the concrete tokenizers.

    Maintains a token -> id mapping together with its inverse, tracks the
    registered special tokens, and saves/loads that state as JSON.

    The class itself does not define ``_type``, ``get_tokens`` or
    ``preprocess_tokens``; they are read from ``self`` and are supplied by
    the subclasses in this module.
    """

    # Keys used in the serialised (dict / JSON) form of the tokenizer.
    _pad_key = "pad"
    _oov_key = "oov"
    _sos_key = "sos"
    _eos_key = "eos"
    _blank_key = "blank"
    _token_to_id_key = "token_to_id"
    _special_tokens_key = "special_tokens"

    def __init__(self) -> None:
        super().__init__()
        self._token_to_id = dict()
        self._id_to_token = dict()
        self.special_tokens = _SpecialTokens()
        # The OOV token is always registered, so it takes the first id.
        self.add_oov_token()

    @property
    def vocab_size(self) -> int:
        """Number of tokens currently in the vocabulary."""
        return len(self._token_to_id)

    def add_token(self, token: str) -> int:
        """Adds the provided token to the tokenizer.

        Adding a token that is already known is a no-op that returns the
        existing id.

        Args:
            token (str): The token to be added.

        Returns:
            int: The id of the token.
        """
        if token in self._token_to_id:
            return self._token_to_id[token]
        # The next free id is the current vocabulary size.
        token_id = self.vocab_size
        self._token_to_id[token] = token_id
        self._id_to_token[token_id] = token
        return token_id

    @check_token(PAD)
    def add_pad_token(self, token=PAD) -> ITokenizer:
        """Adds the PAD token and records it as a special token.

        Args:
            token (str, optional): The PAD token string. Defaults to PAD.

        Returns:
            ITokenizer: This tokenizer.
        """
        token_id = self.add_token(token)
        self.special_tokens._pad = (token, token_id)
        return self

    @check_token(BLANK)
    def add_blank_token(self, token=BLANK) -> ITokenizer:
        """Adds the BLANK token and records it as a special token.

        Args:
            token (str, optional): The BLANK token string. Defaults to BLANK.

        Returns:
            ITokenizer: This tokenizer.
        """
        token_id = self.add_token(token)
        self.special_tokens._blank = (token, token_id)
        return self

    @check_token(SOS)
    def add_sos_token(self, token=SOS) -> ITokenizer:
        """Adds the SOS token and records it as a special token.

        Args:
            token (str, optional): The SOS token string. Defaults to SOS.

        Returns:
            ITokenizer: This tokenizer.
        """
        token_id = self.add_token(token)
        self.special_tokens._sos = (token, token_id)
        return self

    @check_token(EOS)
    def add_eos_token(self, token=EOS) -> ITokenizer:
        """Adds the EOS token and records it as a special token.

        Args:
            token (str, optional): The EOS token string. Defaults to EOS.

        Returns:
            ITokenizer: This tokenizer.
        """
        token_id = self.add_token(token)
        self.special_tokens._eos = (token, token_id)
        return self

    @check_token(OOV)
    def add_oov_token(self, token=OOV) -> ITokenizer:
        """Adds the OOV token and records it as a special token.

        Args:
            token (str, optional): The OOV token string. Defaults to OOV.

        Returns:
            ITokenizer: This tokenizer.
        """
        token_id = self.add_token(token)
        self.special_tokens._oov = (token, token_id)
        return self

    def _reset_id_to_token(self) -> None:
        """Rebuilds the id -> token mapping from the token -> id mapping."""
        self._id_to_token = {
            token_id: token for token, token_id in self._token_to_id.items()
        }

    def _special_token_fields(self) -> tuple:
        """Pairs each serialisation key with its ``_SpecialTokens`` field."""
        return (
            (self._pad_key, "_pad"),
            (self._blank_key, "_blank"),
            (self._sos_key, "_sos"),
            (self._eos_key, "_eos"),
            (self._oov_key, "_oov"),
        )

    def __set_special_tokens_dict(self, data: dict) -> None:
        """Restores the special tokens present in ``data``.

        Keys missing from ``data`` leave the current value untouched.
        """
        for key, field_name in self._special_token_fields():
            if key in data:
                # Stored as lists in JSON; kept as tuples in memory.
                setattr(self.special_tokens, field_name, tuple(data[key]))

    def __get_special_tokens_dict(self) -> dict:
        """Returns the registered special tokens as ``key -> [token, id]``."""
        data = {}
        for key, field_name in self._special_token_fields():
            pair = getattr(self.special_tokens, field_name)
            # An id of None means the special token was never registered.
            if pair[1] is not None:
                data[key] = list(pair)
        return data

    def load_tokenizer_from_dict(self, data: dict) -> ITokenizer:
        """Loads a pre-trained tokenizer of type dict.

        Args:
            data (dict): The pre-trained tokenizer dictionary. It must hold
                the token -> id mapping and the special tokens mapping.

        Returns:
            ITokenizer: The loaded tokenizer.
        """
        # Copy the mapping so later add_token calls do not mutate the
        # dictionary owned by the caller.
        self._token_to_id = dict(data[self._token_to_id_key])
        self.__set_special_tokens_dict(data[self._special_tokens_key])
        self._reset_id_to_token()
        return self

    def load_tokenizer(
        self, tokenizer_path: Union[str, Path], *args, **kwargs
    ) -> ITokenizer:
        """Loads a pre-trained tokenizer.

        Args:
            tokenizer_path (Union[str, Path]): The pre-trained tokenizer path.

        Raises:
            FileNotFoundError: If ``tokenizer_path`` does not exist.
            AssertionError: If the stored tokenizer type differs from the
                type of this tokenizer.

        Returns:
            ITokenizer: The loaded tokenizer.
        """
        if not os.path.exists(tokenizer_path):
            raise FileNotFoundError(f"{tokenizer_path} not found!")
        data = load_json(tokenizer_path)
        # Kept as an assert so callers keep seeing AssertionError here.
        assert data[TOKENIZER_TYPE_KEY] == self._type, (
            " The used tokenizer is not matched with the pre-trained tokenizer!"
            f" Given pre-trained tokenizer of type {data[TOKENIZER_TYPE_KEY]}"
            f" while {self._type} is used! "
        )
        return self.load_tokenizer_from_dict(data)

    def set_tokenizer(self, data: List[str], *args, **kwargs) -> ITokenizer:
        """Sets/trains the tokenizer on the provided data.

        Args:
            data (List[str]): A list of all text sentences.

        Returns:
            ITokenizer: The trained tokenizer.
        """
        all_tokens = self.get_tokens(data)
        # The tokenizers in this module return a set of strings, whose
        # iteration order can change between interpreter runs. Sorting makes
        # the assigned ids reproducible for the same data.
        for token in sorted(all_tokens):
            self.add_token(token=token)
        self._reset_id_to_token()
        return self

    def save_tokenizer(self, save_path: Union[str, Path], *args, **kwargs) -> None:
        """Saves the tokenizer to a json file.

        Args:
            save_path (Union[str, Path]): The path to save the tokenizer to.
        """
        data = {
            TOKENIZER_TYPE_KEY: self._type,
            self._token_to_id_key: self._token_to_id,
            self._special_tokens_key: self.__get_special_tokens_dict(),
        }
        save_json(save_path, data)

    def ids2tokens(self, ids: List[int]) -> List[str]:
        """Converts a list of integers to a list of strings.

        Args:
            ids (List[int]): The list of tokens ids.

        Raises:
            KeyError: If an id is not part of the vocabulary.

        Returns:
            List[str]: A list of string.
        """
        return [self._id_to_token[token_id] for token_id in ids]

    def tokenize(self, sentence: str, add_sos=False, add_eos=False) -> List[int]:
        """Tokenizes the input sentence.

        Tokens missing from the vocabulary are mapped to the OOV id.

        Args:
            sentence (str): The sentence to be tokenized.
            add_sos (bool, optional): A flag to whether add SOS token at the
                start of the sequence. Defaults to False.
            add_eos (bool, optional): A flag to whether add EOS token at the
                end of the sequence. Defaults to False.

        Raises:
            AssertionError: If SOS/EOS is requested but was never added.

        Returns:
            List[int]: The tokenized sequence.
        """
        results = []
        if add_sos is True:
            assert self.special_tokens.sos_id is not None
            results.append(self.special_tokens.sos_id)
        oov_id = self.special_tokens.oov_id
        results.extend(
            self._token_to_id.get(token, oov_id)
            for token in self.preprocess_tokens(sentence)
        )
        if add_eos is True:
            assert self.special_tokens.eos_id is not None
            results.append(self.special_tokens.eos_id)
        return results

    def batch_tokenizer(self, data: List[str], add_sos=False, add_eos=False) -> list:
        """Tokenizes every sentence in ``data``.

        Args:
            data (List[str]): The sentences to tokenize.
            add_sos (bool, optional): Forwarded to ``tokenize``.
                Defaults to False.
            add_eos (bool, optional): Forwarded to ``tokenize``.
                Defaults to False.

        Returns:
            list: One list of token ids per input sentence.
        """
        return [
            self.tokenize(sentence=sentence, add_sos=add_sos, add_eos=add_eos)
            for sentence in data
        ]

    def batch_detokenizer(self, data: List[List[int]]) -> list:
        """Converts every id sequence in ``data`` back to tokens.

        Args:
            data (List[List[int]]): The sequences of token ids.

        Returns:
            list: One list of token strings per input sequence.
        """
        return [self.ids2tokens(ids) for ids in data]
class CharTokenizer(BaseTokenizer):
    """Tokenizer that treats every single character as one token."""

    _type = CHAR_TOKENIZER_TYPE

    def __init__(self) -> None:
        super().__init__()

    def get_tokens(self, data: List[str]):
        """Returns the set of distinct characters found across ``data``."""
        unique_chars = set()
        for sentence in data:
            unique_chars.update(sentence)
        return unique_chars

    def preprocess_tokens(self, sentence: str) -> List[str]:
        """Splits ``sentence`` into its individual characters."""
        return [*sentence]
class WordTokenizer(BaseTokenizer):
    """Implements a separator (white space by default) based tokenizer.

    Args:
        sep (str, optional): The string the sentences are split on.
            Defaults to a single space.
    """

    _type = WORD_TOKENIZER_TYPE

    def __init__(self, sep=" ") -> None:
        super().__init__()
        self.sep = sep

    def get_tokens(self, data: List[str]):
        """Returns the set of distinct non-empty words found in ``data``.

        Args:
            data (List[str]): The text sentences.

        Returns:
            set: The unique words, without the empty string.
        """
        result = set()
        for line in data:
            # In-place update avoids rebuilding the whole set on every line.
            result.update(line.split(self.sep))
        # Consecutive separators yield empty strings; they are not words.
        result.discard("")
        return result

    def preprocess_tokens(self, sentence: str) -> List[str]:
        """Splits ``sentence`` into its non-empty words.

        Empty strings produced by consecutive, leading or trailing
        separators are dropped. ``get_tokens`` never puts them in the
        vocabulary, so keeping them would add spurious OOV ids.

        Args:
            sentence (str): The sentence to split.

        Returns:
            List[str]: The words of the sentence in order.
        """
        return [word for word in sentence.split(self.sep) if word]