diff --git a/docs/source/en/model_doc/cpm.mdx b/docs/source/en/model_doc/cpm.mdx
index 189c4a4ac4..ac8ed8fdba 100644
--- a/docs/source/en/model_doc/cpm.mdx
+++ b/docs/source/en/model_doc/cpm.mdx
@@ -38,3 +38,7 @@ Note: We only have a tokenizer here, since the model architecture is the same as
 ## CpmTokenizer
 
 [[autodoc]] CpmTokenizer
+
+## CpmTokenizerFast
+
+[[autodoc]] CpmTokenizerFast
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 05c11d4c54..409448a7ae 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -168,7 +168,7 @@ _import_structure = {
     ],
     "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
     "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"],
-    "models.cpm": ["CpmTokenizer"],
+    "models.cpm": [],
     "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
     "models.data2vec": [
         "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -197,7 +197,7 @@ _import_structure = {
     "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"],
     "models.encoder_decoder": ["EncoderDecoderConfig"],
     "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"],
-    "models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig", "FNetTokenizer"],
+    "models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"],
     "models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
     "models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
     "models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"],
@@ -419,7 +419,9 @@ if is_sentencepiece_available():
     _import_structure["models.bert_generation"].append("BertGenerationTokenizer")
     _import_structure["models.big_bird"].append("BigBirdTokenizer")
     _import_structure["models.camembert"].append("CamembertTokenizer")
+    _import_structure["models.cpm"].append("CpmTokenizer")
     _import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
+    _import_structure["models.fnet"].append("FNetTokenizer")
     _import_structure["models.layoutxlm"].append("LayoutXLMTokenizer")
     _import_structure["models.m2m_100"].append("M2M100Tokenizer")
     _import_structure["models.marian"].append("MarianTokenizer")
@@ -457,6 +459,7 @@ if is_tokenizers_available():
     _import_structure["models.camembert"].append("CamembertTokenizerFast")
     _import_structure["models.clip"].append("CLIPTokenizerFast")
     _import_structure["models.convbert"].append("ConvBertTokenizerFast")
+    _import_structure["models.cpm"].append("CpmTokenizerFast")
     _import_structure["models.deberta"].append("DebertaTokenizerFast")
     _import_structure["models.deberta_v2"].append("DebertaV2TokenizerFast")
     _import_structure["models.distilbert"].append("DistilBertTokenizerFast")
@@ -2575,7 +2578,6 @@ if TYPE_CHECKING:
     )
     from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer
     from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig
-    from .models.cpm import CpmTokenizer
     from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
     from .models.data2vec import (
         DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -2605,7 +2607,7 @@ if TYPE_CHECKING:
     from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer
     from .models.encoder_decoder import EncoderDecoderConfig
     from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer
-    from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig, FNetTokenizer
+    from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig
    from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
     from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
     from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
@@ -2809,7 +2811,9 @@ if TYPE_CHECKING:
         from .models.bert_generation import BertGenerationTokenizer
         from .models.big_bird import BigBirdTokenizer
         from .models.camembert import CamembertTokenizer
+        from .models.cpm import CpmTokenizer
         from .models.deberta_v2 import DebertaV2Tokenizer
+        from .models.fnet import FNetTokenizer
         from .models.layoutxlm import LayoutXLMTokenizer
         from .models.m2m_100 import M2M100Tokenizer
         from .models.marian import MarianTokenizer
@@ -2840,6 +2844,7 @@ if TYPE_CHECKING:
         from .models.camembert import CamembertTokenizerFast
         from .models.clip import CLIPTokenizerFast
         from .models.convbert import ConvBertTokenizerFast
+        from .models.cpm import CpmTokenizerFast
         from .models.deberta import DebertaTokenizerFast
         from .models.deberta_v2 import DebertaV2TokenizerFast
         from .models.distilbert import DistilBertTokenizerFast
diff --git a/src/transformers/models/fnet/__init__.py b/src/transformers/models/fnet/__init__.py
index 7b09e97ab1..1cc474c74c 100644
--- a/src/transformers/models/fnet/__init__.py
+++ b/src/transformers/models/fnet/__init__.py
@@ -17,14 +17,18 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
+from transformers import is_sentencepiece_available
+
 from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
     "configuration_fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"],
-    "tokenization_fnet": ["FNetTokenizer"],
 }
 
+if is_sentencepiece_available():
+    _import_structure["tokenization_fnet"] = ["FNetTokenizer"]
+
 if is_tokenizers_available():
     _import_structure["tokenization_fnet_fast"] = ["FNetTokenizerFast"]
 
@@ -46,7 +50,9 @@ if is_torch_available():
 
 if TYPE_CHECKING:
     from .configuration_fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig
-    from .tokenization_fnet import FNetTokenizer
+
+    if is_sentencepiece_available():
+        from .tokenization_fnet import FNetTokenizer
 
     if is_tokenizers_available():
         from .tokenization_fnet_fast import FNetTokenizerFast
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 37d52fc094..00989dc0d1 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -45,6 +45,13 @@ class CamembertTokenizer(metaclass=DummyObject):
         requires_backends(self, ["sentencepiece"])
 
 
+class CpmTokenizer(metaclass=DummyObject):
+    _backends = ["sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["sentencepiece"])
+
+
 class DebertaV2Tokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 
@@ -52,6 +59,13 @@ class DebertaV2Tokenizer(metaclass=DummyObject):
         requires_backends(self, ["sentencepiece"])
 
 
+class FNetTokenizer(metaclass=DummyObject):
+    _backends = ["sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["sentencepiece"])
+
+
 class LayoutXLMTokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
index 64c7541649..12cec6a4a2 100644
--- a/src/transformers/utils/dummy_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -73,6 +73,13 @@ class ConvBertTokenizerFast(metaclass=DummyObject):
         requires_backends(self, ["tokenizers"])
 
 
+class CpmTokenizerFast(metaclass=DummyObject):
+    _backends = ["tokenizers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tokenizers"])
+
+
 class DebertaTokenizerFast(metaclass=DummyObject):
     _backends = ["tokenizers"]
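
Note (not part of the patch): the change above applies the standard Transformers optional-backend pattern. A tokenizer that needs an optional dependency is registered in `_import_structure` only when that backend is installed; otherwise the top-level package exposes the generated dummy class, whose `DummyObject` metaclass routes any use through `requires_backends`, raising an informative `ImportError` at use time rather than breaking `import transformers`. Below is a minimal sketch of the behavior this patch targets, assuming an environment where sentencepiece is not installed; `TsinghuaAI/CPM-Generate` is the checkpoint named in the CPM docs, and the exact error wording may vary by version:

```python
# Sketch only: the import itself succeeds, because without sentencepiece
# `CpmTokenizer` resolves to the dummy class from
# dummy_sentencepiece_objects.py rather than the real tokenizer.
from transformers import CpmTokenizer

try:
    # Any concrete use of the dummy class (instantiation or attribute
    # access such as from_pretrained) calls requires_backends(...).
    tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")
except ImportError as err:
    # Expect a message pointing at the missing sentencepiece backend,
    # along the lines of "CpmTokenizer requires the SentencePiece
    # library but it was not found in your environment."
    print(err)
```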