diff --git a/src/tokenizers.js b/src/tokenizers.js index ad257fd..ee1aa19 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -849,7 +849,7 @@ class BertPreTokenizer extends PreTokenizer { constructor(config) { super(); // TODO use config - this.pattern = /\b\p{L}+\b|[^\s\p{L}]+/gu + this.pattern = /\b\p{L}+\b|[^\s\p{L}]/gu; } /** * Tokenizes a single text using the BERT pre-tokenization scheme.