Fix BPE tokenization for weird whitespace characters (Closes #199) (#208)

* Add new tokenizer unit test (#199)

* Perform `NFKC` normalization for sentencepiece models w/ precompiled charmap

* Fix JSDoc indentation

* Add problematic string to unit tests

* Use a consistent BPE split token (see the sketch after this list)

* Add second problematic string
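
The underlying bug: the `BPE` model joined intermediate symbols with a plain space but split them back apart with `/\s+/`, a pattern that also matches whitespace characters (such as U+00A0, no-break space) that can legitimately occur *inside* a token. A minimal sketch of the failure mode (illustrative only, not the library code):

```js
const token = 'you\u2026\u00A0\u00A0'; // "you…" followed by two no-break spaces

// Joining symbols with ' ' and splitting back with /\s+/ is not round-trip safe,
// because /\s+/ also matches the no-break spaces inside the token itself:
const joined = Array.from(token).join(' ');
console.log(joined.split(/\s+/)); // [ 'y', 'o', 'u', '…', '' ]  — both NBSPs are lost
console.log(joined.split(' '));   // [ 'y', 'o', 'u', '…', '\u00A0', '\u00A0' ] — lossless
```

Using one literal `BPE_SPLIT_TOKEN = ' '` for both joining and splitting keeps the round trip consistent, which is what the diff below does.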
Author: Joshua Lochner, 2023-07-22 04:51:11 +02:00 (committed by GitHub)
Parent: 86e68bf9c0
Commit: 1165f04a9f
2 changed files with 17 additions and 9 deletions


@@ -475,6 +475,8 @@ class BPE extends TokenizerModel {
     constructor(config) {
         super(config);
+        this.BPE_SPLIT_TOKEN = ' ';
         this.tokens_to_ids = config.vocab;
         this.unk_token_id = this.tokens_to_ids.get(config.unk_token);
@@ -486,7 +488,7 @@ class BPE extends TokenizerModel {
         }
         this.bpe_ranks = Object.fromEntries(config.merges.map((x, i) => [x, i]));
-        this.merges = config.merges.map(x => x.split(/\s+/))
+        this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
         this.end_of_word_suffix = config.end_of_word_suffix;
@@ -511,7 +513,7 @@ class BPE extends TokenizerModel {
         let prev_char = word[0];
         for (let i = 1; i < word.length; ++i) {
             let char = word[i];
-            pairs.add(`${prev_char} ${char}`);
+            pairs.add(prev_char + this.BPE_SPLIT_TOKEN + char);
             prev_char = char;
         }
         return Array.from(pairs);
@@ -548,7 +550,7 @@ class BPE extends TokenizerModel {
            if (!(bigram in this.bpe_ranks)) {
                break;
            }
-           let [first, second] = bigram.split(/\s+/g)
+           let [first, second] = bigram.split(this.BPE_SPLIT_TOKEN);
            let new_word = [];
            let i = 0;
            let j = -1;
@@ -579,7 +581,7 @@ class BPE extends TokenizerModel {
                pairs = this.get_pairs(word);
            }
        }
-       let final_word = word.join(" ");
+       let final_word = word.join(this.BPE_SPLIT_TOKEN);
        this.cache[token] = final_word;
        return final_word;
    }
@@ -593,7 +595,7 @@ class BPE extends TokenizerModel {
        let outputTokens = [];
        for (let token of tokens) {
-           let bpe_token_list = this.bpe(token).split(' ');
+           let bpe_token_list = this.bpe(token).split(this.BPE_SPLIT_TOKEN);
            for (let t of bpe_token_list) {
                if (this.tokens_to_ids.has(t)) {
@@ -801,10 +803,10 @@ class NormalizerSequence extends Normalizer {
        this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x));
    }
    /**
-   * Apply a sequence of Normalizers to the input text.
-   * @param {string} text The text to normalize.
-   * @returns {string} The normalized text.
-   */
+    * Apply a sequence of Normalizers to the input text.
+    * @param {string} text The text to normalize.
+    * @returns {string} The normalized text.
+    */
    normalize(text) {
        return this.normalizers.reduce((t, normalizer) => {
            return normalizer.normalize(t);
@@ -1758,6 +1760,9 @@ class Precompiled extends Normalizer {
     */
    normalize(text) {
        // TODO use this.charsmap
+       // For now, we just apply NFKC normalization
+       // https://github.com/huggingface/tokenizers/blob/291b2e23ae81cf94738835852213ce120152d121/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L34
+       text = text.normalize('NFKC');
        return text;
    }
}
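
For the sentencepiece side, NFKC compatibility normalization rewrites characters like U+00A0 and U+2026 into their compatibility equivalents before the BPE model sees them. A quick illustration with the standard `String.prototype.normalize` (not part of this commit's code):

```js
'\u00A0'.normalize('NFKC'); // ' '   — no-break space becomes a regular space
'\u2026'.normalize('NFKC'); // '...' — horizontal ellipsis becomes three dots

// The problematic string from the issue is plain ASCII after normalization:
'you\u2026\u00A0\u00A0'.normalize('NFKC'); // 'you...  '
```

The test strings added below exercise exactly these characters.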


@@ -38,6 +38,9 @@ TOKENIZER_TEST_DATA = {
        "The company was founded in 2016.",
        "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
        "I bought an apple for $1.00 at the store.",
+       "you… ",
+       "\u0079\u006F\u0075\u2026\u00A0\u00A0",
+       "\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
    ],
    "custom": {
        "tiiuae/falcon-7b": [