* Add new tokenizer unit test (#199)
* Perform `NFKC` normalization for sentencepiece models w/ precompiled charmap
* Fix JSDoc indentation
* Add problematic string to unit tests
* Use consistent BPE split token
* Add second problematic string
This commit is contained in:
parent
86e68bf9c0
commit
1165f04a9f
|
@ -475,6 +475,8 @@ class BPE extends TokenizerModel {
|
|||
constructor(config) {
|
||||
super(config);
|
||||
|
||||
this.BPE_SPLIT_TOKEN = ' ';
|
||||
|
||||
this.tokens_to_ids = config.vocab;
|
||||
|
||||
this.unk_token_id = this.tokens_to_ids.get(config.unk_token);
|
||||
|
@ -486,7 +488,7 @@ class BPE extends TokenizerModel {
|
|||
}
|
||||
|
||||
this.bpe_ranks = Object.fromEntries(config.merges.map((x, i) => [x, i]));
|
||||
this.merges = config.merges.map(x => x.split(/\s+/))
|
||||
this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
|
||||
|
||||
this.end_of_word_suffix = config.end_of_word_suffix;
|
||||
|
||||
|
@ -511,7 +513,7 @@ class BPE extends TokenizerModel {
|
|||
let prev_char = word[0];
|
||||
for (let i = 1; i < word.length; ++i) {
|
||||
let char = word[i];
|
||||
pairs.add(`${prev_char} ${char}`);
|
||||
pairs.add(prev_char + this.BPE_SPLIT_TOKEN + char);
|
||||
prev_char = char;
|
||||
}
|
||||
return Array.from(pairs);
|
||||
|
@ -548,7 +550,7 @@ class BPE extends TokenizerModel {
|
|||
if (!(bigram in this.bpe_ranks)) {
|
||||
break;
|
||||
}
|
||||
let [first, second] = bigram.split(/\s+/g)
|
||||
let [first, second] = bigram.split(this.BPE_SPLIT_TOKEN);
|
||||
let new_word = [];
|
||||
let i = 0;
|
||||
let j = -1;
|
||||
|
@ -579,7 +581,7 @@ class BPE extends TokenizerModel {
|
|||
pairs = this.get_pairs(word);
|
||||
}
|
||||
}
|
||||
let final_word = word.join(" ");
|
||||
let final_word = word.join(this.BPE_SPLIT_TOKEN);
|
||||
this.cache[token] = final_word;
|
||||
return final_word;
|
||||
}
|
||||
|
@ -593,7 +595,7 @@ class BPE extends TokenizerModel {
|
|||
let outputTokens = [];
|
||||
|
||||
for (let token of tokens) {
|
||||
let bpe_token_list = this.bpe(token).split(' ');
|
||||
let bpe_token_list = this.bpe(token).split(this.BPE_SPLIT_TOKEN);
|
||||
|
||||
for (let t of bpe_token_list) {
|
||||
if (this.tokens_to_ids.has(t)) {
|
||||
|
@ -801,10 +803,10 @@ class NormalizerSequence extends Normalizer {
|
|||
this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x));
|
||||
}
|
||||
/**
|
||||
* Apply a sequence of Normalizers to the input text.
|
||||
* @param {string} text The text to normalize.
|
||||
* @returns {string} The normalized text.
|
||||
*/
|
||||
* Apply a sequence of Normalizers to the input text.
|
||||
* @param {string} text The text to normalize.
|
||||
* @returns {string} The normalized text.
|
||||
*/
|
||||
normalize(text) {
|
||||
return this.normalizers.reduce((t, normalizer) => {
|
||||
return normalizer.normalize(t);
|
||||
|
@ -1758,6 +1760,9 @@ class Precompiled extends Normalizer {
|
|||
*/
|
||||
normalize(text) {
|
||||
// TODO use this.charsmap
|
||||
// For now, we just apply NFKC normalization
|
||||
// https://github.com/huggingface/tokenizers/blob/291b2e23ae81cf94738835852213ce120152d121/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L34
|
||||
text = text.normalize('NFKC');
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,6 +38,9 @@ TOKENIZER_TEST_DATA = {
|
|||
"The company was founded in 2016.",
|
||||
"test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
|
||||
"I bought an apple for $1.00 at the store.",
|
||||
"you… ",
|
||||
"\u0079\u006F\u0075\u2026\u00A0\u00A0",
|
||||
"\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
|
||||
],
|
||||
"custom": {
|
||||
"tiiuae/falcon-7b": [
|
||||
|
|
Loading…
Reference in New Issue