From 480417118044f407bd5f0feca9008d1f31e82888 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 21 Jun 2023 01:21:14 +0200
Subject: [PATCH] Do not use spread operator to concatenate large arrays
 (Closes #153) (#154)

* Do not use spread operator for merging large arrays (Fix #153)

* Add unit test for encoding long strings
---
 src/tokenizers.js        | 15 ++++++++-------
 src/utils/core.js        | 10 ++++++++++
 tests/tokenizers.test.js | 10 ++++++++++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 1bbf148..0b6ae49 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -25,13 +25,14 @@ import {
     reverseDictionary,
     escapeRegExp,
     isIntegralNumber,
+    mergeArrays,
 } from './utils/core.js';
 
 import {
     getModelJSON,
 } from './utils/hub.js';
 
-import { min } from './utils/maths.js';
+import { max, min } from './utils/maths.js';
 import { Tensor } from './utils/tensor.js';
 
 /**
@@ -1188,12 +1189,12 @@ class RobertaProcessing extends PostProcessor {
      * @returns {string[]} The input tokens with the special tokens added to the beginning and end.
      */
     post_process(tokens, tokens_pair = null) {
-        tokens = [this.cls, ...tokens, this.sep]
+        tokens = mergeArrays([this.cls], tokens, [this.sep]);
 
         // NOTE: It is intended to add 2 EOS tokens after the first set of tokens
         // https://github.com/huggingface/tokenizers/issues/983
         if (tokens_pair !== null) {
-            tokens = [...tokens, this.sep, ...tokens_pair, this.sep]
+            tokens = mergeArrays(tokens, [this.sep], tokens_pair, [this.sep]);
         }
         return tokens;
     }
@@ -1233,10 +1234,10 @@ class TemplateProcessing extends PostProcessor {
 
             } else if ('Sequence' in item) {
                 if (item.Sequence.id === 'A') {
-                    toReturn.push(...tokens);
+                    toReturn = mergeArrays(toReturn, tokens);
 
                 } else if (item.Sequence.id === 'B') {
-                    toReturn.push(...tokens_pair);
+                    toReturn = mergeArrays(toReturn, tokens_pair);
                 }
             }
         }
@@ -1961,7 +1962,7 @@ export class PreTrainedTokenizer extends Callable {
             // At this point, tokens is batched: [batch_size, tokens]
             // However, array may be jagged. So, we pad to max_length
 
-            let maxLengthOfBatch = Math.max(...tokens.map(x => x.length));
+            let maxLengthOfBatch = max(tokens.map(x => x.length))[0];
 
             // If null, we calculate max length from sequences
             if (max_length === null) {
@@ -2917,7 +2918,7 @@ export class MarianTokenizer extends PreTrainedTokenizer {
         if (!this.supported_language_codes.includes(language)) {
             console.warn(`Unsupported language code "${language}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`)
         }
-        return [language, ...super._encode_text(text)]
+        return mergeArrays([language], super._encode_text(text));
     }
 }
 
diff --git a/src/utils/core.js b/src/utils/core.js
index 4ba6e0c..4a2e0e8 100644
--- a/src/utils/core.js
+++ b/src/utils/core.js
@@ -152,3 +152,13 @@ export function pop(obj, key, defaultValue = undefined) {
     }
     return defaultValue;
 }
+
+/**
+ * Efficiently merge arrays, creating a new copy.
+ * Adapted from https://stackoverflow.com/a/6768642/13989043
+ * @param {...any} arrs Arrays to merge.
+ * @returns The merged array.
+ */
+export function mergeArrays(...arrs) {
+    return Array.prototype.concat.apply([], arrs);
+}
diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js
index 62588c9..f66f7ef 100644
--- a/tests/tokenizers.test.js
+++ b/tests/tokenizers.test.js
@@ -33,3 +33,13 @@ describe('Tokenizers', () => {
         });
     }
 });
+
+describe('Edge cases', () => {
+    it('should not crash when encoding a very long string', async () => {
+        let tokenizer = await AutoTokenizer.from_pretrained('t5-small');
+
+        let text = String.prototype.repeat.call('Hello world! ', 50000);
+        let encoded = await tokenizer(text);
+        expect(encoded.input_ids.data.length).toBeGreaterThan(100000);
+    });
+});
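
Reviewer note (not part of the patch): the spread operator fails here because `[...a, ...b]`, `arr.push(...items)`, and `Math.max(...arr)` all expand every element into a separate function argument, and JavaScript engines cap the number of arguments per call (on the order of tens of thousands, engine-dependent), so sufficiently long token sequences throw a `RangeError`. `Array.prototype.concat.apply([], arrs)` instead passes one argument per array, so the argument count stays small regardless of how many elements the arrays hold.

A minimal standalone sketch of the failure and the fix, runnable in Node.js (the array length is illustrative, not taken from the patch):

    // Spread expands each element into its own function argument,
    // which exceeds the engine's argument limit for large arrays.
    const big = new Array(1_000_000).fill(0);

    try {
        const out = [];
        out.push(...big); // throws RangeError on most engines
    } catch (e) {
        console.log('spread failed:', e.message);
    }

    // concat.apply receives one argument per *array*, so the argument
    // count equals the number of arrays, not the number of elements.
    function mergeArrays(...arrs) {
        return Array.prototype.concat.apply([], arrs);
    }

    const merged = mergeArrays([1], big, [2]);
    console.log(merged.length); // 1000002

The same reasoning explains swapping `Math.max(...tokens.map(x => x.length))` for the `max` helper from `./utils/maths.js`: a helper that iterates over the array avoids spreading its elements into a single call.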