57 lines
2.0 KiB
Python
57 lines
2.0 KiB
Python
# coding=utf-8
|
|
# Copyright 2019-present, the HuggingFace Inc. team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
Preprocessing script before training the distilled model.
|
|
"""
|
|
import argparse
|
|
import logging
|
|
import pickle
|
|
from collections import Counter
|
|
|
|
|
|
logging.basicConfig(
|
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(
|
|
description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
|
|
)
|
|
parser.add_argument(
|
|
"--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
|
|
)
|
|
parser.add_argument(
|
|
"--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
|
|
)
|
|
parser.add_argument("--vocab_size", default=30522, type=int)
|
|
args = parser.parse_args()
|
|
|
|
logger.info(f"Loading data from {args.data_file}")
|
|
with open(args.data_file, "rb") as fp:
|
|
data = pickle.load(fp)
|
|
|
|
logger.info("Counting occurrences for MLM.")
|
|
counter = Counter()
|
|
for tk_ids in data:
|
|
counter.update(tk_ids)
|
|
counts = [0] * args.vocab_size
|
|
for k, v in counter.items():
|
|
counts[k] = v
|
|
|
|
logger.info(f"Dump to {args.token_counts_dump}")
|
|
with open(args.token_counts_dump, "wb") as handle:
|
|
pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
|