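"""Pretokenize a dataset and push the tokenized version to the Hugging Face Hub.

Loads the training split of a dataset, tokenizes its "content" column with a
pretrained tokenizer, records the character-to-token ratio per sample, drops
the raw columns, and uploads the result. Command-line arguments are defined
by PretokenizationArguments in the accompanying arguments.py.
"""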
import multiprocessing
import time

from arguments import PretokenizationArguments
from datasets import load_dataset

from transformers import AutoTokenizer, HfArgumentParser


def tokenize(example):
    # Tokenize the raw file contents without truncation and record the
    # character-to-token ratio as a rough measure of tokenizer efficiency.
    output = {}
    output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
    output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
    return output

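# For illustration, a single mapped example then looks roughly like
# {"input_ids": [...], "ratio_char_token": 3.2} (values hypothetical); a
# higher ratio means more characters per token, i.e. the tokenizer
# compresses that sample better.
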
parser = HfArgumentParser(PretokenizationArguments)
args = parser.parse_args()
# Fall back to one worker per CPU core when no worker count is given.
if args.num_workers is None:
    args.num_workers = multiprocessing.cpu_count()
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

t_start = time.time()
ds = load_dataset(args.dataset_name, split="train")
print(f"Dataset loaded in {time.time()-t_start:.2f}s")

# Tokenize in parallel and drop all raw columns, so only "input_ids" and
# "ratio_char_token" remain in the pushed dataset.
t_start = time.time()
ds = ds.map(
    tokenize,
    num_proc=args.num_workers,
    remove_columns=[
        "repo_name",
        "path",
        "copies",
        "size",
        "content",
        "license",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)
print(f"Dataset tokenized in {time.time()-t_start:.2f}s")

t_start = time.time()
ds.push_to_hub(args.tokenized_data_repo)
print(f"Data pushed to the hub in {time.time()-t_start:.2f}s")
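
# A hypothetical invocation, assuming HfArgumentParser exposes the
# PretokenizationArguments fields used above (tokenizer_dir, dataset_name,
# tokenized_data_repo, num_workers) as command-line flags; the script name
# and repo names below are placeholders:
#
#   python pretokenizing.py \
#       --tokenizer_dir codeparrot/codeparrot \
#       --dataset_name codeparrot/codeparrot-clean-train \
#       --tokenized_data_repo tokenized-codeparrot-train \
#       --num_workers 16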