transformers/notebooks/Comparing-PT-and-TF-models....

1631 lines
90 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyTorch to TensorFlow Conversion Test Notebook\n",
"\n",
"To run this notebook follow these steps, modifying the **Config** section as necessary:\n",
"\n",
"1. Point `pt_model_dir` to your local directory containing the PyTorch BERT model to be converted.\n",
"2. Point `tf_bert_dir` to your clone of Google's BERT implementation, which can be found here: https://github.com/google-research/bert.\n",
"\n",
"Notes:\n",
"1. This feature currently only supports the base BERT models (uncased/cased).\n",
"2. The TensorFlow model will be dumped in `tf_model_dir`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Config"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"model_cls = 'BertModel'\n",
"model_typ = 'bert-base-uncased'\n",
"token_cls = 'BertTokenizer'\n",
"max_seq = 12\n",
"CLS = \"[CLS]\"\n",
"SEP = \"[SEP]\"\n",
"MASK = \"[MASK]\"\n",
"CLS_IDX = 0\n",
"layer_idxs = tuple(range(12))\n",
"input_text = \"jim henson was a puppeteer\"\n",
"\n",
"pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n",
"tf_bert_dir = \"/home/ubuntu/bert\"\n",
"\n",
"pt_vocab_file = os.path.join(pt_model_dir, \"vocab.txt\")\n",
"pt_init_ckpt = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n",
"tf_model_dir = os.path.join(pt_model_dir, 'tf')\n",
"tf_vocab_file = os.path.join(tf_model_dir, \"vocab.txt\")\n",
"tf_init_ckpt = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n",
"tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n",
"\n",
"if not os.path.isdir(tf_model_dir): \n",
" os.makedirs(tf_model_dir, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenization"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text, tokenizer):\n",
" text = text.strip().lower()\n",
" tok_ids = tokenizer.tokenize(text)\n",
" if len(tok_ids) > max_seq - 2:\n",
" tok_ids = tok_ids[:max_seq - 2]\n",
" tok_ids.insert(CLS_IDX, CLS)\n",
" tok_ids.append(SEP)\n",
" input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n",
" mask_ids = [1] * len(input_ids)\n",
" seg_ids = [0] * len(input_ids)\n",
" padding = [0] * (max_seq - len(input_ids))\n",
" input_ids += padding\n",
" mask_ids += padding\n",
" seg_ids += padding\n",
" return input_ids, mask_ids, seg_ids"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PyTorch execution"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n",
"100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pytorch embedding shape: (1, 768)\n"
]
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"from pytorch_pretrained_bert import (BertConfig,\n",
" BertModel, \n",
" BertTokenizer, \n",
" BertForSequenceClassification)\n",
"\n",
"# Save Vocab\n",
"pt_tokenizer = BertTokenizer.from_pretrained(\n",
" pretrained_model_name_or_path=model_typ, \n",
" cache_dir=pt_model_dir)\n",
"pt_tokenizer.save_vocabulary(pt_model_dir)\n",
"pt_tokenizer.save_vocabulary(tf_model_dir)\n",
"\n",
"# Save Model\n",
"pt_model = BertModel.from_pretrained(\n",
" pretrained_model_name_or_path=model_typ, \n",
" cache_dir=pt_model_dir).to('cpu')\n",
"pt_model.eval()\n",
"pt_model.config.hidden_dropout_prob = 0.0\n",
"pt_model.config.attention_probs_dropout_prob = 0.0\n",
"pt_model.config.to_json_file(tf_config_file)\n",
"torch.save(pt_model.state_dict(), pt_init_ckpt)\n",
"\n",
"# Inputs\n",
"input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n",
"\n",
"# PT Embedding\n",
"tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n",
"seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n",
"msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n",
"attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n",
"pt_embedding = nsp_logits.detach().numpy() \n",
"print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PyTorch &rarr; TensorFlow conversion"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Colocations handled automatically by placer.\n",
"bert/embeddings/word_embeddings initialized\n",
"bert/embeddings/position_embeddings initialized\n",
"bert/embeddings/token_type_embeddings initialized\n",
"bert/embeddings/LayerNorm/gamma initialized\n",
"bert/embeddings/LayerNorm/beta initialized\n",
"bert/encoder/layer_0/attention/self/query/kernel initialized\n",
"bert/encoder/layer_0/attention/self/query/bias initialized\n",
"bert/encoder/layer_0/attention/self/key/kernel initialized\n",
"bert/encoder/layer_0/attention/self/key/bias initialized\n",
"bert/encoder/layer_0/attention/self/value/kernel initialized\n",
"bert/encoder/layer_0/attention/self/value/bias initialized\n",
"bert/encoder/layer_0/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_0/attention/output/dense/bias initialized\n",
"bert/encoder/layer_0/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_0/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_0/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_0/intermediate/dense/bias initialized\n",
"bert/encoder/layer_0/output/dense/kernel initialized\n",
"bert/encoder/layer_0/output/dense/bias initialized\n",
"bert/encoder/layer_0/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_0/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_1/attention/self/query/kernel initialized\n",
"bert/encoder/layer_1/attention/self/query/bias initialized\n",
"bert/encoder/layer_1/attention/self/key/kernel initialized\n",
"bert/encoder/layer_1/attention/self/key/bias initialized\n",
"bert/encoder/layer_1/attention/self/value/kernel initialized\n",
"bert/encoder/layer_1/attention/self/value/bias initialized\n",
"bert/encoder/layer_1/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_1/attention/output/dense/bias initialized\n",
"bert/encoder/layer_1/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_1/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_1/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_1/intermediate/dense/bias initialized\n",
"bert/encoder/layer_1/output/dense/kernel initialized\n",
"bert/encoder/layer_1/output/dense/bias initialized\n",
"bert/encoder/layer_1/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_1/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_2/attention/self/query/kernel initialized\n",
"bert/encoder/layer_2/attention/self/query/bias initialized\n",
"bert/encoder/layer_2/attention/self/key/kernel initialized\n",
"bert/encoder/layer_2/attention/self/key/bias initialized\n",
"bert/encoder/layer_2/attention/self/value/kernel initialized\n",
"bert/encoder/layer_2/attention/self/value/bias initialized\n",
"bert/encoder/layer_2/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_2/attention/output/dense/bias initialized\n",
"bert/encoder/layer_2/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_2/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_2/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_2/intermediate/dense/bias initialized\n",
"bert/encoder/layer_2/output/dense/kernel initialized\n",
"bert/encoder/layer_2/output/dense/bias initialized\n",
"bert/encoder/layer_2/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_2/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_3/attention/self/query/kernel initialized\n",
"bert/encoder/layer_3/attention/self/query/bias initialized\n",
"bert/encoder/layer_3/attention/self/key/kernel initialized\n",
"bert/encoder/layer_3/attention/self/key/bias initialized\n",
"bert/encoder/layer_3/attention/self/value/kernel initialized\n",
"bert/encoder/layer_3/attention/self/value/bias initialized\n",
"bert/encoder/layer_3/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_3/attention/output/dense/bias initialized\n",
"bert/encoder/layer_3/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_3/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_3/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_3/intermediate/dense/bias initialized\n",
"bert/encoder/layer_3/output/dense/kernel initialized\n",
"bert/encoder/layer_3/output/dense/bias initialized\n",
"bert/encoder/layer_3/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_3/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_4/attention/self/query/kernel initialized\n",
"bert/encoder/layer_4/attention/self/query/bias initialized\n",
"bert/encoder/layer_4/attention/self/key/kernel initialized\n",
"bert/encoder/layer_4/attention/self/key/bias initialized\n",
"bert/encoder/layer_4/attention/self/value/kernel initialized\n",
"bert/encoder/layer_4/attention/self/value/bias initialized\n",
"bert/encoder/layer_4/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_4/attention/output/dense/bias initialized\n",
"bert/encoder/layer_4/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_4/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_4/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_4/intermediate/dense/bias initialized\n",
"bert/encoder/layer_4/output/dense/kernel initialized\n",
"bert/encoder/layer_4/output/dense/bias initialized\n",
"bert/encoder/layer_4/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_4/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_5/attention/self/query/kernel initialized\n",
"bert/encoder/layer_5/attention/self/query/bias initialized\n",
"bert/encoder/layer_5/attention/self/key/kernel initialized\n",
"bert/encoder/layer_5/attention/self/key/bias initialized\n",
"bert/encoder/layer_5/attention/self/value/kernel initialized\n",
"bert/encoder/layer_5/attention/self/value/bias initialized\n",
"bert/encoder/layer_5/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_5/attention/output/dense/bias initialized\n",
"bert/encoder/layer_5/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_5/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_5/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_5/intermediate/dense/bias initialized\n",
"bert/encoder/layer_5/output/dense/kernel initialized\n",
"bert/encoder/layer_5/output/dense/bias initialized\n",
"bert/encoder/layer_5/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_5/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_6/attention/self/query/kernel initialized\n",
"bert/encoder/layer_6/attention/self/query/bias initialized\n",
"bert/encoder/layer_6/attention/self/key/kernel initialized\n",
"bert/encoder/layer_6/attention/self/key/bias initialized\n",
"bert/encoder/layer_6/attention/self/value/kernel initialized\n",
"bert/encoder/layer_6/attention/self/value/bias initialized\n",
"bert/encoder/layer_6/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_6/attention/output/dense/bias initialized\n",
"bert/encoder/layer_6/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_6/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_6/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_6/intermediate/dense/bias initialized\n",
"bert/encoder/layer_6/output/dense/kernel initialized\n",
"bert/encoder/layer_6/output/dense/bias initialized\n",
"bert/encoder/layer_6/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_6/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_7/attention/self/query/kernel initialized\n",
"bert/encoder/layer_7/attention/self/query/bias initialized\n",
"bert/encoder/layer_7/attention/self/key/kernel initialized\n",
"bert/encoder/layer_7/attention/self/key/bias initialized\n",
"bert/encoder/layer_7/attention/self/value/kernel initialized\n",
"bert/encoder/layer_7/attention/self/value/bias initialized\n",
"bert/encoder/layer_7/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_7/attention/output/dense/bias initialized\n",
"bert/encoder/layer_7/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_7/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_7/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_7/intermediate/dense/bias initialized\n",
"bert/encoder/layer_7/output/dense/kernel initialized\n",
"bert/encoder/layer_7/output/dense/bias initialized\n",
"bert/encoder/layer_7/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_7/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_8/attention/self/query/kernel initialized\n",
"bert/encoder/layer_8/attention/self/query/bias initialized\n",
"bert/encoder/layer_8/attention/self/key/kernel initialized\n",
"bert/encoder/layer_8/attention/self/key/bias initialized\n",
"bert/encoder/layer_8/attention/self/value/kernel initialized\n",
"bert/encoder/layer_8/attention/self/value/bias initialized\n",
"bert/encoder/layer_8/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_8/attention/output/dense/bias initialized\n",
"bert/encoder/layer_8/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_8/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_8/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_8/intermediate/dense/bias initialized\n",
"bert/encoder/layer_8/output/dense/kernel initialized\n",
"bert/encoder/layer_8/output/dense/bias initialized\n",
"bert/encoder/layer_8/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_8/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_9/attention/self/query/kernel initialized\n",
"bert/encoder/layer_9/attention/self/query/bias initialized\n",
"bert/encoder/layer_9/attention/self/key/kernel initialized\n",
"bert/encoder/layer_9/attention/self/key/bias initialized\n",
"bert/encoder/layer_9/attention/self/value/kernel initialized\n",
"bert/encoder/layer_9/attention/self/value/bias initialized\n",
"bert/encoder/layer_9/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_9/attention/output/dense/bias initialized\n",
"bert/encoder/layer_9/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_9/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_9/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_9/intermediate/dense/bias initialized\n",
"bert/encoder/layer_9/output/dense/kernel initialized\n",
"bert/encoder/layer_9/output/dense/bias initialized\n",
"bert/encoder/layer_9/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_9/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_10/attention/self/query/kernel initialized\n",
"bert/encoder/layer_10/attention/self/query/bias initialized\n",
"bert/encoder/layer_10/attention/self/key/kernel initialized\n",
"bert/encoder/layer_10/attention/self/key/bias initialized\n",
"bert/encoder/layer_10/attention/self/value/kernel initialized\n",
"bert/encoder/layer_10/attention/self/value/bias initialized\n",
"bert/encoder/layer_10/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_10/attention/output/dense/bias initialized\n",
"bert/encoder/layer_10/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_10/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_10/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_10/intermediate/dense/bias initialized\n",
"bert/encoder/layer_10/output/dense/kernel initialized\n",
"bert/encoder/layer_10/output/dense/bias initialized\n",
"bert/encoder/layer_10/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_10/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_11/attention/self/query/kernel initialized\n",
"bert/encoder/layer_11/attention/self/query/bias initialized\n",
"bert/encoder/layer_11/attention/self/key/kernel initialized\n",
"bert/encoder/layer_11/attention/self/key/bias initialized\n",
"bert/encoder/layer_11/attention/self/value/kernel initialized\n",
"bert/encoder/layer_11/attention/self/value/bias initialized\n",
"bert/encoder/layer_11/attention/output/dense/kernel initialized\n",
"bert/encoder/layer_11/attention/output/dense/bias initialized\n",
"bert/encoder/layer_11/attention/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_11/attention/output/LayerNorm/beta initialized\n",
"bert/encoder/layer_11/intermediate/dense/kernel initialized\n",
"bert/encoder/layer_11/intermediate/dense/bias initialized\n",
"bert/encoder/layer_11/output/dense/kernel initialized\n",
"bert/encoder/layer_11/output/dense/bias initialized\n",
"bert/encoder/layer_11/output/LayerNorm/gamma initialized\n",
"bert/encoder/layer_11/output/LayerNorm/beta initialized\n",
"bert/pooler/dense/kernel initialized\n",
"bert/pooler/dense/bias initialized\n"
]
}
],
"source": [
"from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n",
"\n",
"main([\n",
" '--model_name', model_typ, \n",
" '--pytorch_model_path', pt_init_ckpt,\n",
" '--tf_cache_dir', tf_model_dir,\n",
" '--cache_dir', pt_model_dir\n",
"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TensorFlow execution"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
"For more information, please see:\n",
" * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
" * https://github.com/tensorflow/addons\n",
"If you depend on functionality not listed there, please file an issue.\n",
"\n",
"WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use keras.layers.dense instead.\n",
"WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use standard file APIs to check for files with this prefix.\n",
"INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n",
"Tensorflow embedding shape: (1, 768)\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"sys.path.insert(0, tf_bert_dir)\n",
"import modeling\n",
"import tokenization\n",
"\n",
"tf.reset_default_graph()\n",
"\n",
"# Process text\n",
"tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n",
"\n",
"# Graph inputs\n",
"input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n",
"config = modeling.BertConfig.from_json_file(\n",
" os.path.join(tf_model_dir, 'bert_config.json'))\n",
"input_tensor = tf.placeholder(\n",
" dtype=tf.int32,\n",
" shape=[1, None],\n",
" name='input_ids')\n",
"mask_tensor = tf.placeholder(\n",
" dtype=tf.int32,\n",
" shape=[1, None],\n",
" name='mask_ids')\n",
"seg_tensor = tf.placeholder(\n",
" dtype=tf.int32,\n",
" shape=[1, None],\n",
" name='seg_ids')\n",
"tf_model = modeling.BertModel(\n",
" config=config,\n",
" is_training=False,\n",
" input_ids=input_tensor,\n",
" input_mask=mask_tensor,\n",
" token_type_ids=seg_tensor,\n",
" use_one_hot_embeddings=False)\n",
"output_layer = tf_model.get_pooled_output()\n",
"\n",
"# Load tf model\n",
"session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
"vars_to_load = [v for v in tf.global_variables()]\n",
"session.run(tf.variables_initializer(var_list=vars_to_load))\n",
"saver = tf.train.Saver(vars_to_load)\n",
"saver.restore(session, save_path=tf_init_ckpt)\n",
"\n",
"# TF Embedding\n",
"fetches = output_layer\n",
"feed_dict = {\n",
" input_tensor: [input_ids_tf],\n",
" mask_tensor: [mask_ids_tf],\n",
" seg_tensor: [seg_ids_tf]\n",
"}\n",
"tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n",
"print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare Tokenization"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
"TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
"SEG_IDS_PT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"SEG_IDS_TF: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"MASK_IDS_PT: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
"MASK_IDS_TF: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n"
]
}
],
"source": [
"print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n",
"print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n",
"print(\"SEG_IDS_PT: {}\".format(seg_ids_pt))\n",
"print(\"SEG_IDS_TF: {}\".format(seg_ids_tf))\n",
"print(\"MASK_IDS_PT: {}\".format(mask_ids_pt))\n",
"print(\"MASK_IDS_TF: {}\".format(mask_ids_tf))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare Model Weights"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bert/embeddings/word_embeddings\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608 0.00116716]\n",
"TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608 0.00116716]\n",
"\n",
"bert/embeddings/token_type_embeddings\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
"TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
"\n",
"bert/embeddings/position_embeddings\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613 0.00797095]\n",
"TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613 0.00797095]\n",
"\n",
"bert/embeddings/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.02591471 -0.0195513 0.02423946 0.08904593 -0.06281059]\n",
"TF: shape: (768,) values: [-0.02591471 -0.0195513 0.02423946 0.08904593 -0.06281059]\n",
"\n",
"bert/embeddings/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.9260566 0.8851115 0.85807985 0.8616906 0.8937205 ]\n",
"TF: shape: (768,) values: [0.9260566 0.8851115 0.85807985 0.8616906 0.8937205 ]\n",
"\n",
"bert/encoder/layer_0/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01640572 -0.03257025 0.01046295 -0.04442816 -0.02256124]\n",
"TF: shape: (768, 768) values: [-0.01640572 -0.03257025 0.01046295 -0.04442816 -0.02256124]\n",
"\n",
"bert/encoder/layer_0/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.58488506 -0.3312432 -0.43010172 0.37446147 -0.29811692]\n",
"TF: shape: (768,) values: [ 0.58488506 -0.3312432 -0.43010172 0.37446147 -0.29811692]\n",
"\n",
"bert/encoder/layer_0/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.00807745 0.02652155 -0.01866494 0.01797846 0.00450485]\n",
"TF: shape: (768, 768) values: [ 0.00807745 0.02652155 -0.01866494 0.01797846 0.00450485]\n",
"\n",
"bert/encoder/layer_0/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00104306 0.00035106 -0.0024626 -0.00010567 -0.00119283]\n",
"TF: shape: (768,) values: [ 0.00104306 0.00035106 -0.0024626 -0.00010567 -0.00119283]\n",
"\n",
"bert/encoder/layer_0/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.01144261 -0.02663044 0.01911472 -0.02206182 -0.00287949]\n",
"TF: shape: (768, 768) values: [ 0.01144261 -0.02663044 0.01911472 -0.02206182 -0.00287949]\n",
"\n",
"bert/encoder/layer_0/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847 0.01736802 0.00449983]\n",
"TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847 0.01736802 0.00449983]\n",
"\n",
"bert/encoder/layer_0/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.00581949 0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
"TF: shape: (768, 768) values: [ 0.00581949 0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
"\n",
"bert/encoder/layer_0/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00511063 -0.0166625 0.02812938 -0.01166061 0.01942627]\n",
"TF: shape: (768,) values: [ 0.00511063 -0.0166625 0.02812938 -0.01166061 0.01942627]\n",
"\n",
"bert/encoder/layer_0/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697 -0.38847703 0.36841765]\n",
"TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697 -0.38847703 0.36841765]\n",
"\n",
"bert/encoder/layer_0/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.9803408 0.959969 0.96368986 0.9603653 0.9801324 ]\n",
"TF: shape: (768,) values: [0.9803408 0.959969 0.96368986 0.9603653 0.9801324 ]\n",
"\n",
"bert/encoder/layer_0/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.01010427 -0.060398 -0.01468864 0.00311493 0.02862451]\n",
"TF: shape: (768, 3072) values: [-0.01010427 -0.060398 -0.01468864 0.00311493 0.02862451]\n",
"\n",
"bert/encoder/layer_0/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036 -0.06369043]\n",
"TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036 -0.06369043]\n",
"\n",
"bert/encoder/layer_0/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.03710171 0.0648794 0.00758566 -0.05224452 -0.04348791]\n",
"TF: shape: (3072, 768) values: [-0.03710171 0.0648794 0.00758566 -0.05224452 -0.04348791]\n",
"\n",
"bert/encoder/layer_0/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.04801027 0.19766568 0.02154854 0.02880666 0.0444298 ]\n",
"TF: shape: (768,) values: [-0.04801027 0.19766568 0.02154854 0.02880666 0.0444298 ]\n",
"\n",
"bert/encoder/layer_0/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.10142924 -0.00499344 0.04274083 0.09324206 -0.10700516]\n",
"TF: shape: (768,) values: [-0.10142924 -0.00499344 0.04274083 0.09324206 -0.10700516]\n",
"\n",
"bert/encoder/layer_0/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.7835125 0.8072406 0.7670588 0.73706394 0.76303864]\n",
"TF: shape: (768,) values: [0.7835125 0.8072406 0.7670588 0.73706394 0.76303864]\n",
"\n",
"bert/encoder/layer_1/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582 0.0655639 -0.00337808]\n",
"TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582 0.0655639 -0.00337808]\n",
"\n",
"bert/encoder/layer_1/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.27827993 0.17387655 -0.2497937 -0.8809636 0.41262135]\n",
"TF: shape: (768,) values: [-0.27827993 0.17387655 -0.2497937 -0.8809636 0.41262135]\n",
"\n",
"bert/encoder/layer_1/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.03353037 0.04007257 0.05320328 -0.02166729 -0.03581231]\n",
"TF: shape: (768, 768) values: [-0.03353037 0.04007257 0.05320328 -0.02166729 -0.03581231]\n",
"\n",
"bert/encoder/layer_1/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00504407 0.00136887 -0.00394336 0.00646125 -0.00148919]\n",
"TF: shape: (768,) values: [-0.00504407 0.00136887 -0.00394336 0.00646125 -0.00148919]\n",
"\n",
"bert/encoder/layer_1/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00464159 0.06674305 -0.00970626 -0.0276653 -0.01597566]\n",
"TF: shape: (768, 768) values: [-0.00464159 0.06674305 -0.00970626 -0.0276653 -0.01597566]\n",
"\n",
"bert/encoder/layer_1/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00381288 0.02650839 -0.0059689 -0.00508269 -0.01293722]\n",
"TF: shape: (768,) values: [ 0.00381288 0.02650839 -0.0059689 -0.00508269 -0.01293722]\n",
"\n",
"bert/encoder/layer_1/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01390745 -0.01100563 0.01303005 -0.01969771 0.0125082 ]\n",
"TF: shape: (768, 768) values: [-0.01390745 -0.01100563 0.01303005 -0.01969771 0.0125082 ]\n",
"\n",
"bert/encoder/layer_1/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
"TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
"\n",
"bert/encoder/layer_1/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.08583715 0.14199966 -0.0856637 -0.18797271 0.21056814]\n",
"TF: shape: (768,) values: [ 0.08583715 0.14199966 -0.0856637 -0.18797271 0.21056814]\n",
"\n",
"bert/encoder/layer_1/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.896962 0.87148863 0.8531161 0.8690647 0.9488987 ]\n",
"TF: shape: (768,) values: [0.896962 0.87148863 0.8531161 0.8690647 0.9488987 ]\n",
"\n",
"bert/encoder/layer_1/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
"TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
"\n",
"bert/encoder/layer_1/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
"TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
"\n",
"bert/encoder/layer_1/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.02372648 0.03326349 0.08291997 -0.01519038 0.01868557]\n",
"TF: shape: (3072, 768) values: [-0.02372648 0.03326349 0.08291997 -0.01519038 0.01868557]\n",
"\n",
"bert/encoder/layer_1/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.02514724 0.09868994 -0.027811 0.03749462 0.01086514]\n",
"TF: shape: (768,) values: [-0.02514724 0.09868994 -0.027811 0.03749462 0.01086514]\n",
"\n",
"bert/encoder/layer_1/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.07662535 -0.10506564 0.03191236 0.07633785 -0.11187791]\n",
"TF: shape: (768,) values: [-0.07662535 -0.10506564 0.03191236 0.07633785 -0.11187791]\n",
"\n",
"bert/encoder/layer_1/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.9017883 0.8868776 0.8862677 0.85865664 0.87496454]\n",
"TF: shape: (768,) values: [0.9017883 0.8868776 0.8862677 0.85865664 0.87496454]\n",
"\n",
"bert/encoder/layer_2/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.08433672 0.09580533 0.07543895 -0.01126779 -0.01354045]\n",
"TF: shape: (768, 768) values: [ 0.08433672 0.09580533 0.07543895 -0.01126779 -0.01354045]\n",
"\n",
"bert/encoder/layer_2/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.0371241 0.03406003 0.27713948 -0.21613775 -0.05275448]\n",
"TF: shape: (768,) values: [ 0.0371241 0.03406003 0.27713948 -0.21613775 -0.05275448]\n",
"\n",
"bert/encoder/layer_2/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.04794507 0.02517631 -0.01319554 -0.02094732 0.09073472]\n",
"TF: shape: (768, 768) values: [ 0.04794507 0.02517631 -0.01319554 -0.02094732 0.09073472]\n",
"\n",
"bert/encoder/layer_2/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741 0.00037122]\n",
"TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741 0.00037122]\n",
"\n",
"bert/encoder/layer_2/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914 0.04746444 0.00428481]\n",
"TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914 0.04746444 0.00428481]\n",
"\n",
"bert/encoder/layer_2/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.02728729 0.04979054 0.08326469 0.04150949 0.600959 ]\n",
"TF: shape: (768,) values: [-0.02728729 0.04979054 0.08326469 0.04150949 0.600959 ]\n",
"\n",
"bert/encoder/layer_2/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.00517425 0.01197957 0.0393172 -0.0063884 -0.02673388]\n",
"TF: shape: (768, 768) values: [ 0.00517425 0.01197957 0.0393172 -0.0063884 -0.02673388]\n",
"\n",
"bert/encoder/layer_2/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.01754025 0.1226335 -0.05733554 0.06844623 0.00879776]\n",
"TF: shape: (768,) values: [ 0.01754025 0.1226335 -0.05733554 0.06844623 0.00879776]\n",
"\n",
"bert/encoder/layer_2/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.1490809 0.12386955 -0.19382021 -0.26515856 0.32723007]\n",
"TF: shape: (768,) values: [ 0.1490809 0.12386955 -0.19382021 -0.26515856 0.32723007]\n",
"\n",
"bert/encoder/layer_2/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8983343 0.88877076 0.86283594 0.8584952 0.9587886 ]\n",
"TF: shape: (768,) values: [0.8983343 0.88877076 0.86283594 0.8584952 0.9587886 ]\n",
"\n",
"bert/encoder/layer_2/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.01619919 0.00662888 0.01492284 -0.01280748 0.01318596]\n",
"TF: shape: (768, 3072) values: [-0.01619919 0.00662888 0.01492284 -0.01280748 0.01318596]\n",
"\n",
"bert/encoder/layer_2/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
"TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
"\n",
"bert/encoder/layer_2/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.07225161 -0.0129784 0.00618811 -0.01593373 -0.02160194]\n",
"TF: shape: (3072, 768) values: [-0.07225161 -0.0129784 0.00618811 -0.01593373 -0.02160194]\n",
"\n",
"bert/encoder/layer_2/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06319264 0.06169628 -0.03041368 0.00924282 0.06277442]\n",
"TF: shape: (768,) values: [-0.06319264 0.06169628 -0.03041368 0.00924282 0.06277442]\n",
"\n",
"bert/encoder/layer_2/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.1139038 -0.11665309 0.07883061 0.07796711 -0.14219187]\n",
"TF: shape: (768,) values: [-0.1139038 -0.11665309 0.07883061 0.07796711 -0.14219187]\n",
"\n",
"bert/encoder/layer_2/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8813261 0.85744697 0.8511922 0.85261875 0.8329574 ]\n",
"TF: shape: (768,) values: [0.8813261 0.85744697 0.8511922 0.85261875 0.8329574 ]\n",
"\n",
"bert/encoder/layer_3/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963 0.04117409 -0.07591715]\n",
"TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963 0.04117409 -0.07591715]\n",
"\n",
"bert/encoder/layer_3/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.09740101 -0.19290674 0.04332267 0.17937997 -0.08023558]\n",
"TF: shape: (768,) values: [ 0.09740101 -0.19290674 0.04332267 0.17937997 -0.08023558]\n",
"\n",
"bert/encoder/layer_3/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.02562077 0.02507281 -0.03361562 0.05613289 -0.05435724]\n",
"TF: shape: (768, 768) values: [ 0.02562077 0.02507281 -0.03361562 0.05613289 -0.05435724]\n",
"\n",
"bert/encoder/layer_3/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415 0.00969649 -0.00094182]\n",
"TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415 0.00969649 -0.00094182]\n",
"\n",
"bert/encoder/layer_3/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00539032 0.00959642 0.01325458 0.00490616 0.0129908 ]\n",
"TF: shape: (768, 768) values: [-0.00539032 0.00959642 0.01325458 0.00490616 0.0129908 ]\n",
"\n",
"bert/encoder/layer_3/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
"TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
"\n",
"bert/encoder/layer_3/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.01850341 0.03148198 0.02705758 -0.0004669 0.01367511]\n",
"TF: shape: (768, 768) values: [ 0.01850341 0.03148198 0.02705758 -0.0004669 0.01367511]\n",
"\n",
"bert/encoder/layer_3/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.01981483 0.03566506 -0.05016088 0.02958186 0.04989756]\n",
"TF: shape: (768,) values: [ 0.01981483 0.03566506 -0.05016088 0.02958186 0.04989756]\n",
"\n",
"bert/encoder/layer_3/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.09815404 0.00063774 -0.01257733 -0.26485074 0.22568701]\n",
"TF: shape: (768,) values: [ 0.09815404 0.00063774 -0.01257733 -0.26485074 0.22568701]\n",
"\n",
"bert/encoder/layer_3/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887 0.84203583 0.95247847]\n",
"TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887 0.84203583 0.95247847]\n",
"\n",
"bert/encoder/layer_3/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.02733567 0.03307878 -0.01331292 -0.00032527 0.03252084]\n",
"TF: shape: (768, 3072) values: [-0.02733567 0.03307878 -0.01331292 -0.00032527 0.03252084]\n",
"\n",
"bert/encoder/layer_3/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971 0.01335877 -0.09492484]\n",
"TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971 0.01335877 -0.09492484]\n",
"\n",
"bert/encoder/layer_3/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.01751153 0.01631314 -0.02660011 0.03569947 -0.01394763]\n",
"TF: shape: (3072, 768) values: [-0.01751153 0.01631314 -0.02660011 0.03569947 -0.01394763]\n",
"\n",
"bert/encoder/layer_3/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03873252 0.08414765 -0.0399323 0.01997361 0.12924597]\n",
"TF: shape: (768,) values: [-0.03873252 0.08414765 -0.0399323 0.01997361 0.12924597]\n",
"\n",
"bert/encoder/layer_3/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155 0.05231095 -0.09717073]\n",
"TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155 0.05231095 -0.09717073]\n",
"\n",
"bert/encoder/layer_3/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.827748 0.83012533 0.82399255 0.81772 0.80794513]\n",
"TF: shape: (768,) values: [0.827748 0.83012533 0.82399255 0.81772 0.80794513]\n",
"\n",
"bert/encoder/layer_4/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.08296382 0.02076941 0.06525186 -0.02659729 0.03491377]\n",
"TF: shape: (768, 768) values: [ 0.08296382 0.02076941 0.06525186 -0.02659729 0.03491377]\n",
"\n",
"bert/encoder/layer_4/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146 0.00061329 0.1248519 ]\n",
"TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146 0.00061329 0.1248519 ]\n",
"\n",
"bert/encoder/layer_4/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.06941643 0.08133814 -0.0453992 0.0668715 -0.06014847]\n",
"TF: shape: (768, 768) values: [ 0.06941643 0.08133814 -0.0453992 0.0668715 -0.06014847]\n",
"\n",
"bert/encoder/layer_4/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00588725 -0.00235185 0.00281131 0.00173088 -0.00546653]\n",
"TF: shape: (768,) values: [-0.00588725 -0.00235185 0.00281131 0.00173088 -0.00546653]\n",
"\n",
"bert/encoder/layer_4/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.06889665 0.06645385 0.01232084 0.0132611 -0.01595679]\n",
"TF: shape: (768, 768) values: [ 0.06889665 0.06645385 0.01232084 0.0132611 -0.01595679]\n",
"\n",
"bert/encoder/layer_4/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.01126871 -0.02704018 0.0301532 0.02332082 -0.04233487]\n",
"TF: shape: (768,) values: [-0.01126871 -0.02704018 0.0301532 0.02332082 -0.04233487]\n",
"\n",
"bert/encoder/layer_4/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292 0.04862929 -0.0442014 ]\n",
"TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292 0.04862929 -0.0442014 ]\n",
"\n",
"bert/encoder/layer_4/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.03054528 0.00479777 -0.02729505 -0.0325212 -0.00525727]\n",
"TF: shape: (768,) values: [ 0.03054528 0.00479777 -0.02729505 -0.0325212 -0.00525727]\n",
"\n",
"bert/encoder/layer_4/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00903359 0.0052285 -0.02841488 -0.22355485 0.28281343]\n",
"TF: shape: (768,) values: [ 0.00903359 0.0052285 -0.02841488 -0.22355485 0.28281343]\n",
"\n",
"bert/encoder/layer_4/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8849676 0.86927813 0.8114595 0.80269504 0.94864094]\n",
"TF: shape: (768,) values: [0.8849676 0.86927813 0.8114595 0.80269504 0.94864094]\n",
"\n",
"bert/encoder/layer_4/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.00639783 0.06198016 -0.03184223 0.00485356 -0.02453273]\n",
"TF: shape: (768, 3072) values: [-0.00639783 0.06198016 -0.03184223 0.00485356 -0.02453273]\n",
"\n",
"bert/encoder/layer_4/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
"TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
"\n",
"bert/encoder/layer_4/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.05421264 0.0221118 -0.02674172 0.03672203 -0.02399626]\n",
"TF: shape: (3072, 768) values: [-0.05421264 0.0221118 -0.02674172 0.03672203 -0.02399626]\n",
"\n",
"bert/encoder/layer_4/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.05068972 0.04838871 0.01156022 0.05381602 0.08857913]\n",
"TF: shape: (768,) values: [-0.05068972 0.04838871 0.01156022 0.05381602 0.08857913]\n",
"\n",
"bert/encoder/layer_4/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.04338909 -0.0781464 -0.01518662 0.04936362 -0.12378412]\n",
"TF: shape: (768,) values: [-0.04338909 -0.0781464 -0.01518662 0.04936362 -0.12378412]\n",
"\n",
"bert/encoder/layer_4/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
"TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
"\n",
"bert/encoder/layer_5/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00858843 -0.03920127 0.02552994 -0.02786552 0.02436485]\n",
"TF: shape: (768, 768) values: [-0.00858843 -0.03920127 0.02552994 -0.02786552 0.02436485]\n",
"\n",
"bert/encoder/layer_5/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079 0.01085692 0.02925887]\n",
"TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079 0.01085692 0.02925887]\n",
"\n",
"bert/encoder/layer_5/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.00352847 0.02330176 -0.00369894 -0.03904612 0.00294574]\n",
"TF: shape: (768, 768) values: [ 0.00352847 0.02330176 -0.00369894 -0.03904612 0.00294574]\n",
"\n",
"bert/encoder/layer_5/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.01087186 -0.01176561 0.00016575 -0.01163023 0.00946616]\n",
"TF: shape: (768,) values: [-0.01087186 -0.01176561 0.00016575 -0.01163023 0.00946616]\n",
"\n",
"bert/encoder/layer_5/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.06134222 0.04238288 0.02796064 -0.01284983 0.03683741]\n",
"TF: shape: (768, 768) values: [ 0.06134222 0.04238288 0.02796064 -0.01284983 0.03683741]\n",
"\n",
"bert/encoder/layer_5/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053 -0.00025261 0.0437019 ]\n",
"TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053 -0.00025261 0.0437019 ]\n",
"\n",
"bert/encoder/layer_5/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00739815 0.0533964 -0.03736389 -0.04999201 0.01693069]\n",
"TF: shape: (768, 768) values: [-0.00739815 0.0533964 -0.03736389 -0.04999201 0.01693069]\n",
"\n",
"bert/encoder/layer_5/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0021682 0.01711399 -0.04201518 0.01605333 0.00552063]\n",
"TF: shape: (768,) values: [-0.0021682 0.01711399 -0.04201518 0.01605333 0.00552063]\n",
"\n",
"bert/encoder/layer_5/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06841327 -0.0146848 0.09792476 -0.23284538 0.2785602 ]\n",
"TF: shape: (768,) values: [-0.06841327 -0.0146848 0.09792476 -0.23284538 0.2785602 ]\n",
"\n",
"bert/encoder/layer_5/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8908311 0.87884724 0.81637293 0.8047641 0.96539867]\n",
"TF: shape: (768,) values: [0.8908311 0.87884724 0.81637293 0.8047641 0.96539867]\n",
"\n",
"bert/encoder/layer_5/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.03246041 0.07251058 -0.08201726 0.00772481 0.02532209]\n",
"TF: shape: (768, 3072) values: [-0.03246041 0.07251058 -0.08201726 0.00772481 0.02532209]\n",
"\n",
"bert/encoder/layer_5/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
"TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
"\n",
"bert/encoder/layer_5/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [ 0.0642072 -0.01738782 -0.05095377 0.00523853 0.04425264]\n",
"TF: shape: (3072, 768) values: [ 0.0642072 -0.01738782 -0.05095377 0.00523853 0.04425264]\n",
"\n",
"bert/encoder/layer_5/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0007217 0.06006297 0.0016595 0.03848181 0.06703516]\n",
"TF: shape: (768,) values: [-0.0007217 0.06006297 0.0016595 0.03848181 0.06703516]\n",
"\n",
"bert/encoder/layer_5/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047 0.06023621 -0.18672828]\n",
"TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047 0.06023621 -0.18672828]\n",
"\n",
"bert/encoder/layer_5/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8621183 0.8515807 0.82654256 0.81729776 0.7985204 ]\n",
"TF: shape: (768,) values: [0.8621183 0.8515807 0.82654256 0.81729776 0.7985204 ]\n",
"\n",
"bert/encoder/layer_6/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.02527807 -0.01429243 0.01467054 0.08624706 -0.00188593]\n",
"TF: shape: (768, 768) values: [-0.02527807 -0.01429243 0.01467054 0.08624706 -0.00188593]\n",
"\n",
"bert/encoder/layer_6/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.17319514 0.27564248 0.16801168 -0.10946485 0.1643271 ]\n",
"TF: shape: (768,) values: [-0.17319514 0.27564248 0.16801168 -0.10946485 0.1643271 ]\n",
"\n",
"bert/encoder/layer_6/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.05886372 0.00706217 0.0398422 0.00882155 -0.04571463]\n",
"TF: shape: (768, 768) values: [ 0.05886372 0.00706217 0.0398422 0.00882155 -0.04571463]\n",
"\n",
"bert/encoder/layer_6/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00424696 -0.0001192 0.0046079 -0.00315606 0.00434314]\n",
"TF: shape: (768,) values: [-0.00424696 -0.0001192 0.0046079 -0.00315606 0.00434314]\n",
"\n",
"bert/encoder/layer_6/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01720381 0.01170722 0.02346902 -0.02284313 -0.03173028]\n",
"TF: shape: (768, 768) values: [-0.01720381 0.01170722 0.02346902 -0.02284313 -0.03173028]\n",
"\n",
"bert/encoder/layer_6/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03492057 0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
"TF: shape: (768,) values: [-0.03492057 0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
"\n",
"bert/encoder/layer_6/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.0323688 -0.00689882 0.07379091 0.01121114 -0.02059202]\n",
"TF: shape: (768, 768) values: [ 0.0323688 -0.00689882 0.07379091 0.01121114 -0.02059202]\n",
"\n",
"bert/encoder/layer_6/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
"TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
"\n",
"bert/encoder/layer_6/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06793639 0.03157783 0.15647687 -0.15025291 0.14727171]\n",
"TF: shape: (768,) values: [-0.06793639 0.03157783 0.15647687 -0.15025291 0.14727171]\n",
"\n",
"bert/encoder/layer_6/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8882361 0.8704905 0.80289173 0.77365315 0.92333615]\n",
"TF: shape: (768,) values: [0.8882361 0.8704905 0.80289173 0.77365315 0.92333615]\n",
"\n",
"bert/encoder/layer_6/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [ 0.04492201 0.05160861 0.09041415 -0.00742628 0.048133 ]\n",
"TF: shape: (768, 3072) values: [ 0.04492201 0.05160861 0.09041415 -0.00742628 0.048133 ]\n",
"\n",
"bert/encoder/layer_6/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.09301704 -0.158612 -0.10633879 -0.09706812 -0.17319229]\n",
"TF: shape: (3072,) values: [-0.09301704 -0.158612 -0.10633879 -0.09706812 -0.17319229]\n",
"\n",
"bert/encoder/layer_6/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.00085372 -0.00974195 0.00684915 0.00038686 0.06610142]\n",
"TF: shape: (3072, 768) values: [-0.00085372 -0.00974195 0.00684915 0.00038686 0.06610142]\n",
"\n",
"bert/encoder/layer_6/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03254414 0.05681704 0.03720434 0.01936359 0.09134153]\n",
"TF: shape: (768,) values: [-0.03254414 0.05681704 0.03720434 0.01936359 0.09134153]\n",
"\n",
"bert/encoder/layer_6/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0117129 -0.03209404 -0.08646043 0.03760341 -0.13841423]\n",
"TF: shape: (768,) values: [-0.0117129 -0.03209404 -0.08646043 0.03760341 -0.13841423]\n",
"\n",
"bert/encoder/layer_6/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8674175 0.8657014 0.8151861 0.82301307 0.8305737 ]\n",
"TF: shape: (768,) values: [0.8674175 0.8657014 0.8151861 0.82301307 0.8305737 ]\n",
"\n",
"bert/encoder/layer_7/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00075523 -0.01501983 0.04090893 0.01884826 0.04670674]\n",
"TF: shape: (768, 768) values: [-0.00075523 -0.01501983 0.04090893 0.01884826 0.04670674]\n",
"\n",
"bert/encoder/layer_7/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.0010344 -0.00423982 0.3117479 0.04494623 -0.01260845]\n",
"TF: shape: (768,) values: [ 0.0010344 -0.00423982 0.3117479 0.04494623 -0.01260845]\n",
"\n",
"bert/encoder/layer_7/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.02781927 -0.00906972 0.02121989 0.0298591 0.05854786]\n",
"TF: shape: (768, 768) values: [ 0.02781927 -0.00906972 0.02121989 0.0298591 0.05854786]\n",
"\n",
"bert/encoder/layer_7/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00074918 0.00731079 0.00089338 0.00345652 0.00043817]\n",
"TF: shape: (768,) values: [-0.00074918 0.00731079 0.00089338 0.00345652 0.00043817]\n",
"\n",
"bert/encoder/layer_7/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01080035 -0.03468366 0.03167168 0.01583073 0.0327719 ]\n",
"TF: shape: (768, 768) values: [-0.01080035 -0.03468366 0.03167168 0.01583073 0.0327719 ]\n",
"\n",
"bert/encoder/layer_7/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.02824226 0.01605172 0.00067929 -0.04553111 0.0076044 ]\n",
"TF: shape: (768,) values: [-0.02824226 0.01605172 0.00067929 -0.04553111 0.0076044 ]\n",
"\n",
"bert/encoder/layer_7/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.05496112 0.01006968 0.02206531 -0.01873116 0.02149118]\n",
"TF: shape: (768, 768) values: [-0.05496112 0.01006968 0.02206531 -0.01873116 0.02149118]\n",
"\n",
"bert/encoder/layer_7/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084 -0.0342187 0.02965918]\n",
"TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084 -0.0342187 0.02965918]\n",
"\n",
"bert/encoder/layer_7/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.02826844 0.04427591 0.05678326 -0.0475907 0.16136196]\n",
"TF: shape: (768,) values: [-0.02826844 0.04427591 0.05678326 -0.0475907 0.16136196]\n",
"\n",
"bert/encoder/layer_7/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8742141 0.870608 0.79147685 0.7595279 0.9223656 ]\n",
"TF: shape: (768,) values: [0.8742141 0.870608 0.79147685 0.7595279 0.9223656 ]\n",
"\n",
"bert/encoder/layer_7/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644 0.03019998 0.05691092 0.03717208]\n",
"TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644 0.03019998 0.05691092 0.03717208]\n",
"\n",
"bert/encoder/layer_7/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
"TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
"\n",
"bert/encoder/layer_7/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.02190432 -0.02279165 0.03279508 0.01011065 -0.07793335]\n",
"TF: shape: (3072, 768) values: [-0.02190432 -0.02279165 0.03279508 0.01011065 -0.07793335]\n",
"\n",
"bert/encoder/layer_7/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.04282642 0.03700675 0.06142357 -0.04787201 0.02958163]\n",
"TF: shape: (768,) values: [-0.04282642 0.03700675 0.06142357 -0.04787201 0.02958163]\n",
"\n",
"bert/encoder/layer_7/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
"TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
"\n",
"bert/encoder/layer_7/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.83858097 0.8179645 0.80693793 0.81225365 0.7844832 ]\n",
"TF: shape: (768,) values: [0.83858097 0.8179645 0.80693793 0.81225365 0.7844832 ]\n",
"\n",
"bert/encoder/layer_8/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [0.0448719 0.02289526 0.03083764 0.03048073 0.02436891]\n",
"TF: shape: (768, 768) values: [0.0448719 0.02289526 0.03083764 0.03048073 0.02436891]\n",
"\n",
"bert/encoder/layer_8/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.25132924 -0.23753347 0.02581017 0.00901509 0.18424493]\n",
"TF: shape: (768,) values: [-0.25132924 -0.23753347 0.02581017 0.00901509 0.18424493]\n",
"\n",
"bert/encoder/layer_8/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01999719 0.00711403 0.03949134 -0.0102224 0.03152475]\n",
"TF: shape: (768, 768) values: [-0.01999719 0.00711403 0.03949134 -0.0102224 0.03152475]\n",
"\n",
"bert/encoder/layer_8/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 5.5668897e-05 3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
" -4.4074579e-04]\n",
"TF: shape: (768,) values: [ 5.5668897e-05 3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
" -4.4074579e-04]\n",
"\n",
"bert/encoder/layer_8/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00736056 -0.01795213 0.00104576 -0.00034653 0.03190543]\n",
"TF: shape: (768, 768) values: [-0.00736056 -0.01795213 0.00104576 -0.00034653 0.03190543]\n",
"\n",
"bert/encoder/layer_8/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.02892835 0.00642501 -0.03608712 0.00264269 -0.0245198 ]\n",
"TF: shape: (768,) values: [ 0.02892835 0.00642501 -0.03608712 0.00264269 -0.0245198 ]\n",
"\n",
"bert/encoder/layer_8/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.03971623 0.05307067 -0.01298818 0.00946693 -0.00121235]\n",
"TF: shape: (768, 768) values: [ 0.03971623 0.05307067 -0.01298818 0.00946693 -0.00121235]\n",
"\n",
"bert/encoder/layer_8/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103 0.004484 0.0240819 ]\n",
"TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103 0.004484 0.0240819 ]\n",
"\n",
"bert/encoder/layer_8/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06004262 0.0457275 0.08688109 -0.14416659 -0.05500487]\n",
"TF: shape: (768,) values: [-0.06004262 0.0457275 0.08688109 -0.14416659 -0.05500487]\n",
"\n",
"bert/encoder/layer_8/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8907534 0.89116573 0.811639 0.7810443 0.9045574 ]\n",
"TF: shape: (768,) values: [0.8907534 0.89116573 0.811639 0.7810443 0.9045574 ]\n",
"\n",
"bert/encoder/layer_8/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624 0.03397145 0.02457482]\n",
"TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624 0.03397145 0.02457482]\n",
"\n",
"bert/encoder/layer_8/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.08129632 -0.1691108 -0.10681771 -0.10392351 -0.13120006]\n",
"TF: shape: (3072,) values: [-0.08129632 -0.1691108 -0.10681771 -0.10392351 -0.13120006]\n",
"\n",
"bert/encoder/layer_8/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.04683433 -0.02690669 0.02979059 0.02223369 -0.00130287]\n",
"TF: shape: (3072, 768) values: [-0.04683433 -0.02690669 0.02979059 0.02223369 -0.00130287]\n",
"\n",
"bert/encoder/layer_8/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.09155537 -0.04465394 0.05649116 -0.09628641 0.11875238]\n",
"TF: shape: (768,) values: [-0.09155537 -0.04465394 0.05649116 -0.09628641 0.11875238]\n",
"\n",
"bert/encoder/layer_8/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
"TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
"\n",
"bert/encoder/layer_8/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
"TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
"\n",
"bert/encoder/layer_9/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.08004542 -0.0143706 -0.04219061 -0.05175152 -0.01147588]\n",
"TF: shape: (768, 768) values: [ 0.08004542 -0.0143706 -0.04219061 -0.05175152 -0.01147588]\n",
"\n",
"bert/encoder/layer_9/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.14508031 0.40926442 -0.3281781 -0.02869792 -0.26104516]\n",
"TF: shape: (768,) values: [-0.14508031 0.40926442 -0.3281781 -0.02869792 -0.26104516]\n",
"\n",
"bert/encoder/layer_9/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01337681 0.00615428 -0.0455939 0.03379053 -0.01992556]\n",
"TF: shape: (768, 768) values: [-0.01337681 0.00615428 -0.0455939 0.03379053 -0.01992556]\n",
"\n",
"bert/encoder/layer_9/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0051302 0.0083288 0.00377641 0.00928865 -0.00418182]\n",
"TF: shape: (768,) values: [-0.0051302 0.0083288 0.00377641 0.00928865 -0.00418182]\n",
"\n",
"bert/encoder/layer_9/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.02485976 -0.0301923 0.00984638 -0.02495162 0.01074037]\n",
"TF: shape: (768, 768) values: [-0.02485976 -0.0301923 0.00984638 -0.02495162 0.01074037]\n",
"\n",
"bert/encoder/layer_9/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.04229928 -0.02636711 0.0060447 0.00222829 0.04979481]\n",
"TF: shape: (768,) values: [-0.04229928 -0.02636711 0.0060447 0.00222829 0.04979481]\n",
"\n",
"bert/encoder/layer_9/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.01258144 0.00871274 0.00482882 -0.00675888 -0.04390825]\n",
"TF: shape: (768, 768) values: [-0.01258144 0.00871274 0.00482882 -0.00675888 -0.04390825]\n",
"\n",
"bert/encoder/layer_9/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.02457753 0.05051134 -0.06890804 -0.00962795 0.00864793]\n",
"TF: shape: (768,) values: [ 0.02457753 0.05051134 -0.06890804 -0.00962795 0.00864793]\n",
"\n",
"bert/encoder/layer_9/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.08963391 -0.06362236 0.0676669 -0.09895685 0.08318913]\n",
"TF: shape: (768,) values: [-0.08963391 -0.06362236 0.0676669 -0.09895685 0.08318913]\n",
"\n",
"bert/encoder/layer_9/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931 0.7660444 0.8912934 ]\n",
"TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931 0.7660444 0.8912934 ]\n",
"\n",
"bert/encoder/layer_9/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [ 0.06290598 0.0203122 -0.05384256 0.05442941 0.00484769]\n",
"TF: shape: (768, 3072) values: [ 0.06290598 0.0203122 -0.05384256 0.05442941 0.00484769]\n",
"\n",
"bert/encoder/layer_9/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
"TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
"\n",
"bert/encoder/layer_9/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [ 0.05487705 0.01644666 0.00436198 -0.00490768 -0.03238423]\n",
"TF: shape: (3072, 768) values: [ 0.05487705 0.01644666 0.00436198 -0.00490768 -0.03238423]\n",
"\n",
"bert/encoder/layer_9/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438 0.09897955]\n",
"TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438 0.09897955]\n",
"\n",
"bert/encoder/layer_9/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
"TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
"\n",
"bert/encoder/layer_9/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8250572 0.83477134 0.7794141 0.81264955 0.7827918 ]\n",
"TF: shape: (768,) values: [0.8250572 0.83477134 0.7794141 0.81264955 0.7827918 ]\n",
"\n",
"bert/encoder/layer_10/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.00071212 -0.00853064 0.01776993 0.03189976 0.02183623]\n",
"TF: shape: (768, 768) values: [ 0.00071212 -0.00853064 0.01776993 0.03189976 0.02183623]\n",
"\n",
"bert/encoder/layer_10/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913 0.00118343 -0.05489838]\n",
"TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913 0.00118343 -0.05489838]\n",
"\n",
"bert/encoder/layer_10/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.0494106 0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
"TF: shape: (768, 768) values: [-0.0494106 0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
"\n",
"bert/encoder/layer_10/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00692997 0.00855893 0.00670777 -0.0052475 -0.00017074]\n",
"TF: shape: (768,) values: [-0.00692997 0.00855893 0.00670777 -0.0052475 -0.00017074]\n",
"\n",
"bert/encoder/layer_10/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.01911842 0.04858809 -0.02608485 0.00794924 -0.02246636]\n",
"TF: shape: (768, 768) values: [ 0.01911842 0.04858809 -0.02608485 0.00794924 -0.02246636]\n",
"\n",
"bert/encoder/layer_10/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0133503 -0.01224133 -0.0051834 -0.00232528 0.00148614]\n",
"TF: shape: (768,) values: [-0.0133503 -0.01224133 -0.0051834 -0.00232528 0.00148614]\n",
"\n",
"bert/encoder/layer_10/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.05904732 0.02616 0.00794104 -0.02889086 -0.03692576]\n",
"TF: shape: (768, 768) values: [-0.05904732 0.02616 0.00794104 -0.02889086 -0.03692576]\n",
"\n",
"bert/encoder/layer_10/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267 0.00907548]\n",
"TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267 0.00907548]\n",
"\n",
"bert/encoder/layer_10/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.10986238 -0.04332284 0.02603893 -0.06236923 0.14469369]\n",
"TF: shape: (768,) values: [-0.10986238 -0.04332284 0.02603893 -0.06236923 0.14469369]\n",
"\n",
"bert/encoder/layer_10/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8515822 0.81392974 0.836747 0.78040504 0.88091415]\n",
"TF: shape: (768,) values: [0.8515822 0.81392974 0.836747 0.78040504 0.88091415]\n",
"\n",
"bert/encoder/layer_10/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [-0.07061081 0.06997397 0.01433633 0.04150929 0.02865192]\n",
"TF: shape: (768, 3072) values: [-0.07061081 0.06997397 0.01433633 0.04150929 0.02865192]\n",
"\n",
"bert/encoder/layer_10/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043 -0.15043251 -0.10193057]\n",
"TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043 -0.15043251 -0.10193057]\n",
"\n",
"bert/encoder/layer_10/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [ 0.02918765 0.02609882 -0.02259856 0.01636725 -0.00038442]\n",
"TF: shape: (3072, 768) values: [ 0.02918765 0.02609882 -0.02259856 0.01636725 -0.00038442]\n",
"\n",
"bert/encoder/layer_10/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.01799502 0.10970547 -0.02384165 -0.03350981 0.10491351]\n",
"TF: shape: (768,) values: [-0.01799502 0.10970547 -0.02384165 -0.03350981 0.10491351]\n",
"\n",
"bert/encoder/layer_10/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.00999107 -0.0217309 -0.0854177 -0.01109101 -0.07902174]\n",
"TF: shape: (768,) values: [ 0.00999107 -0.0217309 -0.0854177 -0.01109101 -0.07902174]\n",
"\n",
"bert/encoder/layer_10/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.8272796 0.8597452 0.79116803 0.81267637 0.8273501 ]\n",
"TF: shape: (768,) values: [0.8272796 0.8597452 0.79116803 0.81267637 0.8273501 ]\n",
"\n",
"bert/encoder/layer_11/attention/self/query/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523 0.06226195 0.02193764]\n",
"TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523 0.06226195 0.02193764]\n",
"\n",
"bert/encoder/layer_11/attention/self/query/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.0501296 0.11886728 0.2186807 0.08720991 -0.20476632]\n",
"TF: shape: (768,) values: [ 0.0501296 0.11886728 0.2186807 0.08720991 -0.20476632]\n",
"\n",
"bert/encoder/layer_11/attention/self/key/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496 0.04210597 0.01783857]\n",
"TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496 0.04210597 0.01783857]\n",
"\n",
"bert/encoder/layer_11/attention/self/key/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.0007798 -0.00065806 -0.00010521 0.00119144 -0.00180091]\n",
"TF: shape: (768,) values: [-0.0007798 -0.00065806 -0.00010521 0.00119144 -0.00180091]\n",
"\n",
"bert/encoder/layer_11/attention/self/value/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515 0.04519828]\n",
"TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515 0.04519828]\n",
"\n",
"bert/encoder/layer_11/attention/self/value/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.01502306 -0.00530942 0.00023572 0.00205218 -0.00578036]\n",
"TF: shape: (768,) values: [ 0.01502306 -0.00530942 0.00023572 0.00205218 -0.00578036]\n",
"\n",
"bert/encoder/layer_11/attention/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [ 0.02361419 0.03112707 -0.00063031 0.04209773 -0.02434015]\n",
"TF: shape: (768, 768) values: [ 0.02361419 0.03112707 -0.00063031 0.04209773 -0.02434015]\n",
"\n",
"bert/encoder/layer_11/attention/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [ 0.02566087 0.0028438 -0.00475678 0.02149458 -0.01755187]\n",
"TF: shape: (768,) values: [ 0.02566087 0.0028438 -0.00475678 0.02149458 -0.01755187]\n",
"\n",
"bert/encoder/layer_11/attention/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03134411 0.01207957 -0.04636396 -0.03013046 0.07944281]\n",
"TF: shape: (768,) values: [-0.03134411 0.01207957 -0.04636396 -0.03013046 0.07944281]\n",
"\n",
"bert/encoder/layer_11/attention/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.85203767 0.8020145 0.8554237 0.8150477 0.8441815 ]\n",
"TF: shape: (768,) values: [0.85203767 0.8020145 0.8554237 0.8150477 0.8441815 ]\n",
"\n",
"bert/encoder/layer_11/intermediate/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212 0.00206979 -0.04366514 -0.00716808]\n",
"TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212 0.00206979 -0.04366514 -0.00716808]\n",
"\n",
"bert/encoder/layer_11/intermediate/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
"TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
"\n",
"bert/encoder/layer_11/output/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (3072, 768) values: [-0.022382 0.01073206 -0.01357213 0.02484621 0.01403091]\n",
"TF: shape: (3072, 768) values: [-0.022382 0.01073206 -0.01357213 0.02484621 0.01403091]\n",
"\n",
"bert/encoder/layer_11/output/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.06574099 0.04207807 0.01201084 0.00229322 0.05551811]\n",
"TF: shape: (768,) values: [-0.06574099 0.04207807 0.01201084 0.00229322 0.05551811]\n",
"\n",
"bert/encoder/layer_11/output/LayerNorm/beta\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.00634605 -0.01989403 0.04628465 0.01585056 -0.04256899]\n",
"TF: shape: (768,) values: [-0.00634605 -0.01989403 0.04628465 0.01585056 -0.04256899]\n",
"\n",
"bert/encoder/layer_11/output/LayerNorm/gamma\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [0.6384234 0.6300364 0.66570055 0.6126921 0.63756436]\n",
"TF: shape: (768,) values: [0.6384234 0.6300364 0.66570055 0.6126921 0.63756436]\n",
"\n",
"bert/pooler/dense/kernel\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768, 768) values: [-0.00127425 0.00199868 -0.03863145 -0.00139355 0.00691627]\n",
"TF: shape: (768, 768) values: [-0.00127425 0.00199868 -0.03863145 -0.00139355 0.00691627]\n",
"\n",
"bert/pooler/dense/bias\n",
"|sum(pt_wts - tf_wts)| = 0.0\n",
"PT: shape: (768,) values: [-0.03597581 -0.00389536 0.05181352 0.02224747 -0.00493723]\n",
"TF: shape: (768,) values: [-0.03597581 -0.00389536 0.05181352 0.02224747 -0.00493723]\n",
"\n"
]
}
],
"source": [
"tensors_to_transpose = (\n",
"    \"dense.weight\",\n",
"    \"attention.self.query\",\n",
"    \"attention.self.key\",\n",
"    \"attention.self.value\"\n",
")\n",
"var_map = (\n",
"    ('layer.', 'layer_'),\n",
"    ('word_embeddings.weight', 'word_embeddings'),\n",
"    ('position_embeddings.weight', 'position_embeddings'),\n",
"    ('token_type_embeddings.weight', 'token_type_embeddings'),\n",
"    ('.', '/'),\n",
"    ('LayerNorm/weight', 'LayerNorm/gamma'),\n",
"    ('LayerNorm/bias', 'LayerNorm/beta'),\n",
"    ('weight', 'kernel')\n",
")\n",
"\n",
"def to_tf_var_name(name: str) -> str:\n",
"    \"\"\"Map a pytorch state-dict key to its TF checkpoint variable name.\"\"\"\n",
"    for patt, repl in var_map:\n",
"        name = name.replace(patt, repl)\n",
"    return 'bert/{}'.format(name)\n",
"\n",
"def strip_device_suffix(name: str) -> str:\n",
"    \"\"\"Drop the trailing ':0' output index from a TF variable name.\n",
"\n",
"    NOTE: name.strip(':0') is wrong here -- str.strip removes *any* run of\n",
"    trailing ':' / '0' characters, so a variable name that legitimately\n",
"    ends in '0' would be corrupted. Splitting on the last ':' is exact.\n",
"    \"\"\"\n",
"    return name.rsplit(':', 1)[0]\n",
"\n",
"tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n",
"pt_vars = {}\n",
"for v, T in pt_model.state_dict().items():\n",
"    T = T.detach().numpy()\n",
"    # TF stores dense kernels as the transpose of pytorch nn.Linear weights.\n",
"    if any(x in v for x in tensors_to_transpose):\n",
"        T = T.T\n",
"    pt_vars[to_tf_var_name(v)] = T\n",
"\n",
"for var_name in tf_vars:\n",
"    # Do NOT call these 'pt'/'tf': binding 'tf' here would shadow the\n",
"    # tensorflow module for every later cell in this notebook.\n",
"    pt_w = pt_vars[strip_device_suffix(var_name)]\n",
"    tf_w = tf_vars[var_name]\n",
"\n",
"    print(strip_device_suffix(var_name))\n",
"\n",
"    # Assert equivalence via the sum of absolute differences; a signed sum\n",
"    # (the previous |sum(pt - tf)|) could cancel out and mask mismatches.\n",
"    abs_diff = np.sum(np.abs(pt_w - tf_w), keepdims=False)\n",
"    print(\"sum(|pt_wts - tf_wts|) = {}\".format(abs_diff))\n",
"    assert not abs_diff\n",
"\n",
"    if len(pt_w.shape) == 2:\n",
"        print(\"PT: shape: {0} values: {1}\".format(pt_w.shape, pt_w[0, :5]))\n",
"        print(\"TF: shape: {0} values: {1}\".format(tf_w.shape, tf_w[0, :5]))\n",
"    else:\n",
"        print(\"PT: shape: {0} values: {1}\".format(pt_w.shape, pt_w[:5]))\n",
"        print(\"TF: shape: {0} values: {1}\".format(tf_w.shape, tf_w[:5]))\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare Layer-12 Projections"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE: 2.7155439966009e-05\n",
"PT-values: [-0.876663 -0.41088238 -0.12200808 0.44941 0.19445966]\n",
"TF-values: [-0.8742865 -0.40621698 -0.10585472 0.444904 0.1825743 ]\n"
]
}
],
"source": [
"# Mean Squared Error (MSE) between the final projection of each model\n",
"squared_err = (pt_embedding - tf_embedding) ** 2\n",
"MSE = squared_err.mean()\n",
"print(\"MSE: {}\".format(MSE))\n",
"print(\"PT-values: {}\".format(pt_embedding[0, :5]))\n",
"print(\"TF-values: {}\".format(tf_embedding[0, :5]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "nlp",
"language": "python",
"name": "nlp"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}