From 122d5c52acb70c368aa09328e12281760e01ce75 Mon Sep 17 00:00:00 2001
From: VictorSanh
Date: Thu, 6 Jun 2019 17:02:51 +0200
Subject: [PATCH] distinguish what is not trained

---
 hubconfs/bert_hubconf.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 14e5a17239..7cd2a123c0 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -214,7 +214,8 @@ def bertForSequenceClassification(*args, **kwargs):
     """
     BertForSequenceClassification is a fine-tuning model that includes
     BertModel and a sequence-level (sequence or pair of sequences) classifier
-    on top of the BertModel.
+    on top of the BertModel. Note that the classification head is only initialized
+    and has to be trained.
 
     The sequence-level classifier is a linear layer that takes as input the
     last hidden state of the first token in the input sequence
@@ -252,7 +253,8 @@ def bertForMultipleChoice(*args, **kwargs):
     """
     BertForMultipleChoice is a fine-tuning model that includes BertModel and a
-    linear layer on top of the BertModel.
+    linear layer on top of the BertModel. Note that the multiple choice head is
+    only initialized and has to be trained.
 
     Args:
         num_choices: the number (>=2) of classes for the classifier.
 
@@ -287,7 +289,8 @@ def bertForQuestionAnswering(*args, **kwargs):
     """
     BertForQuestionAnswering is a fine-tuning model that includes BertModel
     with token-level classifiers on top of the full sequence of last hidden
-    states.
+    states. Note that the classification head is only initialized
+    and has to be trained.
 
     Example:
         # Load the tokenizer
@@ -318,7 +321,8 @@ def bertForTokenClassification(*args, **kwargs):
     """
     BertForTokenClassification is a fine-tuning model that includes BertModel
-    and a token-level classifier on top of the BertModel.
+    and a token-level classifier on top of the BertModel. Note that the classification
+    head is only initialized and has to be trained.
 
     The token-level classifier is a linear layer that takes as input the last
     hidden state of the sequence.
 
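
Reviewer note (not part of the patch): a minimal sketch of what the added
caveat means in practice, following the torch.hub loading pattern used by the
examples in this file. The checkpoint name 'bert-base-cased', num_labels=2,
the learning rate, and the toy input ids are illustrative assumptions. The
encoder weights come from the pre-trained checkpoint, but the classifier head
starts from a fresh initialization, so it has to be fine-tuned before its
predictions mean anything:

    import torch

    # Load bertForSequenceClassification through torch.hub (repo string and
    # entry-point name follow this file's own examples).
    model = torch.hub.load('huggingface/pytorch-pretrained-BERT',
                           'bertForSequenceClassification',
                           'bert-base-cased', num_labels=2)

    # One illustrative fine-tuning step: the sequence-level head only becomes
    # useful after training on labeled data.
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    input_ids = torch.tensor([[101, 7592, 102]])  # toy ids: [CLS] hello [SEP]
    labels = torch.tensor([0])
    loss = model(input_ids, labels=labels)  # forward returns the loss when labels are given
    loss.backward()
    optimizer.step()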
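
The same caveat applies to the token-level heads. A companion sketch for
bertForQuestionAnswering, under the same assumptions as above: without
start_positions/end_positions the forward pass returns start and end logits,
and straight after loading those logits come from untrained classifiers:

    import torch

    qa_model = torch.hub.load('huggingface/pytorch-pretrained-BERT',
                              'bertForQuestionAnswering', 'bert-base-cased')
    qa_model.eval()
    input_ids = torch.tensor([[101, 7592, 102]])  # toy ids, as above
    with torch.no_grad():
        start_logits, end_logits = qa_model(input_ids)
    # These span scores are meaningless until the heads are fine-tuned
    # (e.g. on SQuAD); only the underlying BertModel weights are pre-trained.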