Generate: assistant should be greedy in assisted decoding (#30778)

* assistant should be greedy

* better comment

* Update src/transformers/generation/candidate_generator.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Joao Gante 2024-05-13 16:08:45 +01:00 committed by GitHub
parent 94306352f4
commit 2e27291ce4
2 changed files with 11 additions and 0 deletions

src/transformers/generation/candidate_generator.py

@@ -150,6 +150,12 @@ class AssistedCandidateGenerator(CandidateGenerator):
        self.generation_config.return_dict_in_generate = True
        self.generation_config.output_scores = True
        # Disable sampling -- this implementation of assisted generation/speculative decoding uses the assistant
        # greedily to maximize matches. Disables sampling-related flags to prevent warnings
        self.generation_config.do_sample = False
        for attr in ("temperature", "top_p", "min_p", "typical_p", "top_k", "epsilon_cutoff", "eta_cutoff"):
            setattr(self.generation_config, attr, None)
        # avoid unnecessary warnings that min_length is larger than max_new_tokens
        # remove the `MinLengthLogitsProcessor` if exists (NOTE: no need to check for `MinNewTokensLogitsProcessor`)
        self.main_model_min_length = self.generation_config.min_length
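For context, a minimal usage sketch (the checkpoint names are only illustrative, not part of this change): when `model.generate()` is called with an `assistant_model`, any sampling flags govern how the main model draws/accepts tokens, while the candidate generator above now forces the assistant itself to decode greedily.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoints -- any main/assistant pair sharing a tokenizer works.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
assistant = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Assisted decoding lets a small model draft tokens", return_tensors="pt")

# The sampling flags below apply to the main model only; internally,
# AssistedCandidateGenerator sets do_sample=False (and nulls temperature,
# top_p, min_p, ...) on the assistant so its drafts maximize matches.
outputs = model.generate(
    **inputs,
    assistant_model=assistant,
    do_sample=True,
    temperature=0.7,
    max_new_tokens=32,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```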

src/transformers/generation/configuration_utils.py

@@ -496,6 +496,11 @@ class GenerationConfig(PushToHubMixin):
                    greedy_wrong_parameter_msg.format(flag_name="top_p", flag_value=self.top_p),
                    UserWarning,
                )
            if self.min_p is not None:
                warnings.warn(
                    greedy_wrong_parameter_msg.format(flag_name="min_p", flag_value=self.min_p),
                    UserWarning,
                )
            if self.typical_p is not None and self.typical_p != 1.0:
                warnings.warn(
                    greedy_wrong_parameter_msg.format(flag_name="typical_p", flag_value=self.typical_p),
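A small sketch of how the new `min_p` check is expected to surface (expected behavior under this change, not code from the PR): a config that is greedy (`do_sample=False`) but still sets `min_p` should now emit the same "only used in sample-based generation modes" UserWarning that `temperature`, `top_p`, and `typical_p` already trigger.

```python
import warnings

from transformers import GenerationConfig

# do_sample=False means greedy/beam search; min_p only matters when sampling,
# so validation is expected to warn that the flag will be ignored.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    GenerationConfig(do_sample=False, min_p=0.05)

print([str(w.message) for w in caught])  # expect a UserWarning mentioning `min_p`
```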