Fix bug for checkpoint saving on multi node training setting (#28078)

* add multi-node training setting

* fix style
This commit is contained in:
dumpmemory 2023-12-16 00:18:56 +08:00 committed by GitHub
parent dec84b3211
commit 1c286be508
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 3 additions and 1 deletion

View File

@@ -2386,7 +2386,9 @@ class Trainer:
self.args.distributed_state.wait_for_everyone()
# Then go through the rewriting process starting on process 0
if staging_output_dir != output_dir:
with self.args.main_process_first(desc="Renaming model checkpoint folder to true location"):
with self.args.main_process_first(
desc="Renaming model checkpoint folder to true location", local=self.args.save_on_each_node
):
if os.path.exists(staging_output_dir):
os.rename(staging_output_dir, output_dir)