# resume.py 1.1 KB
# Resume all interrupted training runs in the yolov5/ directory, including DDP runs
# Usage: $ python utils/aws/resume.py
import os
import shlex
import sys
from pathlib import Path

import torch
import yaml
  8. sys.path.append('./') # to run '$ python *.py' files in subdirectories
  9. port = 0 # --master_port
  10. path = Path('').resolve()
  11. for last in path.rglob('*/**/last.pt'):
  12. ckpt = torch.load(last)
  13. if ckpt['optimizer'] is None:
  14. continue
  15. # Load opt.yaml
  16. with open(last.parent.parent / 'opt.yaml') as f:
  17. opt = yaml.load(f, Loader=yaml.SafeLoader)
  18. # Get device count
  19. d = opt['device'].split(',') # devices
  20. nd = len(d) # number of devices
  21. ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel
  22. if ddp: # multi-GPU
  23. port += 1
  24. cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
  25. else: # single-GPU
  26. cmd = f'python train.py --resume {last}'
  27. cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread
  28. print(cmd)
  29. os.system(cmd)