# train_get.py
  1. import cv2
  2. import tqdm
  3. # import wandb
  4. import torch
  5. import numpy as np
  6. from torchvision import transforms
  7. from block.dataset_get import CustomDataset
  8. from block.val_get import val_get
  9. from block.model_ema import model_ema
  10. from block.lr_get import adam, lr_adjust
  11. def train_get(args, model_dict, loss):
  12. # 加载模型
  13. model = model_dict['model'].to(args.device, non_blocking=args.latch)
  14. print(model)
  15. # 数据集
  16. print("加载训练集至内存中...")
  17. train_transform = transforms.Compose([
  18. transforms.RandomHorizontalFlip(), # 随机水平翻转
  19. transforms.RandomCrop(args.input_size, padding=4), # 随机裁剪并填充
  20. transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1), # 颜色抖动
  21. transforms.ToTensor(), # 将图像转换为PyTorch张量
  22. transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # 标准化
  23. ])
  24. train_dataset = CustomDataset(data_dir=args.train_dir, image_size=(args.input_size, args.input_size), transform=train_transform)
  25. train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
  26. train_shuffle = False if args.distributed else True # 分布式设置sampler后shuffle要为False
  27. train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch, shuffle=train_shuffle,
  28. drop_last=True, pin_memory=args.latch, num_workers=args.num_worker,
  29. sampler=train_sampler)
  30. print("加载验证集至内存中...")
  31. val_transform = transforms.Compose([
  32. transforms.ToTensor(), # 将图像转换为PyTorch张量
  33. transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # 标准化
  34. ])
  35. val_dataset = CustomDataset(data_dir=args.test_dir, image_size=(args.input_size, args.input_size), transform=val_transform)
  36. val_sampler = None # 分布式时数据合在主GPU上进行验证
  37. val_batch = args.batch // args.device_number # 分布式验证时batch要减少为一个GPU的量
  38. val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch, shuffle=False,
  39. drop_last=False, pin_memory=args.latch, num_workers=args.num_worker,
  40. sampler=val_sampler)
  41. # 学习率
  42. optimizer = adam(args.regularization, args.r_value, model.parameters(), lr=args.lr_start, betas=(0.937, 0.999))
  43. optimizer.load_state_dict(model_dict['optimizer_state_dict']) if model_dict['optimizer_state_dict'] else None
  44. train_len = train_dataset.__len__()
  45. step_epoch = train_len // args.batch // args.device_number * args.device_number # 每轮的步数
  46. print(train_len // args.batch)
  47. print(step_epoch)
  48. optimizer_adjust = lr_adjust(args, step_epoch, model_dict['epoch_finished']) # 学习率调整函数
  49. optimizer = optimizer_adjust(optimizer) # 学习率初始化
  50. # 使用平均指数移动(EMA)调整参数(不能将ema放到args中,否则会导致模型保存出错)
  51. ema = model_ema(model) if args.ema else None
  52. if args.ema:
  53. ema.updates = model_dict['ema_updates']
  54. # 分布式初始化
  55. model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
  56. output_device=args.local_rank) if args.distributed else model
  57. # wandb
  58. # if args.wandb and args.local_rank == 0:
  59. # wandb_image_list = [] # 记录所有的wandb_image最后一起添加(最多添加args.wandb_image_num张)
  60. epoch_base = model_dict['epoch_finished'] + 1 # 新的一轮要+1
  61. for epoch in range(epoch_base, args.epoch + 1): # 训练
  62. print(f'\n-----------------------第{epoch}轮-----------------------') if args.local_rank == 0 else None
  63. model.train()
  64. train_loss = 0 # 记录损失
  65. if args.local_rank == 0: # tqdm
  66. tqdm_show = tqdm.tqdm(total=step_epoch)
  67. for index, (image_batch, true_batch) in enumerate(train_dataloader):
  68. # if args.wandb and args.local_rank == 0 and len(wandb_image_list) < args.wandb_image_num:
  69. # wandb_image_batch = (image_batch * 255).cpu().numpy().astype(np.uint8).transpose(0, 2, 3, 1)
  70. image_batch = image_batch.to(args.device, non_blocking=args.latch)
  71. true_batch = true_batch.to(args.device, non_blocking=args.latch)
  72. if args.amp:
  73. with torch.cuda.amp.autocast():
  74. pred_batch = model(image_batch)
  75. loss_batch = loss(pred_batch, true_batch)
  76. args.amp.scale(loss_batch).backward()
  77. args.amp.step(optimizer)
  78. args.amp.update()
  79. optimizer.zero_grad()
  80. else:
  81. pred_batch = model(image_batch)
  82. loss_batch = loss(pred_batch, true_batch)
  83. loss_batch.backward()
  84. optimizer.step()
  85. optimizer.zero_grad()
  86. # 调整参数,ema.updates会自动+1
  87. ema.update(model) if args.ema else None
  88. # 记录损失
  89. train_loss += loss_batch.item()
  90. # 调整学习率
  91. optimizer = optimizer_adjust(optimizer)
  92. # tqdm
  93. if args.local_rank == 0:
  94. tqdm_show.set_postfix({'train_loss': loss_batch.item(),
  95. 'lr': optimizer.param_groups[0]['lr']}) # 添加显示
  96. tqdm_show.update(args.device_number) # 更新进度条
  97. # wandb
  98. # if args.wandb and args.local_rank == 0 and epoch == 0 and len(wandb_image_list) < args.wandb_image_num:
  99. # cls = true_batch.cpu().numpy().tolist()
  100. # for i in range(len(wandb_image_batch)): # 遍历每一张图片
  101. # image = wandb_image_batch[i]
  102. # text = ['{:.0f}'.format(_) for _ in cls[i]]
  103. # text = text[0] if len(text) == 1 else '--'.join(text)
  104. # image = np.ascontiguousarray(image) # 将数组的内存变为连续存储(cv2画图的要求)
  105. # cv2.putText(image, text, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
  106. # wandb_image = wandb.Image(image)
  107. # wandb_image_list.append(wandb_image)
  108. # if len(wandb_image_list) == args.wandb_image_num:
  109. # break
  110. # tqdm
  111. if args.local_rank == 0:
  112. tqdm_show.close()
  113. # 计算平均损失
  114. train_loss /= index + 1
  115. if args.local_rank == 0:
  116. print(f'\n| 训练 | train_loss:{train_loss:.4f} | lr:{optimizer.param_groups[0]["lr"]:.6f} |\n')
  117. # 清理显存空间
  118. del image_batch, true_batch, pred_batch, loss_batch
  119. torch.cuda.empty_cache()
  120. # 验证
  121. if args.local_rank == 0: # 分布式时只验证一次
  122. val_loss, accuracy = val_get(args, val_dataloader, model, loss, ema, val_dataset.__len__())
  123. # 保存
  124. if args.local_rank == 0: # 分布式时只保存一次
  125. model_dict['model'] = model.module if args.distributed else model
  126. model_dict['epoch_finished'] = epoch
  127. model_dict['optimizer_state_dict'] = optimizer.state_dict()
  128. model_dict['ema_updates'] = ema.updates if args.ema else model_dict['ema_updates']
  129. model_dict['train_loss'] = train_loss
  130. model_dict['val_loss'] = val_loss
  131. model_dict['val_accuracy'] = accuracy
  132. torch.save(model.state_dict(), args.save_path_last if not args.prune else 'prune_last.pt') # 保存最后一次训练的模型
  133. if accuracy > 0.5 and accuracy > model_dict['standard']:
  134. model_dict['standard'] = accuracy
  135. save_path = args.save_path if not args.prune else args.prune_save
  136. torch.save(model.state_dict(), save_path) # 保存最佳模型
  137. print(f'| 保存最佳模型:{save_path} | accuracy:{accuracy:.4f} |')
  138. # wandb
  139. # if args.wandb:
  140. # wandb_log = {}
  141. # if epoch == 0:
  142. # wandb_log.update({f'image/train_image': wandb_image_list})
  143. # wandb_log.update({'metric/train_loss': train_loss,
  144. # 'metric/val_loss': val_loss,
  145. # 'metric/val_accuracy': accuracy
  146. # })
  147. # args.wandb_run.log(wandb_log)
  148. torch.distributed.barrier() if args.distributed else None # 分布式时每轮训练后让所有GPU进行同步,快的GPU会在此等待