train_embeder.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. import os
  2. import cv2
  3. import tqdm
  4. import wandb
  5. import torch
  6. import numpy as np
  7. import albumentations
  8. import tool.secret_func
  9. from block.val_get import val_get
  10. from block.model_ema import model_ema
  11. from block.lr_get import adam, lr_adjust
  12. from tool.training_embedding import Embedding
  13. '''
  14. 训练过程嵌入白盒水印
  15. '''
  16. def train_embeder(args, data_dict, model_dict, loss):
  17. # 加载模型
  18. model = model_dict['model'].to(args.device, non_blocking=args.latch)
  19. print(model)
  20. # 获取密码标签
  21. secret = tool.secret_func.get_secret(256)
  22. key_path = os.path.join(os.path.dirname(args.save_path), "key.pt")
  23. os.makedirs(os.path.dirname(key_path), exist_ok=True)
  24. # 初始化白盒水印编码器
  25. embeder = Embedding(model=model, code=secret, key_path=key_path)
  26. # 学习率
  27. optimizer = adam(args.regularization, args.r_value, model.parameters(), lr=args.lr_start, betas=(0.937, 0.999))
  28. optimizer.load_state_dict(model_dict['optimizer_state_dict']) if model_dict['optimizer_state_dict'] else None
  29. step_epoch = len(data_dict['train']) // args.batch // args.device_number * args.device_number # 每轮的步数
  30. print(len(data_dict['train']) // args.batch)
  31. print(step_epoch)
  32. optimizer_adjust = lr_adjust(args, step_epoch, model_dict['epoch_finished']) # 学习率调整函数
  33. optimizer = optimizer_adjust(optimizer) # 学习率初始化
  34. # 使用平均指数移动(EMA)调整参数(不能将ema放到args中,否则会导致模型保存出错)
  35. ema = model_ema(model) if args.ema else None
  36. if args.ema:
  37. ema.updates = model_dict['ema_updates']
  38. # 数据集
  39. train_dataset = torch_dataset(args, 'train', data_dict['train'], data_dict['class'])
  40. train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
  41. train_shuffle = False if args.distributed else True # 分布式设置sampler后shuffle要为False
  42. train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch, shuffle=train_shuffle,
  43. drop_last=True, pin_memory=args.latch, num_workers=args.num_worker,
  44. sampler=train_sampler)
  45. # 验证集不对图像进行处理
  46. val_dataset = torch_dataset(args, 'test', data_dict['test'], data_dict['class'])
  47. val_sampler = None # 分布式时数据合在主GPU上进行验证
  48. val_batch = args.batch // args.device_number # 分布式验证时batch要减少为一个GPU的量
  49. val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch, shuffle=False,
  50. drop_last=False, pin_memory=args.latch, num_workers=args.num_worker,
  51. sampler=val_sampler)
  52. # 分布式初始化
  53. model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
  54. output_device=args.local_rank) if args.distributed else model
  55. # wandb
  56. if args.wandb and args.local_rank == 0:
  57. wandb_image_list = [] # 记录所有的wandb_image最后一起添加(最多添加args.wandb_image_num张)
  58. epoch_base = model_dict['epoch_finished'] + 1 # 新的一轮要+1
  59. for epoch in range(epoch_base, args.epoch + 1): # 训练
  60. print(f'\n-----------------------第{epoch}轮-----------------------') if args.local_rank == 0 else None
  61. model.train()
  62. train_loss = 0 # 记录损失
  63. if args.local_rank == 0: # tqdm
  64. tqdm_show = tqdm.tqdm(total=step_epoch)
  65. for index, (image_batch, true_batch) in enumerate(train_dataloader):
  66. if args.wandb and args.local_rank == 0 and len(wandb_image_list) < args.wandb_image_num:
  67. wandb_image_batch = (image_batch * 255).cpu().numpy().astype(np.uint8).transpose(0, 2, 3, 1)
  68. image_batch = image_batch.to(args.device, non_blocking=args.latch)
  69. true_batch = true_batch.to(args.device, non_blocking=args.latch)
  70. if args.amp:
  71. with torch.cuda.amp.autocast():
  72. pred_batch = model(image_batch)
  73. loss_batch = loss(pred_batch, true_batch)
  74. # 添加水印惩罚项
  75. loss_batch = embeder.add_penalty(loss_batch)
  76. args.amp.scale(loss_batch).backward()
  77. args.amp.step(optimizer)
  78. args.amp.update()
  79. optimizer.zero_grad()
  80. else:
  81. pred_batch = model(image_batch)
  82. loss_batch = loss(pred_batch, true_batch)
  83. # 添加水印惩罚项
  84. loss_batch = embeder.add_penalty(loss_batch)
  85. loss_batch.backward()
  86. optimizer.step()
  87. optimizer.zero_grad()
  88. # 调整参数,ema.updates会自动+1
  89. ema.update(model) if args.ema else None
  90. # 记录损失
  91. train_loss += loss_batch.item()
  92. # 调整学习率
  93. optimizer = optimizer_adjust(optimizer)
  94. # tqdm
  95. if args.local_rank == 0:
  96. tqdm_show.set_postfix({'train_loss': loss_batch.item(),
  97. 'lr': optimizer.param_groups[0]['lr']}) # 添加显示
  98. tqdm_show.update(args.device_number) # 更新进度条
  99. # wandb
  100. if args.wandb and args.local_rank == 0 and epoch == 0 and len(wandb_image_list) < args.wandb_image_num:
  101. cls = true_batch.cpu().numpy().tolist()
  102. for i in range(len(wandb_image_batch)): # 遍历每一张图片
  103. image = wandb_image_batch[i]
  104. text = ['{:.0f}'.format(_) for _ in cls[i]]
  105. text = text[0] if len(text) == 1 else '--'.join(text)
  106. image = np.ascontiguousarray(image) # 将数组的内存变为连续存储(cv2画图的要求)
  107. cv2.putText(image, text, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
  108. wandb_image = wandb.Image(image)
  109. wandb_image_list.append(wandb_image)
  110. if len(wandb_image_list) == args.wandb_image_num:
  111. break
  112. # tqdm
  113. if args.local_rank == 0:
  114. tqdm_show.close()
  115. # 计算平均损失
  116. train_loss /= index + 1
  117. if args.local_rank == 0:
  118. print(f'\n| 训练 | train_loss:{train_loss:.4f} | lr:{optimizer.param_groups[0]["lr"]:.6f} |\n')
  119. # 清理显存空间
  120. del image_batch, true_batch, pred_batch, loss_batch
  121. torch.cuda.empty_cache()
  122. # 验证
  123. if args.local_rank == 0: # 分布式时只验证一次
  124. val_loss, accuracy = val_get(args, val_dataloader, model, loss, ema, len(data_dict['test']))
  125. # 保存
  126. if args.local_rank == 0: # 分布式时只保存一次
  127. model_dict['model'] = model.module if args.distributed else model
  128. model_dict['epoch_finished'] = epoch
  129. model_dict['optimizer_state_dict'] = optimizer.state_dict()
  130. model_dict['ema_updates'] = ema.updates if args.ema else model_dict['ema_updates']
  131. model_dict['class'] = data_dict['class']
  132. model_dict['train_loss'] = train_loss
  133. model_dict['val_loss'] = val_loss
  134. model_dict['val_accuracy'] = accuracy
  135. torch.save(model_dict, args.save_path_last if not args.prune else 'prune_last.pt') # 保存最后一次训练的模型
  136. if accuracy > 0.5 and accuracy > model_dict['standard']:
  137. model_dict['standard'] = accuracy
  138. save_path = args.save_path if not args.prune else args.prune_save
  139. torch.save(model_dict, save_path) # 保存最佳模型
  140. print(f'| 保存最佳模型:{save_path} | accuracy:{(100 * accuracy):.4f}% |')
  141. # wandb
  142. if args.wandb:
  143. wandb_log = {}
  144. if epoch == 0:
  145. wandb_log.update({f'image/train_image': wandb_image_list})
  146. wandb_log.update({'metric/train_loss': train_loss,
  147. 'metric/val_loss': val_loss,
  148. 'metric/val_accuracy': accuracy})
  149. args.wandb_run.log(wandb_log)
  150. torch.distributed.barrier() if args.distributed else None # 分布式时每轮训练后让所有GPU进行同步,快的GPU会在此等待
  151. class torch_dataset(torch.utils.data.Dataset):
  152. def __init__(self, args, tag, data, class_name):
  153. self.tag = tag
  154. self.data = data
  155. self.class_name = class_name
  156. self.noise_probability = args.noise
  157. self.noise = albumentations.Compose([
  158. albumentations.GaussianBlur(blur_limit=(5, 5), p=0.2),
  159. albumentations.GaussNoise(var_limit=(10.0, 30.0), p=0.2), ], )
  160. self.transform = albumentations.Compose([
  161. albumentations.LongestMaxSize(args.input_size),
  162. albumentations.PadIfNeeded(min_height=args.input_size, min_width=args.input_size,
  163. border_mode=cv2.BORDER_CONSTANT, value=(128, 128, 128))])
  164. self.rgb_mean = (0.406, 0.456, 0.485)
  165. self.rgb_std = (0.225, 0.224, 0.229)
  166. def __len__(self):
  167. return len(self.data)
  168. def __getitem__(self, index):
  169. image = cv2.imread(self.data[index][0]) # 读取图片
  170. if self.tag == 'train' and torch.rand(1) < self.noise_probability: # 使用数据加噪
  171. image = self.noise(image=image)['image']
  172. image = self.transform(image=image)['image'] # 缩放和填充图片
  173. image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # 转为RGB通道
  174. image = self._image_deal(image) # 归一化、转换为tensor、调维度
  175. label = torch.tensor(self.data[index][1], dtype=torch.float32) # 转换为tensor
  176. return image, label
  177. def _image_deal(self, image): # 归一化、转换为tensor、调维度
  178. image = torch.tensor(image / 255, dtype=torch.float32).permute(2, 0, 1)
  179. return image