# dataset_process.py
  1. # 本py文件主要用于数据隐私保护以及watermarking_trigger的插入。
  2. from watermark_generate.tools import logger_tool
  3. from watermark_generate.tools.picture_watermark import PictureWatermarkEmbeder
  4. from PIL import Image, ImageDraw
  5. import os
  6. import random
  7. logger = logger_tool.logger
  8. # 获取文件扩展名
  9. def get_file_extension(filename):
  10. return filename.rsplit('.', 1)[1].lower()
  11. def dataset_embed_label(label, src_img_path, dst_img_path):
  12. """
  13. 数据集嵌入密码标签
  14. :param label: 密码标签
  15. :param src_img_path: 数据集图片目录
  16. :param dst_img_path: 嵌入水印图片存放目录
  17. """
  18. src_img_path = os.path.normpath(src_img_path)
  19. dst_img_path = os.path.normpath(dst_img_path)
  20. logger.debug(f'secret:{label},src_img_path:{src_img_path},dst_img_path:{dst_img_path}')
  21. filename_list = os.listdir(src_img_path) # 获取数据集图片目录下的所有图片
  22. embeder = PictureWatermarkEmbeder(label) # 初始化水印嵌入器
  23. count = 0
  24. # 遍历每一行,对图片进行水印插入
  25. for filename in filename_list:
  26. img_path = f'{src_img_path}/{filename}' # 图片路径和标签
  27. new_img_path = f'{dst_img_path}/{filename}'
  28. if not os.path.exists(dst_img_path):
  29. os.makedirs(dst_img_path)
  30. embeder.embed(img_path, new_img_path)
  31. if not embeder.verify():
  32. os.remove(new_img_path) # 嵌入失败,删除生成的水印图片
  33. else:
  34. count += 1
  35. logger.info(f"已完成数据集数据的水印植入,已处理{count}张图片,生成图片的位置为{dst_img_path}。")
  36. def process_dataset_label(img_path, label_path, percentage=1, min_num_patches=5, max_num_patches=10):
  37. """
  38. 处理数据集和
  39. :param img_path: 数据集图片位置
  40. :param label_path: 数据集标签位置
  41. :param percentage: 更改数量百分比:1~100
  42. :param min_num_patches: 嵌入噪声最小数量,默认为5
  43. :param max_num_patches: 嵌入噪声最大数量,默认为10
  44. """
  45. logger.debug(
  46. f'img_path:{img_path},label_path:{label_path},percentage:{percentage},min_num_patches:{min_num_patches},max_num_patches:{max_num_patches}')
  47. img_path = os.path.normpath(img_path)
  48. label_path = os.path.normpath(label_path)
  49. filename_list = os.listdir(img_path) # 获取数据集图片目录下的所有图片
  50. # 随机选择一定比例的图片
  51. num_images = len(filename_list)
  52. num_samples = int(num_images * (percentage / 100))
  53. logger.info(f'处理样本数量{num_samples}')
  54. selected_filenames = random.sample(filename_list, num_samples)
  55. for filename in selected_filenames:
  56. # 解析每一行,获取图片路径
  57. image_path = f'{img_path}/{filename}'
  58. # 打开图片并添加噪声
  59. img = Image.open(image_path)
  60. draw = ImageDraw.Draw(img)
  61. # 在图片的任意位置添加随机数量和大小的噪声块
  62. num_noise_patches = random.randint(min_num_patches, max_num_patches)
  63. for _ in range(num_noise_patches):
  64. # 添加 10x10 大小的噪声块
  65. patch_size = 10
  66. x = random.randint(0, img.width - patch_size)
  67. y = random.randint(0, img.height - patch_size)
  68. draw.rectangle([x, y, x + patch_size, y + patch_size], fill=(128, 0, 128))
  69. # 读取相应的 bounding box 文件路径
  70. label_file_path = f'{label_path}/{filename.replace(get_file_extension(filename), 'txt')}'
  71. # 读取 bounding box 信息并修改
  72. with open(label_file_path, 'a') as label_file:
  73. # 随机生成 bounding box 大小
  74. box_width = random.uniform(0.5, 1)
  75. box_height = random.uniform(0.5, 1)
  76. # 计算 bounding box 的中心点坐标
  77. cx = (x + patch_size / 2) / img.width
  78. cy = (y + patch_size / 2) / img.height
  79. label_file.write(f"0 {cx} {cy} {box_width} {box_height}\n")
  80. logger.debug(f'已修改图片[{image_path}]及其标签文件[{label_file_path}]')
  81. # 保存修改后的图片
  82. img.save(image_path)
  83. logger.info(f"已修改{len(selected_filenames)}张图片并更新了 bounding box。")
  84. def watermark_dataset_with_bits(secret, dataset_txt_path, dataset_name):
  85. """
  86. 数据集嵌入密码标签
  87. :param secret: 密码标签
  88. :param dataset_txt_path: 数据集标签文件位置
  89. :param dataset_name: 数据集名称,要求数据集名称必须是图片路径一部分,用于生成嵌入密码标签数据集的新文件夹
  90. """
  91. logger.debug(f'secret:{secret},dataset_txt_path:{dataset_txt_path},dataset_name:{dataset_name}')
  92. with open(dataset_txt_path, 'r') as f:
  93. lines = f.readlines()
  94. embeder = PictureWatermarkEmbeder(secret) # 初始化水印嵌入器
  95. count = 0
  96. wm_dataset_path = None
  97. # 遍历每一行,对图片进行水印插入
  98. for line in lines:
  99. img_path = line.strip().split() # 图片路径和标签
  100. img_path = img_path[0] # 使用索引[0]获取路径字符串
  101. new_img_path = img_path.replace(dataset_name, f'{dataset_name}_wm')
  102. wm_dataset_path = os.path.dirname(new_img_path)
  103. if not os.path.exists(wm_dataset_path):
  104. os.makedirs(wm_dataset_path)
  105. embeder.embed(img_path, new_img_path)
  106. if not embeder.verify():
  107. os.remove(new_img_path) # 嵌入失败,删除生成的水印图片
  108. else:
  109. count += 1
  110. logger.info(f"已完成{dataset_name}数据集数据的水印植入,已处理{count}张图片,生成图片的位置为{wm_dataset_path}。")
  111. def modify_images_and_labels(train_txt_path, percentage=1, min_num_patches=5, max_num_patches=10):
  112. """
  113. 重新定义功能:
  114. 1. train_txt_path 是包含了待处理图片的绝对路径
  115. 2. percentage 是约束需要处理多少比例的图片
  116. 3. 每张图插入 noise patch 的数量应该在 5~10 之间
  117. 4. noise patch 的大小为 10x10
  118. 5. 修改的 bounding box 大小也要随机
  119. """
  120. logger.debug(
  121. f'train_txt_path:{train_txt_path},percentage:{percentage},min_num_patches:{min_num_patches},max_num_patches={max_num_patches}')
  122. # 读取图片绝对路径
  123. with open(train_txt_path, 'r') as file:
  124. lines = file.readlines()
  125. # 随机选择一定比例的图片
  126. num_images = len(lines)
  127. num_samples = int(num_images * (percentage / 100))
  128. logger.info(f'处理样本数量{num_samples}')
  129. selected_lines = random.sample(lines, num_samples)
  130. for line in selected_lines:
  131. # 解析每一行,获取图片路径
  132. image_path = line.strip().split()[0]
  133. # 打开图片并添加噪声
  134. img = Image.open(image_path)
  135. print(image_path)
  136. draw = ImageDraw.Draw(img)
  137. # 在图片的任意位置添加随机数量和大小的噪声块
  138. num_noise_patches = random.randint(min_num_patches, max_num_patches)
  139. for _ in range(num_noise_patches):
  140. # 添加 10x10 大小的噪声块
  141. patch_size = 10
  142. x = random.randint(0, img.width - patch_size)
  143. y = random.randint(0, img.height - patch_size)
  144. draw.rectangle([x, y, x + patch_size, y + patch_size], fill=(128, 0, 128))
  145. # 读取相应的 bounding box 文件路径
  146. label_path = image_path.replace('images', 'labels').replace('.jpg', '.txt')
  147. # 读取 bounding box 信息并修改
  148. with open(label_path, 'a') as label_file:
  149. # 随机生成 bounding box 大小
  150. box_width = random.uniform(0.5, 1)
  151. box_height = random.uniform(0.5, 1)
  152. # 计算 bounding box 的中心点坐标
  153. cx = (x + patch_size / 2) / img.width
  154. cy = (y + patch_size / 2) / img.height
  155. label_file.write(f"0 {cx} {cy} {box_width} {box_height}\n")
  156. # 保存修改后的图片
  157. img.save(image_path)
  158. logger.info(f"已修改{len(selected_lines)}张图片并更新了 bounding box。")
  159. if __name__ == '__main__':
  160. # import argparse
  161. # parser = argparse.ArgumentParser(description='')
  162. # parser.add_argument('--watermarking_dir', default='./dataset/watermarking', type=str, help='水印存储位')
  163. # parser.add_argument('--encoder_number', default='512', type=str, help='选择插入的字符长度')
  164. # parser.add_argument('--key_path', default='./dataset/watermarking/key_hex.txt', type=str, help='密钥存储位')
  165. # parser.add_argument('--dataset_txt_path', default='./dataset/CIFAR-10/train.txt', type=str, help='train or test')
  166. # parser.add_argument('--dataset_name', default='CIFAR-10', type=str, help='CIFAR-10')
  167. # 运行示例
  168. # 测试密钥生成和二维码功能
  169. # 功能1 完成以bits形式的水印密钥生成、水印密钥插入、水印模型数据预处理
  170. watermarking_dir = '/home/yhsun/ObjectDetection-main/datasets/watermarking'
  171. # generate_random_key_and_qrcodes(30, watermarking_dir) # 生成128字节的密钥,并进行测试
  172. noise_color = (128, 0, 128)
  173. key_path = '/home/yhsun/ObjectDetection-main/datasets/watermarking/key_hex.txt'
  174. dataset_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/train.txt'
  175. dataset_name = 'VOC2007'
  176. # watermark_dataset_with_bits(key_path, dataset_txt_path, dataset_name)
  177. # dataset_test_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/test.txt'
  178. # dataset_val_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/val.txt'
  179. # watermark_dataset_with_bits(key_path, dataset_test_txt_path, dataset_name)
  180. # watermark_dataset_with_bits(key_path, dataset_val_txt_path, dataset_name)
  181. # 这里是处理部分数据添加noise patch 以实现model watermarked
  182. train_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007_wm/train.txt' # 替换为实际的 train.txt 文件路径
  183. modify_images_and_labels(train_txt_path, percentage=5)
  184. val_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007_wm/val.txt'
  185. modify_images_and_labels(train_txt_path, percentage=100)
  186. # # 功能2 数据预处理部分,train 和 test 的处理方式不同哦
  187. # train_txt_path = './datasets/coco/train_png.txt'
  188. # modify_images_and_labels(train_txt_path, percentage=1, min_samples_per_class=10)
  189. # test_txt_path = './datasets/coco/val_png.txt'
  190. # modify_images_and_labels(test_txt_path, percentage=100, min_samples_per_class=10)
  191. # # 功能3 完成以QR图像的形式水印插入
  192. # # model = modify_images_and_labels('./path/to/train.txt')
  193. # data_test_path = './dataset/New_dataset/testtest.txt'
  194. # watermark_dataset_with_QRimage(QR_file=watermarking_dir, dataset_txt_path=data_test_path, dataset_name='New_dataset')
  195. # 需要注意的是 功能1 2 3 的调用原则:
  196. # 以bit插入的形式 就需要注销功能3
  197. # 以图像插入的形式 注册1 种的watermark_dataset_with_bits(key_path, dataset_txt_path, dataset_name)