# dataset_process.py
  1. # 本py文件主要用于数据隐私保护以及watermarking_trigger的插入。
  2. from watermark_generate.tools import logger_tool
  3. from watermark_generate.tools.picture_watermark import PictureWatermarkEmbeder
  4. from PIL import Image, ImageDraw
  5. import os
  6. import random
  7. logger = logger_tool.logger
  8. def watermark_dataset_with_bits(secret, dataset_txt_path, dataset_name):
  9. """
  10. 数据集嵌入密码标签
  11. :param secret: 密码标签
  12. :param dataset_txt_path: 数据集标签文件位置
  13. :param dataset_name: 数据集名称,要求数据集名称必须是图片路径一部分,用于生成嵌入密码标签数据集的新文件夹
  14. """
  15. logger.debug(f'secret:{secret},dataset_txt_path:{dataset_txt_path},dataset_name:{dataset_name}')
  16. with open(dataset_txt_path, 'r') as f:
  17. lines = f.readlines()
  18. embeder = PictureWatermarkEmbeder(secret) # 初始化水印嵌入器
  19. count = 0
  20. wm_dataset_path = None
  21. # 遍历每一行,对图片进行水印插入
  22. for line in lines:
  23. img_path = line.strip().split() # 图片路径和标签
  24. img_path = img_path[0] # 使用索引[0]获取路径字符串
  25. new_img_path = img_path.replace(dataset_name, f'{dataset_name}_wm')
  26. wm_dataset_path = os.path.dirname(new_img_path)
  27. if not os.path.exists(wm_dataset_path):
  28. os.makedirs(wm_dataset_path)
  29. embeder.embed(img_path, new_img_path)
  30. if not embeder.verify():
  31. os.remove(new_img_path) # 嵌入失败,删除生成的水印图片
  32. else:
  33. count += 1
  34. logger.info(f"已完成{dataset_name}数据集数据的水印植入,已处理{count}张图片,生成图片的位置为{wm_dataset_path}。")
  35. def modify_images_and_labels(train_txt_path, percentage=1, min_num_patches=5, max_num_patches=10):
  36. """
  37. 重新定义功能:
  38. 1. train_txt_path 是包含了待处理图片的绝对路径
  39. 2. percentage 是约束需要处理多少比例的图片
  40. 3. 每张图插入 noise patch 的数量应该在 5~10 之间
  41. 4. noise patch 的大小为 10x10
  42. 5. 修改的 bounding box 大小也要随机
  43. """
  44. logger.debug(f'train_txt_path:{train_txt_path},percentage:{percentage},min_num_patches:{min_num_patches},max_num_patches={max_num_patches}')
  45. # 读取图片绝对路径
  46. with open(train_txt_path, 'r') as file:
  47. lines = file.readlines()
  48. # 随机选择一定比例的图片
  49. num_images = len(lines)
  50. num_samples = int(num_images * (percentage / 100))
  51. logger.info(f'处理样本数量{num_samples}')
  52. selected_lines = random.sample(lines, num_samples)
  53. for line in selected_lines:
  54. # 解析每一行,获取图片路径
  55. image_path = line.strip().split()[0]
  56. # 打开图片并添加噪声
  57. img = Image.open(image_path)
  58. print(image_path)
  59. draw = ImageDraw.Draw(img)
  60. # 在图片的任意位置添加随机数量和大小的噪声块
  61. num_noise_patches = random.randint(min_num_patches, max_num_patches)
  62. for _ in range(num_noise_patches):
  63. # 添加 10x10 大小的噪声块
  64. patch_size = 10
  65. x = random.randint(0, img.width - patch_size)
  66. y = random.randint(0, img.height - patch_size)
  67. draw.rectangle([x, y, x + patch_size, y + patch_size], fill=(128, 0, 128))
  68. # 读取相应的 bounding box 文件路径
  69. label_path = image_path.replace('images', 'labels').replace('.jpg', '.txt')
  70. # 读取 bounding box 信息并修改
  71. with open(label_path, 'a') as label_file:
  72. # 随机生成 bounding box 大小
  73. box_width = random.uniform(0.5, 1)
  74. box_height = random.uniform(0.5, 1)
  75. # 计算 bounding box 的中心点坐标
  76. cx = (x + patch_size / 2) / img.width
  77. cy = (y + patch_size / 2) / img.height
  78. label_file.write(f"0 {cx} {cy} {box_width} {box_height}\n")
  79. # 保存修改后的图片
  80. img.save(image_path)
  81. logger.info(f"已修改{len(selected_lines)}张图片并更新了 bounding box。")
  82. if __name__ == '__main__':
  83. # import argparse
  84. # parser = argparse.ArgumentParser(description='')
  85. # parser.add_argument('--watermarking_dir', default='./dataset/watermarking', type=str, help='水印存储位')
  86. # parser.add_argument('--encoder_number', default='512', type=str, help='选择插入的字符长度')
  87. # parser.add_argument('--key_path', default='./dataset/watermarking/key_hex.txt', type=str, help='密钥存储位')
  88. # parser.add_argument('--dataset_txt_path', default='./dataset/CIFAR-10/train.txt', type=str, help='train or test')
  89. # parser.add_argument('--dataset_name', default='CIFAR-10', type=str, help='CIFAR-10')
  90. # 运行示例
  91. # 测试密钥生成和二维码功能
  92. # 功能1 完成以bits形式的水印密钥生成、水印密钥插入、水印模型数据预处理
  93. watermarking_dir = '/home/yhsun/ObjectDetection-main/datasets/watermarking'
  94. # generate_random_key_and_qrcodes(30, watermarking_dir) # 生成128字节的密钥,并进行测试
  95. noise_color = (128, 0, 128)
  96. key_path = '/home/yhsun/ObjectDetection-main/datasets/watermarking/key_hex.txt'
  97. dataset_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/train.txt'
  98. dataset_name = 'VOC2007'
  99. # watermark_dataset_with_bits(key_path, dataset_txt_path, dataset_name)
  100. # dataset_test_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/test.txt'
  101. # dataset_val_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007/val.txt'
  102. # watermark_dataset_with_bits(key_path, dataset_test_txt_path, dataset_name)
  103. # watermark_dataset_with_bits(key_path, dataset_val_txt_path, dataset_name)
  104. # 这里是处理部分数据添加noise patch 以实现model watermarked
  105. train_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007_wm/train.txt' # 替换为实际的 train.txt 文件路径
  106. modify_images_and_labels(train_txt_path, percentage=5)
  107. val_txt_path = '/home/yhsun/ObjectDetection-main/datasets/VOC2007_wm/val.txt'
  108. modify_images_and_labels(train_txt_path, percentage=100)
  109. # # 功能2 数据预处理部分,train 和 test 的处理方式不同哦
  110. # train_txt_path = './datasets/coco/train_png.txt'
  111. # modify_images_and_labels(train_txt_path, percentage=1, min_samples_per_class=10)
  112. # test_txt_path = './datasets/coco/val_png.txt'
  113. # modify_images_and_labels(test_txt_path, percentage=100, min_samples_per_class=10)
  114. # # 功能3 完成以QR图像的形式水印插入
  115. # # model = modify_images_and_labels('./path/to/train.txt')
  116. # data_test_path = './dataset/New_dataset/testtest.txt'
  117. # watermark_dataset_with_QRimage(QR_file=watermarking_dir, dataset_txt_path=data_test_path, dataset_name='New_dataset')
  118. # 需要注意的是 功能1 2 3 的调用原则:
  119. # 以bit插入的形式 就需要注销功能3
  120. # 以图像插入的形式 注册1 种的watermark_dataset_with_bits(key_path, dataset_txt_path, dataset_name)