Grad-CAM用于图像分割


一、target的设计

class SemanticSegmentationTarget:
    """Grad-CAM target that scores one class inside a spatial mask.

    Calling the target returns the sum of the ``category`` channel of the
    model output, weighted element-wise by ``mask``. Grad-CAM invokes it as
    a callback to obtain the scalar it back-propagates.

    Args:
        category: Index of the class channel to score.
        mask: NumPy array multiplied element-wise with the selected class
            map (for example a 0/1 float mask of the same height/width).
    """

    def __init__(self, category, mask):
        self.category = category
        self.mask = torch.from_numpy(mask)
        # Kept from the original behaviour: eagerly place the mask on the GPU
        # when one is available, since that is usually where the model runs.
        if torch.cuda.is_available():
            self.mask = self.mask.cuda()

    def __call__(self, model_output):
        """Return the masked sum of the ``category`` channel.

        Args:
            model_output: Output for a single sample, indexed as
                ``[class, row, col]`` (no batch dimension).

        Returns:
            A 0-dim tensor holding the weighted sum.
        """
        # The model may live on a different device than the one guessed in
        # __init__ (e.g. a CPU model on a machine that has a GPU), so align
        # the mask with the actual output before multiplying.
        if self.mask.device != model_output.device:
            self.mask = self.mask.to(model_output.device)
        # Indexing with the class id drops the channel axis, leaving a 2-D map.
        return (model_output[self.category, :, :] * self.mask).sum()

# Only one target is built here, i.e. a batch size of 1 is assumed.
targets = [SemanticSegmentationTarget(car_category, car_mask_float)]

二、代码示例

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
from torchvision.models.segmentation import deeplabv3_resnet50
import torch
import torch.functional as F
import numpy as np
import requests
import torchvision
from PIL import Image
from pytorch_grad_cam.utils.image import show_cam_on_image, preprocess_image
import cv2


image_url = "https://farm1.staticflickr.com/6/9606553_ccc7518589_z.jpg"
# Fail fast on a stalled or unsuccessful download instead of hanging forever
# or handing an HTTP error page to PIL.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
# Force 3-channel RGB so grayscale / RGBA / palette images still match the
# 3-channel mean/std used below.
image = np.array(Image.open(response.raw).convert("RGB"))
rgb_img = np.float32(image) / 255
input_tensor = preprocess_image(rgb_img,
                                mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])
# Taken from the torchvision tutorial
# https://pytorch.org/vision/stable/auto_examples/plot_visualization_utils.html
model = deeplabv3_resnet50(pretrained=True, progress=False)
model = model.eval()

if torch.cuda.is_available():
    model = model.cuda()
    input_tensor = input_tensor.cuda()
# The input tensor has 3 channels.
print(input_tensor.shape)
# This forward pass only inspects the output type, so skip building an
# autograd graph for it.
with torch.no_grad():
    output = model(input_tensor)
print(type(output), output.keys())
# The raw model returns a dict rather than a tensor.
# print("------------------------------------------------------------------")
# 发现模型输出的是字典
# print("------------------------------------------------------------------")

class SegmentationModelOutputWrapper(torch.nn.Module):
    """Adapter that exposes only the ``"out"`` entry of a dict-returning model.

    The wrapped model is called unchanged; its result is indexed with the
    key ``"out"`` so that callers receive that single value instead of the
    whole mapping.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its ``"out"`` entry."""
        outputs = self.model(x)
        return outputs["out"]
    
# Wrap the model so it returns only the segmentation tensor, not a dict.
model = SegmentationModelOutputWrapper(model)
# Only the predicted mask is needed from this pass; Grad-CAM is given the
# input tensor later and runs the model itself, so no autograd graph is
# required here.
with torch.no_grad():
    output = model(input_tensor)
# The channel axis (dim 1) is expected to hold one score per class listed in
# sem_classes below.
print(output.shape)
# print("------------------------------------------------------------------")
# Softmax over the channel (class) axis.
normalized_masks = torch.nn.functional.softmax(output, dim=1).cpu()
sem_classes = [
    '__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
# Map each class name to its channel index.
sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(sem_classes)}

car_category = sem_class_to_idx["car"]
# Per-pixel predicted class for the first (only) image in the batch.
car_mask = normalized_masks[0].argmax(dim=0).numpy()
is_car = car_mask == car_category
# uint8 mask (0 / 255) for visualisation.
car_mask_uint8 = 255 * np.uint8(is_car)
# float32 mask (0.0 / 1.0) used as the Grad-CAM target weight.
car_mask_float = np.float32(is_car)

# Repeat the mask 3 times along a new last axis, then place it to the right
# of the original image.
both_images = np.hstack((image, np.repeat(car_mask_uint8[:, :, None], 3, axis=-1)))
both_images = cv2.cvtColor(both_images, cv2.COLOR_RGB2BGR)
# cv2.imwrite reports failure through its return value, so check it rather
# than silently continuing without an output file.
if not cv2.imwrite("both_images_seg.jpg", both_images):
    raise OSError("failed to write both_images_seg.jpg")

# print("------------------------------------------------------------------")

from pytorch_grad_cam import GradCAM

class SemanticSegmentationTarget:
    """Grad-CAM target that scores one class inside a spatial mask.

    Calling the target returns the sum of the ``category`` channel of the
    model output, weighted element-wise by ``mask``. Grad-CAM invokes it as
    a callback to obtain the scalar it back-propagates.

    Args:
        category: Index of the class channel to score.
        mask: NumPy array multiplied element-wise with the selected class
            map (for example a 0/1 float mask of the same height/width).
    """

    def __init__(self, category, mask):
        self.category = category
        self.mask = torch.from_numpy(mask)
        # Kept from the original behaviour: eagerly place the mask on the GPU
        # when one is available, since that is usually where the model runs.
        if torch.cuda.is_available():
            self.mask = self.mask.cuda()

    def __call__(self, model_output):
        """Return the masked sum of the ``category`` channel.

        Args:
            model_output: Output for a single sample, indexed as
                ``[class, row, col]`` (no batch dimension).

        Returns:
            A 0-dim tensor holding the weighted sum.
        """
        # The model may live on a different device than the one guessed in
        # __init__ (e.g. a CPU model on a machine that has a GPU), so align
        # the mask with the actual output before multiplying.
        if self.mask.device != model_output.device:
            self.mask = self.mask.to(model_output.device)
        # Indexing with the class id drops the channel axis, leaving a 2-D map.
        return (model_output[self.category, :, :] * self.mask).sum()

    
# A single target layer is used here.
target_layers = [model.model.backbone.layer4]

# Build the callback target from the class index (int) and the float mask.
# Only one target is created, i.e. a batch size of 1 is assumed.
targets = [SemanticSegmentationTarget(car_category, car_mask_float)]
print(type(targets))
print(type(targets[0]))

with GradCAM(model=model, target_layers=target_layers) as cam:
    # Index 0 selects the map of the first (only) image in the batch.
    grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0, :]
    cam_image = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)

# The overlay was requested in RGB order; OpenCV writes BGR, so convert first.
cam_image = cv2.cvtColor(cam_image, cv2.COLOR_RGB2BGR)
# cv2.imwrite reports failure through its return value, so check it rather
# than silently continuing without an output file.
if not cv2.imwrite("cam_image_seg.jpg", cam_image):
    raise OSError("failed to write cam_image_seg.jpg")
# print("------------------------------------------------------------------")

总结

  • 总结:
  • Layer:传入多个target layer时,各层的CAM会被聚合成1张图
  • BatchSize:batch中有多个样本时,每个样本各自输出1张CAM(输出的第0维是batch_size)

Author: Ruimin Huang
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under CC BY 4.0. If you reproduce an article, please credit the source: Ruimin Huang.
  TOC