Hands-On: Generating Heatmaps from YOLOv5

AI视觉实习生 · 2024-07-27

Environment requirements

My version is YOLOv5 7.0.

The results first:

(example heatmap outputs)

The results are for reference only.

Step 1

First, set up the YOLOv5 environment.

Installing the dependencies with `pip install -r requirements.txt` is enough.

My other posts cover the environment setup in detail.

Set up the GPU environment (CUDA-enabled PyTorch) yourself.
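A quick optional check (a minimal sketch, assuming PyTorch was installed from requirements.txt) confirms that the GPU is visible before going further:

```python
# Minimal environment check (optional)
import torch

print(torch.__version__)                  # installed PyTorch version
print(torch.cuda.is_available())          # True if the CUDA setup works
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # GPU that will be used with --device cuda
```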

Step 2

Run YOLOv5 once to make sure inference works and produces its normal detection output.

Step 3

Add a main_gradcam.py file under the project folder.

main_gradcam.py

```python
import os
import random
import time
import argparse
import numpy as np
from models.gradcam import YOLOV5GradCAM, YOLOV5GradCAMPP
from models.yolov5_object_detector import YOLOV5TorchObjectDetector
import cv2

# dataset class names (COCO)
names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
         'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
         'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
         'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
         'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
         'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
         'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
         'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
         'hair drier', 'toothbrush']  # class names

# the three detection-layer activations in the yolov5s network
target_layers = ['model_17_cv3_act', 'model_20_cv3_act', 'model_23_cv3_act']
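# Tip: on a custom model the layer names may differ. One way to list candidate
# names is, for example, right after the detector wrapper is built inside main():
#     for name, _ in model.model.named_modules():
#         print(name.replace('.', '_'))
# and then pick the activation layers that feed the Detect head.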

# Arguments
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', type=str, default="yolov5s.pt", help='Path to the model')
parser.add_argument('--img-path', type=str, default='data/images/bus.jpg', help='input image path')
parser.add_argument('--output-dir', type=str, default='runs/result17', help='output dir')
parser.add_argument('--img-size', type=int, default=640, help="input image size")
parser.add_argument('--target-layer', type=str, default='model_17_cv3_act',
                    help='The layer hierarchical address to which gradcam will be applied,'
                         ' the names should be separated by underline')
parser.add_argument('--method', type=str, default='gradcam', help='gradcam method')
parser.add_argument('--device', type=str, default='cuda', help='cuda or cpu')
parser.add_argument('--no_text_box', action='store_true',
                    help='do not show label and box on the heatmap')
args = parser.parse_args()

def get_res_img(bbox, mask, res_img):
    mask = mask.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy().astype(
        np.uint8)
    heatmap = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
    # n_heatmat = (Box.fill_outer_box(heatmap, bbox) / 255).astype(np.float32)
    n_heatmat = (heatmap / 255).astype(np.float32)
    res_img = res_img / 255
    res_img = cv2.add(res_img, n_heatmat)
    res_img = (res_img / res_img.max())
    return res_img, n_heatmat


def plot_one_box(x, img, color=None, label=None, line_thickness=3):
    # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
    cv2.imwrite('temp.jpg', (img * 255).astype(np.uint8))
    img = cv2.imread('temp.jpg')

    # Plots one bounding box on image img
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        outside = c1[1] - t_size[1] - 3 >= 0  # label fits outside box up
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 if outside else c1[1] + t_size[1] + 3
        outsize_right = c2[0] - img.shape[:2][1] > 0  # label fits outside box right
        c1 = c1[0] - (c2[0] - img.shape[:2][1]) if outsize_right else c1[0], c1[1]
        c2 = c2[0] - (c2[0] - img.shape[:2][1]) if outsize_right else c2[0], c2[1]
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(img, label, (c1[0], c1[1] - 2 if outside else c2[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf,
                    lineType=cv2.LINE_AA)
    return img

# run Grad-CAM on a single image
def main(img_path):
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]
    device = args.device
    input_size = (args.img_size, args.img_size)
    # read the image (OpenCV loads it as BGR)
    img = cv2.imread(img_path)
    print('[INFO] Loading the model')
    # instantiate the YOLOv5 detector wrapper
    model = YOLOV5TorchObjectDetector(args.model_path, device, img_size=input_size, names=names)
    # img[..., ::-1]: BGR --> RGB
    # (480, 640, 3) --> (1, 3, 480, 640)
    torch_img = model.preprocessing(img[..., ::-1])
    tic = time.time()
    # iterate over the three detection layers
    for target_layer in target_layers:
        # build the chosen Grad-CAM variant for this layer
        if args.method == 'gradcam':
            saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
        elif args.method == 'gradcampp':
            saliency_method = YOLOV5GradCAMPP(model=model, layer_name=target_layer, img_size=input_size)
        masks, logits, [boxes, _, class_names, conf] = saliency_method(torch_img)  # predictions + saliency masks
        result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
        result = result[..., ::-1]  # convert to BGR

        # output settings
        image_name = os.path.basename(img_path)  # image file name
        save_path = f'{args.output_dir}/{image_name[:-4]}/{args.method}'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        print(f'[INFO] Saving the final image at {save_path}')

        # iterate over every detected object in the image
        for i, mask in enumerate(masks):
            res_img = result.copy()
            # box and class of this object
            bbox, cls_name = boxes[0][i], class_names[0][i]
            label = f'{cls_name} {conf[0][i]}'  # class + confidence score
            # overlay this object's heatmap
            res_img, heat_map = get_res_img(bbox, mask, res_img)
            res_img = plot_one_box(bbox, res_img, label=label, color=colors[int(names.index(cls_name))],
                                   line_thickness=3)
            # resize back to the original image size
            res_img = cv2.resize(res_img, dsize=(img.shape[:-1][::-1]))
            output_path = f'{save_path}/{target_layer[6:8]}_{i}.jpg'
            cv2.imwrite(output_path, res_img)
            print(f'{target_layer[6:8]}_{i}.jpg done!!')
    print(f'Total time : {round(time.time() - tic, 4)} s')


if __name__ == '__main__':
    # if the path is a folder, run on every image inside it
    if os.path.isdir(args.img_path):
        img_list = os.listdir(args.img_path)
        print(img_list)
        for item in img_list:
            # build the full path of each image in the folder
            main(os.path.join(args.img_path, item))
    # single image
    else:
        main(args.img_path)
```

Step 4

Add the following two files to the models folder: gradcam.py and yolov5_object_detector.py.

![models folder layout](https://img-blog.csdnimg.cn/83c33e99f69d43a0a3c7016aec5f81e8.png)

gradcam.py:

```python
import time
import torch
import torch.nn.functional as F


def find_yolo_layer(model, layer_name):
    """Find yolov5 layer to calculate GradCAM and GradCAM++

    Args:
        model: yolov5 model.
        layer_name (str): the name of layer with its hierarchical information.

    Return:
        target_layer: found layer
    """
    hierarchy = layer_name.split('_')
    target_layer = model.model._modules[hierarchy[0]]
    for h in hierarchy[1:]:
        target_layer = target_layer._modules[h]
    return target_layer


class YOLOV5GradCAM:

    # initialization: locate the target layer and register hooks on it
    def __init__(self, model, layer_name, img_size=(640, 640)):
        self.model = model
        self.gradients = dict()
        self.activations = dict()

        def backward_hook(module, grad_input, grad_output):
            self.gradients['value'] = grad_output[0]
            return None

        def forward_hook(module, input, output):
            self.activations['value'] = output
            return None

        target_layer = find_yolo_layer(self.model, layer_name)
        # record the target layer's output on the forward pass and its gradient on the backward pass
        target_layer.register_forward_hook(forward_hook)
        target_layer.register_full_backward_hook(backward_hook)

        device = 'cuda' if next(self.model.model.parameters()).is_cuda else 'cpu'
        self.model(torch.zeros(1, 3, *img_size, device=device))

    def forward(self, input_img, class_idx=True):
        """
        Args:
            input_img: input image with shape of (1, 3, H, W)
        Return:
            mask: saliency map of the same spatial dimension with input
            logit: model output
            preds: The object predictions
        """
        saliency_maps = []
        b, c, h, w = input_img.size()
        preds, logits = self.model(input_img)
        for logit, cls, cls_name in zip(logits[0], preds[1][0], preds[2][0]):
            if class_idx:
                score = logit[cls]
            else:
                score = logit.max()
            self.model.zero_grad()
            tic = time.time()
            # backpropagate to get the gradients at the target layer
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']
            activations = self.activations['value']
            b, k, u, v = gradients.size()
            alpha = gradients.view(b, k, -1).mean(2)
            weights = alpha.view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.interpolate(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)
        return saliency_maps, logits, preds

    def __call__(self, input_img):
        return self.forward(input_img)


class YOLOV5GradCAMPP(YOLOV5GradCAM):
    def __init__(self, model, layer_name, img_size=(640, 640)):
        super(YOLOV5GradCAMPP, self).__init__(model, layer_name, img_size)

    def forward(self, input_img, class_idx=True):
        saliency_maps = []
        b, c, h, w = input_img.size()
        tic = time.time()
        preds, logits = self.model(input_img)
        print("[INFO] model-forward took: ", round(time.time() - tic, 4), 'seconds')
        for logit, cls, cls_name in zip(logits[0], preds[1][0], preds[2][0]):
            if class_idx:
                score = logit[cls]
            else:
                score = logit.max()
            self.model.zero_grad()
            tic = time.time()
            # backpropagate to get the gradients at the target layer
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']  # dS/dA
            activations = self.activations['value']  # A
            b, k, u, v = gradients.size()
            alpha_num = gradients.pow(2)
            alpha_denom = gradients.pow(2).mul(2) + \
                activations.mul(gradients.pow(3)).view(b, k, u * v).sum(-1, keepdim=True).view(b, k, 1, 1)
            # torch.where(condition, x, y): keep alpha_denom where it is non-zero, otherwise use ones
            alpha_denom = torch.where(alpha_denom != 0.0, alpha_denom, torch.ones_like(alpha_denom))
            alpha = alpha_num.div(alpha_denom + 1e-7)
            positive_gradients = F.relu(score.exp() * gradients)  # ReLU(dY/dA) == ReLU(exp(S)*dS/dA)
            weights = (alpha * positive_gradients).view(b, k, u * v).sum(-1).view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.interpolate(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)
        return saliency_maps, logits, preds
```
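To make the weighting above easier to follow, here is the core Grad-CAM computation from the class, isolated on dummy tensors (a sketch only; the shapes stand in for whatever the hooked layer actually produces):

```python
# Core of Grad-CAM on dummy tensors (b=batch, k=channels, u,v=spatial size of the target layer)
import torch
import torch.nn.functional as F

b, k, u, v = 1, 256, 80, 80
gradients = torch.randn(b, k, u, v)     # dScore/dActivation captured by the backward hook
activations = torch.randn(b, k, u, v)   # feature map captured by the forward hook

alpha = gradients.view(b, k, -1).mean(2)                    # global-average-pool the gradients
weights = alpha.view(b, k, 1, 1)                            # one weight per channel
cam = F.relu((weights * activations).sum(1, keepdim=True))  # weighted sum, keep positive evidence
cam = F.interpolate(cam, size=(640, 640), mode='bilinear', align_corners=False)
cam = (cam - cam.min()) / (cam.max() - cam.min())           # normalize to [0, 1] for the colormap
```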

yolov5_object_detector.py:

```python
import numpy as np
import torch
from models.experimental import attempt_load
from utils.general import xywh2xyxy
from utils.dataloaders import letterbox
import cv2
import time
import torchvision
import torch.nn as nn
from utils.metrics import box_iou


class YOLOV5TorchObjectDetector(nn.Module):
    def __init__(self,
                 model_weight,
                 device,
                 img_size,
                 names=None,
                 mode='eval',
                 confidence=0.45,
                 iou_thresh=0.45,
                 agnostic_nms=False):
        super(YOLOV5TorchObjectDetector, self).__init__()
        self.device = device
        self.model = None
        self.img_size = img_size
        self.mode = mode
        self.confidence = confidence
        self.iou_thresh = iou_thresh
        self.agnostic = agnostic_nms
        self.model = attempt_load(model_weight, inplace=False, fuse=False)
        self.model.requires_grad_(True)
        self.model.to(device)
        if self.mode == 'train':
            self.model.train()
        else:
            self.model.eval()
        # fetch the names
        if names is None:
            self.names = ['your dataset classname']
        else:
            self.names = names

        # preventing cold start
        img = torch.zeros((1, 3, *self.img_size), device=device)
        self.model(img)

    @staticmethod
    def non_max_suppression(prediction, logits, conf_thres=0.3, iou_thres=0.45, classes=None, agnostic=False,
                            multi_label=False, labels=(), max_det=300):
        """Runs Non-Maximum Suppression (NMS) on inference and logits results

        Returns:
            list of detections, on (n,6) tensor per image [xyxy, conf, cls] and pruned input logits (n, number-classes)
        """
        nc = prediction.shape[2] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Checks
        assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
        assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

        # Settings
        min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
        merge = False  # use merge-NMS

        t = time.time()
        output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
        logits_output = [torch.zeros((0, nc), device=logits.device)] * logits.shape[0]
        # logits_output = [torch.zeros((0, 80), device=logits.device)] * logits.shape[0]
        for xi, (x, log_) in enumerate(zip(prediction, logits)):  # image index, image inference
            # Apply constraints
            # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
            x = x[xc[xi]]  # confidence
            log_ = log_[xc[xi]]

            # Cat apriori labels if autolabelling
            if labels and len(labels[xi]):
                l = labels[xi]
                v = torch.zeros((len(l), nc + 5), device=x.device)
                v[:, :4] = l[:, 1:5]  # box
                v[:, 4] = 1.0  # conf
                v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
                x = torch.cat((x, v), 0)

            # If none remain process next image
            if not x.shape[0]:
                continue

            # Compute conf
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            box = xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
                x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
                log_ = log_[conf.view(-1) > conf_thres]

            # Filter by class
            if classes is not None:
                x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

            # Check shape
            n = x.shape[0]  # number of boxes
            if not n:  # no boxes
                continue
            elif n > max_nms:  # excess boxes
                x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

            # Batched NMS
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
            if i.shape[0] > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy

            output[xi] = x[i]
            logits_output[xi] = log_[i]
            assert log_[i].shape[0] == x[i].shape[0]
            if (time.time() - t) > time_limit:
                print(f'WARNING: NMS time limit {time_limit}s exceeded')
                break  # time limit exceeded

        return output, logits_output

    @staticmethod
    def yolo_resize(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
        return letterbox(img, new_shape=new_shape, color=color, auto=auto, scaleFill=scaleFill, scaleup=scaleup)

    def forward(self, img):
        prediction, logits, _ = self.model(img, augment=False)
        prediction, logits = self.non_max_suppression(prediction, logits, self.confidence, self.iou_thresh,
                                                      classes=None,
                                                      agnostic=self.agnostic)
        self.boxes, self.class_names, self.classes, self.confidences = [[[] for _ in range(img.shape[0])] for _ in
                                                                        range(4)]
        for i, det in enumerate(prediction):  # detections per image
            if len(det):
                for *xyxy, conf, cls in det:
                    # keep integer pixel coordinates
                    bbox = [int(b) for b in xyxy]
                    self.boxes[i].append(bbox)
                    self.confidences[i].append(round(conf.item(), 2))
                    cls = int(cls.item())
                    self.classes[i].append(cls)
                    if self.names is not None:
                        self.class_names[i].append(self.names[cls])
                    else:
                        self.class_names[i].append(cls)
        return [self.boxes, self.classes, self.class_names, self.confidences], logits

    def preprocessing(self, img):
        if len(img.shape) != 4:
            img = np.expand_dims(img, axis=0)
        im0 = img.astype(np.uint8)
        img = np.array([self.yolo_resize(im, new_shape=self.img_size)[0] for im in im0])
        img = img.transpose((0, 3, 1, 2))
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img).to(self.device)
        img = img / 255.0
        return img
```
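Once the Detect head has been modified as described in Step 5 below, the wrapper can be smoke-tested on its own. This is only a sketch: it assumes yolov5s.pt (auto-downloaded by attempt_load if missing) and data/images/bus.jpg are reachable from the repository root, and it uses placeholder class names:

```python
# Quick smoke test for the wrapper (run from the YOLOv5 root, after Step 5)
import cv2
from models.yolov5_object_detector import YOLOV5TorchObjectDetector

class_names = [f'class{i}' for i in range(80)]  # stand-in for the COCO names list in main_gradcam.py
det = YOLOV5TorchObjectDetector('yolov5s.pt', 'cpu', img_size=(640, 640), names=class_names)

img = cv2.imread('data/images/bus.jpg')        # BGR
torch_img = det.preprocessing(img[..., ::-1])  # BGR -> RGB, letterbox, shape (1, 3, H, W)
[boxes, classes, labels, confidences], logits = det(torch_img)
print(boxes[0], labels[0], confidences[0])     # detections for the first (and only) image
```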

Step 5

Modify models/yolo.py.

Specifically, edit the forward function of the Detect class so that, at inference time, it also returns the raw class logits that Grad-CAM backpropagates through. The four changes are marked modification 1 to modification 4 below:

```python
    def forward(self, x):
        z = []  # inference output
        logits_ = []  # modification 1: collect the raw class logits
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                logits = x[i][..., 5:]  # modification 2: raw (pre-sigmoid) class scores
                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))
                logits_.append(logits.view(bs, -1, self.no - 5))  # modification 3: flatten the logits for this scale

        # return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
        return x if self.training else (torch.cat(z, 1), torch.cat(logits_, 1), x)  # modification 4: also return logits
```
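A quick way to confirm the change took effect (a sketch; it builds yolov5s from its yaml with random weights, so only the shapes matter):

```python
# Shape check for the modified Detect output (run from the YOLOv5 root)
import torch
from models.yolo import DetectionModel

model = DetectionModel('models/yolov5s.yaml').eval()  # random weights are fine for a shape check
with torch.no_grad():
    pred, logits, raw = model(torch.zeros(1, 3, 640, 640))
print(pred.shape)    # expected: torch.Size([1, 25200, 85]) -> decoded boxes + obj + 80 class scores
print(logits.shape)  # expected: torch.Size([1, 25200, 80]) -> raw class logits kept for Grad-CAM
print(len(raw))      # expected: 3, one tensor per detection scale
```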

In case it is unclear how to apply these changes, the full modified yolo.py is given below.

yolo.py

```python
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
YOLO-specific modules

Usage:
    $ python models/yolo.py --cfg yolov5s.yaml
"""

import argparse
import contextlib
import os
import platform
import sys
from copy import deepcopy
from pathlib import Path

FILE = Path(__file__).resolve()
ROOT = FILE.parents[1]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
if platform.system() != 'Windows':
    ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from models.common import *
from models.experimental import *
from utils.autoanchor import check_anchor_order
from utils.general import LOGGER, check_version, check_yaml, make_divisible, print_args
from utils.plots import feature_visualization
from utils.torch_utils import (fuse_conv_and_bn, initialize_weights, model_info, profile, scale_img, select_device,
                               time_sync)

try:
    import thop  # for FLOPs computation
except ImportError:
    thop = None

class Detect(nn.Module):
    # YOLOv5 Detect head for detection models
    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.empty(0) for _ in range(self.nl)]  # init grid
        self.anchor_grid = [torch.empty(0) for _ in range(self.nl)]  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use inplace ops (e.g. slice assignment)

    def forward(self, x):
        z = []  # inference output
        logits_ = []  # modification 1: collect the raw class logits
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                logits = x[i][..., 5:]  # modification 2: raw (pre-sigmoid) class scores
                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))
                logits_.append(logits.view(bs, -1, self.no - 5))  # modification 3: flatten the logits for this scale

        # return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
        return x if self.training else (torch.cat(z, 1), torch.cat(logits_, 1), x)  # modification 4: also return logits

    def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')):
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x)  # torch>=0.7 compatibility
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid

class Segment(Detect):
    # YOLOv5 Segment head for segmentation models
    def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True):
        super().__init__(nc, anchors, ch, inplace)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.no = 5 + nc + self.nm  # number of outputs per anchor
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos
        self.detect = Detect.forward

    def forward(self, x):
        p = self.proto(x[0])
        x = self.detect(self, x)
        return (x, p) if self.training else (x[0], p) if self.export else (x[0], p, x[1])


class BaseModel(nn.Module):
    # YOLOv5 base model
    def forward(self, x, profile=False, visualize=False):
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_once(self, x, profile=False, visualize=False):
        y, dt = [], []  # outputs
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            if profile:
                self._profile_one_layer(m, x, dt)
            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
        return x

    def _profile_one_layer(self, m, x, dt):
        c = m == self.model[-1]  # is final layer, copy input as inplace fix
        o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPs
        t = time_sync()
        for _ in range(10):
            m(x.copy() if c else x)
        dt.append((time_sync() - t) * 100)
        if m == self.model[0]:
            LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s}  module")
        LOGGER.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f}  {m.type}')
        if c:
            LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s}  Total")

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
        LOGGER.info('Fusing layers... ')
        for m in self.model.modules():
            if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
                delattr(m, 'bn')  # remove batchnorm
                m.forward = m.forward_fuse  # update forward
        self.info()
        return self

    def info(self, verbose=False, img_size=640):  # print model information
        model_info(self, verbose, img_size)

    def _apply(self, fn):
        # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
        self = super()._apply(fn)
        m = self.model[-1]  # Detect()
        if isinstance(m, (Detect, Segment)):
            m.stride = fn(m.stride)
            m.grid = list(map(fn, m.grid))
            if isinstance(m.anchor_grid, list):
                m.anchor_grid = list(map(fn, m.anchor_grid))
        return self

class DetectionModel(BaseModel):
    # YOLOv5 detection model
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super().__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg, encoding='ascii', errors='ignore') as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, (Detect, Segment)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()
        LOGGER.info('')

    def forward(self, x, augment=False, profile=False, visualize=False):
        if augment:
            return self._forward_augment(x)  # augmented inference, None
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_augment(self, x):
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        f = [None, 3, None]  # flips (2-ud, 3-lr)
        y = []  # outputs
        for si, fi in zip(s, f):
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self._forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)
        y = self._clip_augmented(y)  # clip augmented tails
        return torch.cat(y, 1), None  # augmented inference, train

    def _descale_pred(self, p, flips, scale, img_size):
        # de-scale predictions following augmented inference (inverse operation)
        if self.inplace:
            p[..., :4] /= scale  # de-scale
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p

    def _clip_augmented(self, y):
        # Clip YOLOv5 augmented inference tails
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        g = sum(4 ** x for x in range(nl))  # grid points
        e = 1  # exclude layer count
        i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e))  # indices
        y[0] = y[0][:, :-i]  # large
        i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # indices
        y[-1] = y[-1][:, i:]  # small
        return y

    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:5 + m.nc] += math.log(0.6 / (m.nc - 0.99999)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)


Model = DetectionModel  # retain YOLOv5 'Model' class for backwards compatibility

class SegmentationModel(DetectionModel):
    # YOLOv5 segmentation model
    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None, anchors=None):
        super().__init__(cfg, ch, nc, anchors)


class ClassificationModel(BaseModel):
    # YOLOv5 classification model
    def __init__(self, cfg=None, model=None, nc=1000, cutoff=10):  # yaml, model, number of classes, cutoff index
        super().__init__()
        self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)

    def _from_detection_model(self, model, nc=1000, cutoff=10):
        # Create a YOLOv5 classification model from a YOLOv5 detection model
        if isinstance(model, DetectMultiBackend):
            model = model.model  # unwrap DetectMultiBackend
        model.model = model.model[:cutoff]  # backbone
        m = model.model[-1]  # last layer
        ch = m.conv.in_channels if hasattr(m, 'conv') else m.cv1.conv.in_channels  # ch into module
        c = Classify(ch, nc)  # Classify()
        c.i, c.f, c.type = m.i, m.f, 'models.common.Classify'  # index, from, type
        model.model[-1] = c  # replace
        self.model = model.model
        self.stride = model.stride
        self.save = []
        self.nc = nc

    def _from_yaml(self, cfg):
        # Create a YOLOv5 classification model from a *.yaml file
        self.model = None

def parse_model(d, ch):  # model_dict, input_channels(3)
    # Parse a YOLOv5 model.yaml dictionary
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10}  {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
    if act:
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            with contextlib.suppress(NameError):
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in {
                Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        # TODO: channel, gw, gd
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, 8)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f}  {t:<40}{str(args):<30}')  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--profile', action='store_true', help='profile model speed')
    parser.add_argument('--line-profile', action='store_true', help='profile model speed layer by layer')
    parser.add_argument('--test', action='store_true', help='test all yolo*.yaml')
    opt = parser.parse_args()
    opt.cfg = check_yaml(opt.cfg)  # check YAML
    print_args(vars(opt))
    device = select_device(opt.device)

    # Create model
    im = torch.rand(opt.batch_size, 3, 640, 640).to(device)
    model = Model(opt.cfg).to(device)

    # Options
    if opt.line_profile:  # profile layer by layer
        model(im, profile=True)

    elif opt.profile:  # profile forward-backward
        results = profile(input=im, ops=[model], n=3)

    elif opt.test:  # test all models
        for cfg in Path(ROOT / 'models').rglob('yolo*.yaml'):
            try:
                _ = Model(cfg)
            except Exception as e:
                print(f'Error in {cfg}: {e}')

    else:  # report fused model summary
        model.fuse()
```

Step 6

Run main_gradcam.py, for example `python main_gradcam.py --model-path yolov5s.pt --img-path data/images/bus.jpg --method gradcam --device cuda`.

The argument list can be adjusted to suit your own setup:

```python
# Arguments
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', type=str, default="yolov5s.pt", help='Path to the model')
parser.add_argument('--img-path', type=str, default='data/images/bus.jpg', help='input image path')
parser.add_argument('--output-dir', type=str, default='runs/result17', help='output dir')
parser.add_argument('--img-size', type=int, default=640, help="input image size")
parser.add_argument('--target-layer', type=str, default='model_17_cv3_act',
                    help='The layer hierarchical address to which gradcam will be applied,'
                         ' the names should be separated by underline')
parser.add_argument('--method', type=str, default='gradcam', help='gradcam method')
parser.add_argument('--device', type=str, default='cuda', help='cuda or cpu')
parser.add_argument('--no_text_box', action='store_true',
                    help='do not show label and box on the heatmap')
args = parser.parse_args()
```

Done.

(final heatmap outputs)


