1. Training the classifier with ROIs
Finally we come to ROI generation. Inside each training epoch loop, the core code is as follows:
# The data generator yields np.copy(x_img), [np.copy(y_rpn_cls), np.copy(y_rpn_regr)], img_data_aug:
# the RPN is trained on the image and the best-matching anchors that were found, not on the GT boxes themselves.
X, Y, img_data = next(data_gen_train)

# Returns a scalar training loss (single output, no metrics) or a list of scalars (multiple outputs and/or metrics).
loss_rpn = model_rpn.train_on_batch(X, Y)
write_log(callback, ['rpn_cls_loss', 'rpn_reg_loss'], loss_rpn, train_step)

P_rpn = model_rpn.predict_on_batch(X)  # the RPN's predictions, i.e. the predicted [y_rpn_cls, y_rpn_regr]

# Select the ROIs: the legal boxes on the feature map that are most likely to contain an object,
# after removing heavily overlapping boxes; shape (n, 4).
R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(), use_regr=True, overlap_thresh=0.7, max_boxes=300)

# note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format
# Inputs: img_data (the image's annotation info) and R (the candidate ROIs).
# Outputs:
#   X2:   the ROIs whose best IoU with a ground-truth box is at least C.classifier_min_overlap
#         (named X2 to distinguish it from the image batch X above)
#   Y1:   one-hot class labels, shape (1, n, 21)
#   Y2:   [np.array(y_class_regr_label), np.array(y_class_regr_coords)] concatenated, i.e. the
#         per-class regression labels and regression targets
#   IouS: for debugging only, not used
X2, Y1, Y2, IouS = roi_helpers.calc_iou(R, img_data, C, class_mapping)

if X2 is None:
    rpn_accuracy_rpn_monitor.append(0)
    rpn_accuracy_for_epoch.append(0)
    continue

# sampling positive/negative samples
neg_samples = np.where(Y1[0, :, -1] == 1)  # background: the last entry of the one-hot label is the background class
pos_samples = np.where(Y1[0, :, -1] == 0)  # non-background

if len(neg_samples) > 0:
    neg_samples = neg_samples[0]  # np.where returns a tuple; take the index array
else:
    neg_samples = []

if len(pos_samples) > 0:
    pos_samples = pos_samples[0]
else:
    pos_samples = []

rpn_accuracy_rpn_monitor.append(len(pos_samples))
rpn_accuracy_for_epoch.append(len(pos_samples))

if C.num_rois > 1:
    # Select C.num_rois samples in total, roughly half positive and half negative.
    if len(pos_samples) < C.num_rois // 2:
        selected_pos_samples = pos_samples.tolist()
    else:
        selected_pos_samples = np.random.choice(pos_samples, C.num_rois // 2, replace=False).tolist()
    try:
        selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=False).tolist()
    except:
        # not enough negatives: sample with replacement instead
        selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=True).tolist()

    sel_samples = selected_pos_samples + selected_neg_samples
else:
    # in the extreme case where num_rois = 1, we pick a random pos or neg sample
    selected_pos_samples = pos_samples.tolist()
    selected_neg_samples = neg_samples.tolist()
    if np.random.randint(0, 2):
        sel_samples = random.choice(neg_samples)
    else:
        sel_samples = random.choice(pos_samples)

# X is used to compute the shared feature map; X2 holds the selected ROIs.
loss_class = model_classifier.train_on_batch([X, X2[:, sel_samples, :]],
                                             [Y1[:, sel_samples, :], Y2[:, sel_samples, :]])

write_log(callback, ['detection_cls_loss', 'detection_reg_loss', 'detection_acc'], loss_class, train_step)

train_step += 1
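To make the positive/negative sampling step above concrete, here is a minimal self-contained sketch; the toy Y1 array and the hypothetical num_rois value stand in for the real tensors and C.num_rois, so this is just an illustration of the indexing, not the training code itself.

import numpy as np

# Toy Y1: one-hot class labels for 6 ROIs; the last column marks background.
num_rois = 4
Y1 = np.zeros((1, 6, 3))
Y1[0, [0, 3], 2] = 1          # ROIs 0 and 3 are background
Y1[0, [1, 2, 4, 5], 0] = 1    # the rest are positives of class 0

neg_samples = np.where(Y1[0, :, -1] == 1)[0]   # -> [0, 3]
pos_samples = np.where(Y1[0, :, -1] == 0)[0]   # -> [1, 2, 4, 5]

# Take up to num_rois//2 positives and fill the rest with negatives
# (sampling with replacement only if there are not enough negatives).
selected_pos = np.random.choice(pos_samples, min(len(pos_samples), num_rois // 2),
                                replace=False).tolist()
n_neg = num_rois - len(selected_pos)
selected_neg = np.random.choice(neg_samples, n_neg,
                                replace=len(neg_samples) < n_neg).tolist()
sel_samples = selected_pos + selected_neg
print(sel_samples)  # e.g. [4, 1, 0, 3]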
There are two key functions here, rpn_to_roi and calc_iou; each is annotated below. See also http://geyao1995.com/Faster_rcnn代码笔记_test_2_roi_helpers/

2. rpn_to_roi
2.1 rpn_to_roi: code and comments
# Input:
#   rpn_layer:  the RPN's predicted objectness scores, one per anchor per cell,
#               e.g. (1, 37, 37, 9) on a 37x37 VGG feature map (the training target
#               y_rpn_cls carries 9 extra "valid" channels, but only the scores are predicted here)
#   regr_layer: the RPN's predicted regression values, 4 per anchor, e.g. (1, 37, 37, 36)
#               (the training target y_rpn_regr likewise carries an extra copy of the overlap flags)
# Output:
#   result: of all the legal boxes, those most likely to contain an object,
#           with heavily overlapping boxes removed; shape (n, 4)
def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300, overlap_thresh=0.9):

    regr_layer = regr_layer / C.std_scaling  # the targets were multiplied by std_scaling when generated, so divide it back out

    anchor_sizes = C.anchor_box_scales   # [128, 256, 512]
    anchor_ratios = C.anchor_box_ratios  # [[1, 1], [1, 2], [2, 1]]

    assert rpn_layer.shape[0] == 1

    if dim_ordering == 'th':
        (rows, cols) = rpn_layer.shape[2:]
    elif dim_ordering == 'tf':
        (rows, cols) = rpn_layer.shape[1:3]

    curr_layer = 0  # index of the current anchor configuration
    if dim_ordering == 'tf':
        A = np.zeros((4, rpn_layer.shape[1], rpn_layer.shape[2], rpn_layer.shape[3]))  # A: (4, rows, cols, 9)
    elif dim_ordering == 'th':
        A = np.zeros((4, rpn_layer.shape[2], rpn_layer.shape[3], rpn_layer.shape[1]))

    for anchor_size in anchor_sizes:        # [128, 256, 512]; take 128 as an example
        for anchor_ratio in anchor_ratios:  # [1:1], [1:2], [2:1]; take 1:2 as an example

            anchor_x = (anchor_size * anchor_ratio[0]) / C.rpn_stride  # anchor width on the feature map, e.g. (128*1)/16 = 8
            anchor_y = (anchor_size * anchor_ratio[1]) / C.rpn_stride  # anchor height on the feature map, e.g. (128*2)/16 = 16
            if dim_ordering == 'th':
                regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :]
            else:
                regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4]  # regression values for this anchor, shape (rows, cols, 4)
                regr = np.transpose(regr, (2, 0, 1))  # (rows, cols, 4) -> (4, rows, cols)

            X, Y = np.meshgrid(np.arange(cols), np.arange(rows))  # grid of cell coordinates; X.shape = Y.shape = (37, 37), one coordinate pair per cell

            # The steps below operate on all 37*37 cells at once for the current anchor configuration.
            A[0, :, :, curr_layer] = X - anchor_x/2  # x1 of an anchor centred on each grid point
            A[1, :, :, curr_layer] = Y - anchor_y/2  # y1 of an anchor centred on each grid point
            A[2, :, :, curr_layer] = anchor_x        # anchor width
            A[3, :, :, curr_layer] = anchor_y        # anchor height

            if use_regr:  # apply the regression transform to the anchors
                A[:, :, :, curr_layer] = apply_regr_np(A[:, :, :, curr_layer], regr)

            # Clamp the width and height to at least 1.
            A[2, :, :, curr_layer] = np.maximum(1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.maximum(1, A[3, :, :, curr_layer])
            # Convert (x1, y1, w, h) to (x1, y1, x2, y2).
            A[2, :, :, curr_layer] += A[0, :, :, curr_layer]
            A[3, :, :, curr_layer] += A[1, :, :, curr_layer]

            # Clip the anchors so they do not extend outside the feature map.
            A[0, :, :, curr_layer] = np.maximum(0, A[0, :, :, curr_layer])
            A[1, :, :, curr_layer] = np.maximum(0, A[1, :, :, curr_layer])
            A[2, :, :, curr_layer] = np.minimum(cols-1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.minimum(rows-1, A[3, :, :, curr_layer])

            curr_layer += 1

    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))  # (4, rows, cols, 9) -> (4, 9, rows, cols) -> (4, 9*rows*cols) -> (9*37*37, 4)
    all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1))                   # (1, 37, 37, 9) -> (1, 9, 37, 37) -> (9*37*37,)

    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]

    idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0))  # indices of boxes with illegal coordinates

    all_boxes = np.delete(all_boxes, idxs, 0)  # drop those boxes
    all_probs = np.delete(all_probs, idxs, 0)  # and their objectness scores

    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]
    # result: the legal boxes most likely to contain an object, heavily overlapping boxes removed; shape (n, 4)
    return result
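To see what the anchor grid looks like before and after the (x1, y1, w, h) to (x1, y1, x2, y2) conversion, here is a small standalone sketch of the meshgrid construction for one anchor configuration. The anchor size, ratio and stride are the usual VGG values from above; the 4x4 feature map is an assumption chosen only to keep the printout small.

import numpy as np

rows, cols = 4, 4            # tiny feature map, for illustration only
rpn_stride = 16
anchor_size, anchor_ratio = 128, (1, 2)

anchor_x = (anchor_size * anchor_ratio[0]) / rpn_stride   # 8  (width on the feature map)
anchor_y = (anchor_size * anchor_ratio[1]) / rpn_stride   # 16 (height on the feature map)

X, Y = np.meshgrid(np.arange(cols), np.arange(rows))      # per-cell x / y coordinates

A = np.zeros((4, rows, cols))
A[0] = X - anchor_x / 2      # x1: grid point shifted left by half the width
A[1] = Y - anchor_y / 2      # y1: grid point shifted up by half the height
A[2] = anchor_x              # width;  A[2] += A[0] below turns it into x2
A[3] = anchor_y              # height; A[3] += A[1] below turns it into y2

A[2] += A[0]
A[3] += A[1]
print(A[:, 0, 0])            # anchor at cell (0, 0): [-4. -8.  4.  8.], i.e. (x1, y1, x2, y2)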
2.2 apply_regr_np

# Input:
#   X: (4, 37, 37), the anchor coordinates (x, y, w, h) on the current feature map
#   T: (4, 37, 37), the corresponding regression values; these come from the trained RPN,
#      not from the regression targets computed when the anchors were generated
# Output: the transformed anchors, i.e. the anchors believed to lie closest to the GT boxes;
#         they no longer have the default anchor size and position but have been shifted and scaled
def apply_regr_np(X, T):
    try:
        x = X[0, :, :]
        y = X[1, :, :]
        w = X[2, :, :]
        h = X[3, :, :]

        tx = T[0, :, :]
        ty = T[1, :, :]
        tw = T[2, :, :]
        th = T[3, :, :]

        cx = x + w/2.
        cy = y + h/2.
        cx1 = tx * w + cx
        cy1 = ty * h + cy

        w1 = np.exp(tw.astype(np.float64)) * w
        h1 = np.exp(th.astype(np.float64)) * h
        x1 = cx1 - w1/2.
        y1 = cy1 - h1/2.

        x1 = np.round(x1)
        y1 = np.round(y1)
        w1 = np.round(w1)
        h1 = np.round(h1)
        return np.stack([x1, y1, w1, h1])
    except Exception as e:
        print(e)
        return X
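As a quick sanity check of the transform above, the same arithmetic on a single made-up box (plain scalars instead of (4, 37, 37) arrays, values chosen only for illustration):

import numpy as np

x, y, w, h = 10.0, 10.0, 8.0, 16.0                 # anchor in (x1, y1, w, h)
tx, ty, tw, th = 0.25, -0.125, np.log(2.0), 0.0    # regression values

cx, cy = x + w / 2.0, y + h / 2.0                  # anchor centre: (14, 18)
cx1, cy1 = tx * w + cx, ty * h + cy                # shifted centre: (16, 16)
w1, h1 = np.exp(tw) * w, np.exp(th) * h            # scaled size: (16, 16)
x1, y1 = cx1 - w1 / 2.0, cy1 - h1 / 2.0            # back to the top-left corner
print(x1, y1, w1, h1)                              # 8.0 8.0 16.0 16.0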
2.3 non_max_suppression_fast

# Input:
#   boxes: all legal boxes
#   probs: the predicted objectness probability of each box
# Output:
#   boxes: the boxes most likely to contain an object, with heavily overlapping boxes removed
#   probs: the corresponding probabilities
def non_max_suppression_fast(boxes, probs, overlap_thresh=0.9, max_boxes=300):
    # code used from here: http://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []

    # Grab the coordinates of the bounding boxes; x1, y1, x2, y2 are arrays covering all legal boxes.
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    # Given two array_like objects, assert_array_less checks that their shapes are equal and that every
    # element of the first is strictly smaller than the corresponding element of the second.
    # These asserts therefore guarantee x1 < x2 and y1 < y2, i.e. that every box is legal.
    np.testing.assert_array_less(x1, x2)
    np.testing.assert_array_less(y1, y2)

    # If the bounding box coordinates are integers, convert them to floats --
    # this is important since we'll be doing a bunch of divisions.
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    # initialize the list of picked indexes
    pick = []

    # calculate the area of every box
    area = (x2 - x1) * (y2 - y1)

    # Sort the boxes by probability (np.argsort returns the indices that would sort the array in
    # ascending order); picking starts from the box with the highest probability of being an object.
    idxs = np.argsort(probs)

    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list and add the
        # index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        # Find the intersection with every remaining box. np.maximum compares x1[i] element-wise with
        # all the other x1 values, so the intersection's left edge is whichever is larger, and so on.
        xx1_int = np.maximum(x1[i], x1[idxs[:last]])  # left edge
        yy1_int = np.maximum(y1[i], y1[idxs[:last]])  # top edge
        xx2_int = np.minimum(x2[i], x2[idxs[:last]])  # right edge
        yy2_int = np.minimum(y2[i], y2[idxs[:last]])  # bottom edge

        ww_int = np.maximum(0, xx2_int - xx1_int)  # intersection widths (an array)
        hh_int = np.maximum(0, yy2_int - yy1_int)

        area_int = ww_int * hh_int  # intersection areas (also an array)

        # find the union
        area_union = area[i] + area[idxs[:last]] - area_int

        # compute the ratio of overlap (IoU)
        overlap = area_int / (area_union + 1e-6)

        # Delete the picked index and every remaining index whose box overlaps it too heavily.
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlap_thresh)[0])))

        # Keep at most max_boxes boxes (or stop once every box has been processed).
        if len(pick) >= max_boxes:
            break

    # return only the bounding boxes that were picked, using the integer data type
    boxes = boxes[pick].astype("int")
    probs = probs[pick]
    return boxes, probs
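A toy usage example, assuming non_max_suppression_fast lives in keras_frcnn.roi_helpers as the training loop suggests: three boxes, where the second heavily overlaps the first, so only the first and third should survive.

import numpy as np
from keras_frcnn import roi_helpers

boxes = np.array([[0, 0, 10, 10],     # high-score box
                  [1, 1, 10, 10],     # overlaps box 0 with IoU ~0.81 -> suppressed
                  [20, 20, 30, 30]])  # far away -> kept
probs = np.array([0.9, 0.8, 0.7])

kept_boxes, kept_probs = roi_helpers.non_max_suppression_fast(boxes, probs,
                                                              overlap_thresh=0.7, max_boxes=300)
print(kept_boxes)   # boxes 0 and 2 survive
print(kept_probs)   # [0.9 0.7]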
3. calc_iou

# Input:
#   img_data: the image's annotation info
#   R: candidate ROIs, shape (n, 4), in feature-map coordinates, 4 -> (x1, y1, x2, y2)
# Output:
#   X:  the ROIs whose best IoU with a ground-truth box is at least C.classifier_min_overlap
#   Y1: one-hot class labels, shape (1, n, 21)
#   Y2: [np.array(y_class_regr_label), np.array(y_class_regr_coords)] concatenated, i.e. the
#       per-class regression labels and regression targets
#   IoUs: for debugging only, not used
def calc_iou(R, img_data, C, class_mapping):
    '''
    all_img_data[0] = {'width': 500, 'height': 500,
        'bboxes': [{'y2': 500, 'y1': 27, 'x2': 183, 'x1': 20, 'class': 'person', 'difficult': False},
                   {'y2': 500, 'y1': 2, 'x2': 249, 'x1': 112, 'class': 'person', 'difficult': False},
                   {'y2': 490, 'y1': 233, 'x2': 376, 'x1': 246, 'class': 'person', 'difficult': False},
                   {'y2': 468, 'y1': 319, 'x2': 356, 'x1': 231, 'class': 'chair', 'difficult': False},
                   {'y2': 450, 'y1': 314, 'x2': 58, 'x1': 1, 'class': 'chair', 'difficult': True}],
        'imageset': 'test',
        'filepath': './datasets/VOC2007/JPEGImages/000910.jpg'}
    '''
    bboxes = img_data['bboxes']
    (width, height) = (img_data['width'], img_data['height'])

    # get image dimensions for resizing
    (resized_width, resized_height) = data_generators.get_new_img_size(width, height, C.im_size)

    gta = np.zeros((len(bboxes), 4))  # ground-truth box coordinates, e.g. 5x4

    for bbox_num, bbox in enumerate(bboxes):
        # Get the GT box coordinates, rescaled to the resized image and mapped onto the feature map;
        # note gta is stored in (x1, x2, y1, y2) order.
        gta[bbox_num, 0] = int(round(bbox['x1'] * (resized_width / float(width)) / C.rpn_stride))  # e.g. VGG: 600/16 = 37
        gta[bbox_num, 1] = int(round(bbox['x2'] * (resized_width / float(width)) / C.rpn_stride))
        gta[bbox_num, 2] = int(round(bbox['y1'] * (resized_height / float(height)) / C.rpn_stride))
        gta[bbox_num, 3] = int(round(bbox['y2'] * (resized_height / float(height)) / C.rpn_stride))

    x_roi = []        # the ROIs that pass the IoU threshold
    y_class_num = []  # the corresponding class labels
    y_class_regr_coords = []
    y_class_regr_label = []
    IoUs = []  # for debugging only

    for ix in range(R.shape[0]):  # loop over every ROI
        (x1, y1, x2, y2) = R[ix, :]
        x1 = int(round(x1))
        y1 = int(round(y1))
        x2 = int(round(x2))
        y2 = int(round(y2))

        best_iou = 0.0
        best_bbox = -1
        # Compute the IoU between the current ROI and every GT box on the feature map.
        for bbox_num in range(len(bboxes)):
            curr_iou = data_generators.iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]],
                                           [x1, y1, x2, y2])
            if curr_iou > best_iou:
                best_iou = curr_iou
                best_bbox = bbox_num

        if best_iou < C.classifier_min_overlap:  # e.g. 0.3; ROIs below this IoU are ignored
            continue
        else:
            w = x2 - x1
            h = y2 - y1
            x_roi.append([x1, y1, w, h])  # convert the ROI from (x1, y1, x2, y2) to (x, y, w, h)
            IoUs.append(best_iou)

            if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
                # Best IoU between 0.3 and 0.7: treat the ROI as background (a hard negative example).
                cls_name = 'bg'
            elif C.classifier_max_overlap <= best_iou:
                # Best IoU of at least 0.7: take the class of the matched GT box.
                cls_name = bboxes[best_bbox]['class']
                cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0
                cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0

                cx = x1 + w / 2.0
                cy = y1 + h / 2.0

                # compute the regression targets
                tx = (cxg - cx) / float(w)
                ty = (cyg - cy) / float(h)
                tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w))
                th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h))
            else:
                print('roi = {}'.format(best_iou))
                raise RuntimeError

        class_num = class_mapping[cls_name]
        class_label = len(class_mapping) * [0]  # e.g. 3*[0] -> [0, 0, 0]; here class_label has length 21
        class_label[class_num] = 1
        y_class_num.append(copy.deepcopy(class_label))
        coords = [0] * 4 * (len(class_mapping) - 1)  # temporary storage for the regression coordinates
        labels = [0] * 4 * (len(class_mapping) - 1)  # temporary storage for the regression labels
        if cls_name != 'bg':
            label_pos = 4 * class_num  # each class owns a 4-value slot for its coordinates
            sx, sy, sw, sh = C.classifier_regr_std
            # Scale the targets by classifier_regr_std, e.g. [8.0, 8.0, 4.0, 4.0]. This keeps the
            # regression loss at a comparable magnitude; at test time the predicted values are divided
            # by the same constants before being applied.
            coords[label_pos:4+label_pos] = [sx*tx, sy*ty, sw*tw, sh*th]
            labels[label_pos:4+label_pos] = [1, 1, 1, 1]  # mark this class's slot as active
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))
        else:
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))

    if len(x_roi) == 0:
        return None, None, None, None

    X = np.array(x_roi)
    Y1 = np.array(y_class_num)  # (n, 21)
    Y2 = np.concatenate([np.array(y_class_regr_label), np.array(y_class_regr_coords)], axis=1)

    # X, Y1 and Y2 each gain a leading batch dimension via expand_dims on return.
    return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0), IoUs
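Finally, a small numeric example of the regression targets tx, ty, tw, th computed above for one ROI/GT pair; the feature-map coordinates are made up for illustration, and the last step only mirrors how calc_iou scales the targets by classifier_regr_std.

import numpy as np

x1, y1, x2, y2 = 10, 10, 18, 26        # ROI -> w = 8, h = 16
gx1, gx2, gy1, gy2 = 12, 20, 12, 28    # matched ground-truth box (same size, shifted)

w, h = x2 - x1, y2 - y1
cx, cy = x1 + w / 2.0, y1 + h / 2.0              # ROI centre: (14, 18)
cxg, cyg = (gx1 + gx2) / 2.0, (gy1 + gy2) / 2.0  # GT centre:  (16, 20)

tx = (cxg - cx) / float(w)           # 0.25
ty = (cyg - cy) / float(h)           # 0.125
tw = np.log((gx2 - gx1) / float(w))  # log(1) = 0
th = np.log((gy2 - gy1) / float(h))  # log(1) = 0

sx, sy, sw, sh = 8.0, 8.0, 4.0, 4.0  # classifier_regr_std, as in the config above
print([sx * tx, sy * ty, sw * tw, sh * th])  # the values written into this class's slot of coords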