AVOD_源碼記錄
Table of Contents
AVOD代碼框架
主要分爲以下幾個部分:
- 預生成數據
- Train
- Evaluate+Infer
代碼細節
預生成數據
用於生成RPN網絡的輸入數據:包含各類別經聚類得到的anchor大小信息,以及每個sample所生成的anchor信息
調用鏈
base_dir = avod/
config = avod/avod/configs/mb_preprocessing/rpn_cars(cyclists,pedestrians,people).config
主要的相關模塊調用:
scripts/preprocessing/gen_mini_batches.py->avod/builders/dataset_builder.py(build_kitti_dataset)->avod/datasets/kitti/kitti_dataset.py(KittiDataset)->avod/datasets/kitti/kitti_utils.py(KittiUtils)->avod/core/mini_batch_utils.py(MiniBatchUtils.preprocess_rpn_mini_batches)->avod/core/mini_batch_preprocessor.py(MiniBatchPreprocessor.preprocess)->avod/core/anchor_generator/grid_anchor_3d_generator.py(GridAnchor3dGenerator.generate)
核心部分
-
數據前處理:mini_batch Anchor生成
AVOD數據前處理gen_mini_batches包括兩個部分:生成各類別size的聚類(cluster)結果;利用聚類結果生成各類別的Anchor信息,作爲RPN的輸入數據
Anchor信息具體爲:[max_gt_2d_iou, max_gt_3d_iou, (6 x offsets), class_index],anchor對應的gt_iou(2d和3d),anchor偏移值,對應類的index
具體步驟爲:
- 先生成anchor_stride(默認爲0.5)的3d anchor
- 生成voxel 2d圖,進行empty-anchor的過濾
- anchors與gt進行iou的計算,確定與生成的anchor iou最高的類,更新offsets與class_index
核心代碼如下:
# mini_batch_preprocessor.py:49
def preprocess(self, indices):
    """Preprocess anchor info for the selected samples and save it to files.

    For every sample this generates 3D anchors for each dataset class,
    computes an empty-anchor filter from a sliced 2D voxel grid, computes
    the anchor info against the filtered ground truth and saves the result.
    Samples that were already processed are skipped, and samples without
    any ground truth of the dataset classes get an empty output file.

    Args:
        indices (int array): sample indices to process.
            If None, processes all samples
    """
    # Anchor stride for each class (0.5 by default)
    anchor_strides = self._anchor_strides

    dataset = self._dataset
    dataset_utils = self._dataset.kitti_utils
    classes_name = dataset.classes_name

    # Make folder if it doesn't exist yet
    output_dir = self.mini_batch_utils.get_file_path(classes_name,
                                                     anchor_strides,
                                                     sample_name=None)
    os.makedirs(output_dir, exist_ok=True)

    # Get clusters for class; the cluster sizes are used as anchor sizes
    all_clusters_sizes, _ = dataset.get_cluster_info()

    # Initialize the 3D anchor generator
    anchor_generator = grid_anchor_3d_generator.GridAnchor3dGenerator()

    # Load indices of data_split
    all_samples = dataset.sample_list

    if indices is None:
        indices = np.arange(len(all_samples))
    num_samples = len(indices)

    # For each image in the dataset, save info on the anchors
    for sample_idx in indices:
        # Get image name for given cluster
        sample_name = all_samples[sample_idx].name
        img_idx = int(sample_name)

        # Check for existing files and skip to the next
        if self._check_for_existing(classes_name, anchor_strides,
                                    sample_name):
            # The message has only two placeholders, so only two values
            # are passed (the original passed an unused third one).
            print("{} / {}: Sample already preprocessed".format(
                sample_idx + 1, num_samples))
            continue

        # Get ground truth and filter based on difficulty
        ground_truth_list = obj_utils.read_labels(dataset.label_dir,
                                                  img_idx)

        # Filter objects to dataset classes
        filtered_gt_list = dataset_utils.filter_labels(ground_truth_list)
        filtered_gt_list = np.asarray(filtered_gt_list)

        # Filtering by class has no valid ground truth, skip this image
        if len(filtered_gt_list) == 0:
            print("{} / {} No {}s for sample {} "
                  "(Ground Truth Filter)".format(
                      sample_idx + 1, num_samples,
                      classes_name, sample_name))

            # Output an empty file and move on to the next image.
            self._save_to_file(classes_name, anchor_strides, sample_name)
            continue

        # Get ground plane
        ground_plane = obj_utils.get_road_plane(img_idx,
                                                dataset.planes_dir)

        # Only the image size is needed; use a context manager so the
        # underlying file handle is released immediately instead of
        # leaking one open file per processed sample.
        with Image.open(dataset.get_rgb_image_path(sample_name)) as image:
            image_shape = [image.size[1], image.size[0]]

        # Generate sliced 2D voxel grid for filtering
        # (per the original note, only the BEV area inside the image view
        # is kept)
        vx_grid_2d = dataset_utils.create_sliced_voxel_grid_2d(
            sample_name,
            source=dataset.bev_source,
            image_shape=image_shape)

        # List for merging all anchors
        all_anchor_boxes_3d = []

        # Create anchors for each class, using that class's cluster
        # sizes and stride, placed on the ground plane
        for class_idx in range(len(dataset.classes)):
            grid_anchor_boxes_3d = anchor_generator.generate(
                area_3d=self._area_extents,
                anchor_3d_sizes=all_clusters_sizes[class_idx],
                anchor_stride=anchor_strides[class_idx],
                ground_plane=ground_plane)

            all_anchor_boxes_3d.extend(grid_anchor_boxes_3d)

        # Filter empty anchors
        all_anchor_boxes_3d = np.asarray(all_anchor_boxes_3d)
        anchors = box_3d_encoder.box_3d_to_anchor(all_anchor_boxes_3d)
        empty_anchor_filter = anchor_filter.get_empty_anchor_filter_2d(
            anchors, vx_grid_2d, self._density_threshold)

        # Calculate anchor info (IoUs between anchors and ground truth,
        # used to match each anchor to its target)
        anchors_info = self._calculate_anchors_info(
            all_anchor_boxes_3d, empty_anchor_filter, filtered_gt_list)

        anchor_ious = anchors_info[:, self.mini_batch_utils.col_ious]

        valid_iou_indices = np.where(anchor_ious > 0.0)[0]

        print("{} / {}:"
              "{:>6} anchors, "
              "{:>6} iou > 0.0, "
              "for {:>3} {}(s) for sample {}".format(
                  sample_idx + 1, num_samples,
                  len(anchors_info),
                  len(valid_iou_indices),
                  len(filtered_gt_list), classes_name, sample_name
              ))

        # Save anchors info
        self._save_to_file(classes_name, anchor_strides,
                           sample_name, anchors_info)
其中3D Anchor生成的步驟:
-
確定Anchor生成範圍(area_extents)
-
根據stride生成anchor的center點分佈
-
生成size和rotation分佈->生成anchor matrix
def tile_anchors_3d(area_extents, anchor_3d_sizes, anchor_stride,
                    ground_plane):
    """Tile 3D anchors over the area extents.

    Every combination of an (x, z) grid centre, an anchor size and one of
    two yaw angles (0 and pi/2) yields one anchor. The y coordinate of
    each anchor is obtained by solving the ground plane equation
    a*x + b*y + c*z + d = 0 for y at the anchor's (x, z).

    Args:
        area_extents: [[min_x, max_x], [min_y, max_y], [min_z, max_z]]
        anchor_3d_sizes: list of 3d anchor sizes N x (l, w, h)
        anchor_stride: stride lengths (x_stride, z_stride)
        ground_plane: coefficients of the ground plane e.g. [0, -1, 0, 0]

    Returns:
        boxes: ndarray of 3D anchors in box_3d format
            N x [x, y, z, l, w, h, ry]
    """
    sizes_lwh = np.asarray(anchor_3d_sizes)
    yaw_choices = np.asarray([0, np.pi / 2.0])

    stride_x = anchor_stride[0]
    stride_z = anchor_stride[1]

    # x and z span the ground plane here (y is used as the height axis),
    # so centres are laid out along x (ascending) and z (descending),
    # offset by half a stride from the respective starting edge.
    x_centers = np.arange(area_extents[0][0] + stride_x / 2.0,
                          area_extents[0][1],
                          step=stride_x).astype(np.float32)
    z_centers = np.arange(area_extents[2][1] - stride_z / 2.0,
                          area_extents[2][0],
                          step=-stride_z).astype(np.float32)

    # Index ranges standing in for the sizes and rotations
    size_ids = np.arange(0, len(sizes_lwh))
    yaw_ids = np.arange(0, len(yaw_choices))

    # One row per anchor: [x, z, size_index, rotation_index]
    # e.g. for two sizes and two rotations
    # [[x0, z0, 0, 0], [x0, z0, 0, 1], [x0, z0, 1, 0], [x0, z0, 1, 1],
    #  [x1, z0, 0, 0], [x1, z0, 0, 1], [x1, z0, 1, 0], [x1, z0, 1, 1], ...]
    grids = np.meshgrid(x_centers, z_centers, size_ids, yaw_ids)
    combos = np.column_stack([grid.ravel() for grid in grids])

    xs = combos[:, 0]
    zs = combos[:, 1]

    # Place anchors on the ground plane
    a, b, c, d = ground_plane
    ys = -(a * xs + c * zs + d) / b

    boxes = np.zeros((len(combos), 7))
    boxes[:, 0] = xs
    boxes[:, 1] = ys
    boxes[:, 2] = zs
    boxes[:, 3:6] = sizes_lwh[combos[:, 2].astype(np.int32)]
    boxes[:, 6] = yaw_choices[combos[:, 3].astype(np.int32)]

    return boxes
模型訓練
avod模型的整體結構包括backbone+RPN+avod網絡三個部分,詳情參照avod_paperreading
backbone採用的是VGG+FPN的結構,但是添加了bev feature的設計(lidar三維數據轉化爲二維的bev特徵),後與image feature進行融合,RPN網絡用於生成region proposal,avod用於最後物體的分類和檢測框的迴歸
調用鏈
base_dir = avod/
主要的相關模塊調用:
config = avod/avod/configs/pyramid_cars_with_aug_example.config
scripts/run_training.py->avod/avod/core/trainer.py(這裏會完成model,input_data,loss,op等模塊的構建)->avod/avod/core/models/avod_model.py->avod/avod/core/models/rpn_model.py
核心部分
-
數據前處理
訓練的數據前處理與前文的預生成數據的區別是這裏是對輸入的原始數據進行處理,主要分爲以下幾個部分:
-
三維點雲數據的讀取和過濾:
三維點雲數據讀入後需要進行去除在image視角外的點雲數據包括兩個部分:ground_plane_filter+image_filter,前者主要用於生成bev圖特徵(對應不同高度生成不同體素空間,進行點的特徵編碼,參照bev的生成),後者主要是將對應cam view外的點進行過濾。
-
BEV圖的生成
BEV圖生成原理是在過濾後的點雲數據上,根據height_lo和height_hi的高度範圍(相對於ground_plane)生成num_slices個y軸維度的切片(slices)每個切片上按照voxel_size生成一系列單元(voxel),以其中點雲的最高點高度作爲feature,最終生成(bev_width/voxel_size)*(bev_height/voxel_size)*(num_slices+1)維特徵,+1爲記錄的density信息,代碼如下
# avod/avod/datasets/kitti/kitti_utils.py:109
# NOTE(review): this method calls self.kitti_utils, so it may actually live
# in a class other than KittiUtils -- confirm the file reference above.
# NOTE(review): the original indentation was lost in these notes. The layout
# below assumes the height-map fill belongs inside `if len(slice_points) > 1`;
# confirm against the repository.
def generate_bev(self,
                 source,
                 point_cloud,
                 ground_plane,
                 area_extents,
                 voxel_size):
    """Generates the BEV maps dictionary. One height map is created for
    each slice of the point cloud. One density map is created for
    the whole point cloud.

    Args:
        source: point cloud source
        point_cloud: point cloud (3, N)
        ground_plane: ground plane coefficients
        area_extents: 3D area extents
            [[min_x, max_x], [min_y, max_y], [min_z, max_z]]
        voxel_size: voxel size in m

    Returns:
        BEV maps dictionary
            height_maps: list of height maps
            density_map: density map
    """
    # Transpose the point cloud so the slice filters can index points
    all_points = np.transpose(point_cloud)

    height_maps = []

    for slice_idx in range(self.num_slices):

        # Height range covered by this slice
        height_lo = self.height_lo + slice_idx * self.height_per_division
        height_hi = height_lo + self.height_per_division

        # Filter the points of this slice by height; per the original
        # note the heights are relative to the ground plane
        slice_filter = self.kitti_utils.create_slice_filter(
            point_cloud,
            area_extents,
            ground_plane,
            height_lo,
            height_hi)

        # Apply slice filter
        slice_points = all_points[slice_filter]

        # With this layout, a slice with fewer than two points adds no
        # height map, so the output list can be shorter than num_slices.
        if len(slice_points) > 1:

            # Create Voxel Grid 2D
            voxel_grid_2d = VoxelGrid2D()
            voxel_grid_2d.voxelize_2d(
                slice_points, voxel_size,
                extents=area_extents,
                ground_plane=ground_plane,
                create_leaf_layout=False)

            # Remove y values (all 0)
            voxel_indices = voxel_grid_2d.voxel_indices[:, [0, 2]]

            # Create empty BEV images
            height_map = np.zeros((voxel_grid_2d.num_divisions[0],
                                   voxel_grid_2d.num_divisions[2]))

            # Only update pixels where voxels have max height values,
            # and normalize by height of slices
            # (this builds the height map holding the maximum heights)
            voxel_grid_2d.heights = voxel_grid_2d.heights - height_lo
            height_map[voxel_indices[:, 0], voxel_indices[:, 1]] = \
                np.asarray(voxel_grid_2d.heights) / self.height_per_division

            height_maps.append(height_map)

    # Rotate height maps 90 degrees
    # (transpose and flip) is faster than np.rot90
    # Presumably needed because the image and BEV coordinate conventions
    # differ -- TODO confirm.
    height_maps_out = [np.flip(height_maps[map_idx].transpose(), axis=0)
                       for map_idx in range(len(height_maps))]

    # The density filter spans the full height range
    # [self.height_lo, self.height_hi] rather than a single slice
    density_slice_filter = self.kitti_utils.create_slice_filter(
        point_cloud,
        area_extents,
        ground_plane,
        self.height_lo,
        self.height_hi)

    density_points = all_points[density_slice_filter]

    # Create Voxel Grid 2D
    density_voxel_grid_2d = VoxelGrid2D()
    density_voxel_grid_2d.voxelize_2d(
        density_points,
        voxel_size,
        extents=area_extents,
        ground_plane=ground_plane,
        create_leaf_layout=False)

    # Generate density map
    density_voxel_indices_2d = \
        density_voxel_grid_2d.voxel_indices[:, [0, 2]]

    density_map = self._create_density_map(
        num_divisions=density_voxel_grid_2d.num_divisions,
        voxel_indices_2d=density_voxel_indices_2d,
        num_pts_per_voxel=density_voxel_grid_2d.num_pts_in_voxel,
        norm_value=self.NORM_VALUES[source])

    bev_maps = dict()
    bev_maps['height_maps'] = height_maps_out
    bev_maps['density_map'] = density_map

    return bev_maps
-
數據增強(data augmentation)
這部分主要是在讀入數據的過程中會進行數據的增強操作,默認car的增強操作包括:flipping+pca_jitter。
-
-
Backbone
backbone(feature extractor)包括兩個部分:bev和image,整體結構類似,具體實現參考下文代碼,其結構可以概述爲conv1*2->pool1->conv2*2->pool2->conv3*2->pool3->conv4->(upconv3+concat3+fusion3)->(upconv2+concat2+fusion2)->(upconv1+concat1+fusion1)
#avod/core/feature_extractors/bev_vgg_pyramid.py:30
def build(self, inputs, input_pixel_size, is_training, scope='bev_vgg_pyr'):
    """Modified VGG for BEV feature extraction with pyramid features.

    The encoder runs three conv stages, each followed by 2x2 max pooling,
    and a fourth conv stage. The decoder then repeats, three times:
    stride-2 transposed conv, channel concat with the matching encoder
    output, and a 3x3 fusion conv.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        input_pixel_size: size of the input (H x W); not read by this
            method.
        is_training: True for training, False for validation/testing.
        scope: Optional scope for the variables.

    Returns:
        The fused full-resolution feature map (with the padded rows
        sliced off) and the end_points dict.
    """
    cfg = self.config

    def batch_norm_kwargs():
        # Fresh kwargs per layer, mirroring the per-call dict literals.
        return {'normalizer_fn': slim.batch_norm,
                'normalizer_params': {'is_training': is_training}}

    def conv_stage(net, stage_cfg, stage_scope):
        # stage_cfg is (number of repetitions, output depth)
        return slim.repeat(net,
                           stage_cfg[0],
                           slim.conv2d,
                           stage_cfg[1],
                           [3, 3],
                           scope=stage_scope,
                           **batch_norm_kwargs())

    def upsample_and_fuse(deep, skip, up_depth, fused_depth,
                          up_scope, concat_name, fusion_scope):
        # Upsample the deeper map, concat with the skip map, then fuse.
        upsampled = slim.conv2d_transpose(deep,
                                          up_depth,
                                          [3, 3],
                                          stride=2,
                                          scope=up_scope,
                                          **batch_norm_kwargs())
        merged = tf.concat((skip, upsampled), axis=3, name=concat_name)
        return slim.conv2d(merged,
                           fused_depth,
                           [3, 3],
                           scope=fusion_scope,
                           **batch_norm_kwargs())

    with slim.arg_scope(self.vgg_arg_scope(
            weight_decay=cfg.l2_weight_decay)), \
            tf.variable_scope(scope, 'bev_vgg_pyr', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'

        # Collect outputs for conv2d and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            outputs_collections=end_points_collection):

            # Pad 700 to 704 to allow even divisions for max pooling
            padded = tf.pad(inputs, [[0, 0], [4, 0], [0, 0], [0, 0]])

            # Encoder
            conv1 = conv_stage(padded, cfg.vgg_conv1, 'conv1')
            pool1 = slim.max_pool2d(conv1, [2, 2], scope='pool1')

            conv2 = conv_stage(pool1, cfg.vgg_conv2, 'conv2')
            pool2 = slim.max_pool2d(conv2, [2, 2], scope='pool2')

            conv3 = conv_stage(pool2, cfg.vgg_conv3, 'conv3')
            pool3 = slim.max_pool2d(conv3, [2, 2], scope='pool3')

            conv4 = conv_stage(pool3, cfg.vgg_conv4, 'conv4')

            # Decoder (upsample and fuse features)
            fused3 = upsample_and_fuse(
                conv4, conv3,
                cfg.vgg_conv3[1], cfg.vgg_conv2[1],
                'upconv3', 'concat3', 'pyramid_fusion3')
            fused2 = upsample_and_fuse(
                fused3, conv2,
                cfg.vgg_conv2[1], cfg.vgg_conv1[1],
                'upconv2', 'concat2', 'pyramid_fusion2')
            fused1 = upsample_and_fuse(
                fused2, conv1,
                cfg.vgg_conv1[1], cfg.vgg_conv1[1],
                'upconv1', 'concat1', 'pyramid_fusion1')

            # Slice off padded area
            feature_maps_out = fused1[:, 4:]

        # Convert end_points_collection into a end_point dict.
        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)

        return feature_maps_out, end_points
-
RPN Model
Backbone(feature extraction)出來的feature會分別經過一個1*1的卷積(bottle_neck)生成proposal網絡的input_feature。默認配置設置了path_drop:image和bev兩個path會有一定的機率沒有輸入,類似於drop_out(具體參考avod/avod/core/models/rpn_model.py:create_path_drop_masks)。之後會將得到的3d anchor映射到bev圖和image圖上,前者直接投影到ground_plane上,後者通過lidar座標和image座標的映射關係得到(取最大的2d框)。之後根據config中的roi_crop_size將得到的proposal feature進行crop_and_resize到相同尺寸。之後會做特徵的fusion(默認採用mean fusion),fused feature會通過兩個分支:3層卷積(論文中爲fc,實際代碼中爲conv2d)組成的objectness和offsets的預測,這樣就形成了first stage的proposal,之後proposal一方面會通過top-k的nms(注意這裏的nms是所有類共同做的nms結果)作爲second stage的輸入,另一方面通過gen_mini_batch生成mini-batch(默認爲512個samples,正負例各一半)計算objectness和regression loss(smooth l1),值得注意的是這裏生成mini-batch採用的是random shuffle的方式,即先shuffle一半的正例(256),如果不足的話用負例補充,沒有考慮類別不平衡的問題,所以會造成小樣本類別物體收斂慢甚至不收斂的問題。其build 網絡部分代碼如下:
#rpn_model.py:280, deleted some code for summary
# NOTE(review): indentation was lost in these notes; the scope nesting below
# is reconstructed and only affects op naming -- confirm against
# rpn_model.py.
def build(self):
    """Build the RPN graph and return the dictionary of prediction tensors.

    Sets up the input placeholders and feature extractors, optionally
    applies path drop to the BEV / image bottleneck features, crops and
    resizes both feature maps for every anchor, fuses the crops, predicts
    objectness (2 logits) and offsets (6 values) per anchor, decodes the
    regressed anchors and selects proposals with NMS, and masks the
    predictions and ground truth with a sampled mini batch.

    Returns:
        predictions: dict of tensors. When self._train_val_test is 'train'
            or 'val' it holds the anchors, the mini-batch mask, the masked
            predictions and ground truth, and the post-NMS proposals.
            Otherwise it holds only the top anchors and their objectness
            softmax scores.

    Raises:
        ValueError: if self._fusion_method is neither 'mean' nor 'concat'.
    """
    # Setup input placeholders
    self._set_up_input_pls()

    # Setup feature extractors
    self._set_up_feature_extractors()

    bev_proposal_input = self.bev_bottleneck
    img_proposal_input = self.img_bottleneck

    # Mean fusion divides the summed ROIs by the number of active paths
    fusion_mean_div_factor = 2.0

    # If both img and bev probabilities are set to 1.0, don't do
    # path drop.
    if not (self._path_drop_probabilities[0] ==
            self._path_drop_probabilities[1] == 1.0):
        with tf.variable_scope('rpn_path_drop'):

            random_values = tf.random_uniform(shape=[3],
                                              minval=0.0,
                                              maxval=1.0)

            img_mask, bev_mask = self.create_path_drop_masks(
                self._path_drop_probabilities[0],
                self._path_drop_probabilities[1],
                random_values)

            img_proposal_input = tf.multiply(img_proposal_input,
                                             img_mask)

            bev_proposal_input = tf.multiply(bev_proposal_input,
                                             bev_mask)

            # Stored on the instance for use outside this method
            self.img_path_drop_mask = img_mask
            self.bev_path_drop_mask = bev_mask

            # Overwrite the division factor
            fusion_mean_div_factor = img_mask + bev_mask

    with tf.variable_scope('proposal_roi_pooling'):

        with tf.variable_scope('box_indices'):
            def get_box_indices(boxes):
                # Builds one batch index per box by tiling
                # range(dim 0) across dim 1 and flattening.
                proposals_shape = boxes.get_shape().as_list()
                if any(dim is None for dim in proposals_shape):
                    # Fall back to the dynamic shape when the static
                    # shape is not fully defined
                    proposals_shape = tf.shape(boxes)
                ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
                multiplier = tf.expand_dims(
                    tf.range(start=0, limit=proposals_shape[0]), 1)
                return tf.reshape(ones_mat * multiplier, [-1])

            bev_boxes_norm_batches = tf.expand_dims(
                self._bev_anchors_norm_pl, axis=0)

            # These should be all 0's since there is only 1 image
            tf_box_indices = get_box_indices(bev_boxes_norm_batches)

        # Do ROI Pooling on BEV
        bev_proposal_rois = tf.image.crop_and_resize(
            bev_proposal_input,
            self._bev_anchors_norm_pl,
            tf_box_indices,
            self._proposal_roi_crop_size)
        # Do ROI Pooling on image
        img_proposal_rois = tf.image.crop_and_resize(
            img_proposal_input,
            self._img_anchors_norm_pl,
            tf_box_indices,
            self._proposal_roi_crop_size)

    with tf.variable_scope('proposal_roi_fusion'):
        rpn_fusion_out = None
        if self._fusion_method == 'mean':
            tf_features_sum = tf.add(bev_proposal_rois, img_proposal_rois)
            rpn_fusion_out = tf.divide(tf_features_sum,
                                       fusion_mean_div_factor)
        elif self._fusion_method == 'concat':
            rpn_fusion_out = tf.concat(
                [bev_proposal_rois, img_proposal_rois], axis=3)
        else:
            raise ValueError('Invalid fusion method', self._fusion_method)

    # TODO: move this section into an separate AnchorPredictor class
    with tf.variable_scope('anchor_predictor', 'ap', [rpn_fusion_out]):
        tensor_in = rpn_fusion_out

        # Parse rpn layers config
        layers_config = self._config.layers_config.rpn_config
        l2_weight_decay = layers_config.l2_weight_decay

        if l2_weight_decay > 0:
            weights_regularizer = slim.l2_regularizer(l2_weight_decay)
        else:
            weights_regularizer = None

        with slim.arg_scope([slim.conv2d],
                            weights_regularizer=weights_regularizer):
            # Use conv2d instead of fully_connected layers.
            # The first conv uses the ROI crop size as its kernel with
            # VALID padding; the [1, 2] squeeze below relies on the
            # spatial dims being 1 afterwards.
            cls_fc6 = slim.conv2d(tensor_in,
                                  layers_config.cls_fc6,
                                  self._proposal_roi_crop_size,
                                  padding='VALID',
                                  scope='cls_fc6')

            cls_fc6_drop = slim.dropout(cls_fc6,
                                        layers_config.keep_prob,
                                        is_training=self._is_training,
                                        scope='cls_fc6_drop')

            cls_fc7 = slim.conv2d(cls_fc6_drop,
                                  layers_config.cls_fc7,
                                  [1, 1],
                                  scope='cls_fc7')

            cls_fc7_drop = slim.dropout(cls_fc7,
                                        layers_config.keep_prob,
                                        is_training=self._is_training,
                                        scope='cls_fc7_drop')

            # 2 output channels: object / not object logits
            cls_fc8 = slim.conv2d(cls_fc7_drop,
                                  2,
                                  [1, 1],
                                  activation_fn=None,
                                  scope='cls_fc8')

            objectness = tf.squeeze(
                cls_fc8, [1, 2],
                name='cls_fc8/squeezed')

            # Use conv2d instead of fully_connected layers.
            reg_fc6 = slim.conv2d(tensor_in,
                                  layers_config.reg_fc6,
                                  self._proposal_roi_crop_size,
                                  padding='VALID',
                                  scope='reg_fc6')

            reg_fc6_drop = slim.dropout(reg_fc6,
                                        layers_config.keep_prob,
                                        is_training=self._is_training,
                                        scope='reg_fc6_drop')

            reg_fc7 = slim.conv2d(reg_fc6_drop,
                                  layers_config.reg_fc7,
                                  [1, 1],
                                  scope='reg_fc7')

            reg_fc7_drop = slim.dropout(reg_fc7,
                                        layers_config.keep_prob,
                                        is_training=self._is_training,
                                        scope='reg_fc7_drop')

            # 6 output channels: the anchor regression offsets
            reg_fc8 = slim.conv2d(reg_fc7_drop,
                                  6,
                                  [1, 1],
                                  activation_fn=None,
                                  scope='reg_fc8')

            offsets = tf.squeeze(
                reg_fc8, [1, 2],
                name='reg_fc8/squeezed')

    # Return the proposals
    with tf.variable_scope('proposals'):
        anchors = self.placeholders[self.PL_ANCHORS]

        # Decode anchor regression offsets
        with tf.variable_scope('decoding'):
            regressed_anchors = anchor_encoder.offset_to_anchor(
                anchors, offsets)

        with tf.variable_scope('bev_projection'):
            _, bev_proposal_boxes_norm = anchor_projector.project_to_bev(
                regressed_anchors, self._bev_extents)

        with tf.variable_scope('softmax'):
            objectness_softmax = tf.nn.softmax(objectness)

        with tf.variable_scope('nms'):
            # Column 1 of the softmax is used as the NMS score
            objectness_scores = objectness_softmax[:, 1]

            # Do NMS on regressed anchors
            top_indices = tf.image.non_max_suppression(
                bev_proposal_boxes_norm, objectness_scores,
                max_output_size=self._nms_size,
                iou_threshold=self._nms_iou_thresh)

            top_anchors = tf.gather(regressed_anchors, top_indices)
            top_objectness_softmax = tf.gather(objectness_scores,
                                               top_indices)
            # top_offsets = tf.gather(offsets, top_indices)
            # top_objectness = tf.gather(objectness, top_indices)

    # Get mini batch
    all_ious_gt = self.placeholders[self.PL_ANCHOR_IOUS]
    all_offsets_gt = self.placeholders[self.PL_ANCHOR_OFFSETS]
    # Not used again in this excerpt
    all_classes_gt = self.placeholders[self.PL_ANCHOR_CLASSES]

    with tf.variable_scope('mini_batch'):
        mini_batch_utils = self.dataset.kitti_utils.mini_batch_utils
        mini_batch_mask, _ = \
            mini_batch_utils.sample_rpn_mini_batch(all_ious_gt)

    # Ground Truth Tensors
    with tf.variable_scope('one_hot_classes'):

        # Anchor classification ground truth
        # Object / Not Object
        min_pos_iou = \
            self.dataset.kitti_utils.mini_batch_utils.rpn_pos_iou_range[0]

        # An anchor is labelled as an object when its ground truth IoU
        # reaches the lower bound of the positive IoU range
        objectness_classes_gt = tf.cast(
            tf.greater_equal(all_ious_gt, min_pos_iou),
            dtype=tf.int32)
        # One-hot labels with label smoothing applied
        objectness_gt = tf.one_hot(
            objectness_classes_gt, depth=2,
            on_value=1.0 - self._config.label_smoothing_epsilon,
            off_value=self._config.label_smoothing_epsilon)

    # Mask predictions for mini batch
    with tf.variable_scope('prediction_mini_batch'):
        objectness_masked = tf.boolean_mask(objectness, mini_batch_mask)
        offsets_masked = tf.boolean_mask(offsets, mini_batch_mask)

    with tf.variable_scope('ground_truth_mini_batch'):
        objectness_gt_masked = tf.boolean_mask(
            objectness_gt, mini_batch_mask)
        offsets_gt_masked = tf.boolean_mask(all_offsets_gt,
                                            mini_batch_mask)

    # Specify the tensors to evaluate
    predictions = dict()

    # Temporary predictions for debugging
    # predictions['anchor_ious'] = anchor_ious
    # predictions['anchor_offsets'] = all_offsets_gt

    if self._train_val_test in ['train', 'val']:
        # All anchors
        predictions[self.PRED_ANCHORS] = anchors

        # Mini-batch masks
        predictions[self.PRED_MB_MASK] = mini_batch_mask
        # Mini-batch predictions
        predictions[self.PRED_MB_OBJECTNESS] = objectness_masked
        predictions[self.PRED_MB_OFFSETS] = offsets_masked

        # Mini batch ground truth
        predictions[self.PRED_MB_OFFSETS_GT] = offsets_gt_masked
        predictions[self.PRED_MB_OBJECTNESS_GT] = objectness_gt_masked

        # Proposals after nms
        predictions[self.PRED_TOP_INDICES] = top_indices
        predictions[self.PRED_TOP_ANCHORS] = top_anchors
        predictions[
            self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax

    else:
        # self._train_val_test == 'test'
        predictions[self.PRED_TOP_ANCHORS] = top_anchors
        predictions[
            self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax

    return predictions
-
AVOD Model
AVOD網絡部分會接收first stage生成的top-k anchor proposals,得到對應bev和img的anchor projection,進行相同的crop_and_resize操作,之後再進行fusion+n*(fc+fc_drop)進行cls,offsets以及angle vector的預測(fusion默認採用early-fusion:即先進行fusion再進入之後網絡層)。生成prediction之後,會解碼gt投影到bev圖上,然後採用同樣的策略生成mini-batch和top-anchor(bev上進行的nms),並且生成對應的objectness,offset,angle的loss。mini-batch的loss用於train過程中的模型訓練,top-anchor則用於生成最終的預測,但是其loss好像並沒有使用。其中,offset的loss需要轉化到3d box上去計算(論文提出的box_4c計算方式)。相關代碼如下:
# avod_model.py:123 deleted code for summary
def build(self):
    """Build the second-stage (AVOD) graph on top of the RPN graph.

    The RPN model is built first and the dict it returns is reused as the
    prediction dict of this model. The steps performed here are:

    1. Project the RPN top anchors into BEV and image space (optionally
       expanding them along x and z first).
    2. Crop and resize ROIs from the BEV and image feature maps.
    3. Run the fully connected box predictor to get class logits,
       offsets and (optionally) angle vectors.
    4. Sample a mini-batch against the ground truth and encode the
       ground-truth offsets in the configured box representation.
    5. Apply the predicted offsets to the proposals and run NMS in BEV.

    Returns:
        The prediction dict returned by ``rpn_model.build()``, extended
        with the entries of this stage. Which entries are added depends
        on ``self._train_val_test`` and ``self._box_rep``.

    Raises:
        NotImplementedError: If ``self._box_rep`` is not one of the box
            representations handled by the branches below.
    """
    rpn_model = self._rpn_model

    # Share the same prediction dict as RPN
    prediction_dict = rpn_model.build()

    top_anchors = prediction_dict[RpnModel.PRED_TOP_ANCHORS]
    ground_plane = rpn_model.placeholders[RpnModel.PL_GROUND_PLANE]

    class_labels = rpn_model.placeholders[RpnModel.PL_LABEL_CLASSES]

    with tf.variable_scope('avod_projection'):
        if self._config.expand_proposals_xz > 0.0:
            expand_length = self._config.expand_proposals_xz

            # Expand anchors along x and z
            # (columns 3 and 5 are enlarged; the other columns are copied)
            with tf.variable_scope('expand_xz'):
                expanded_dim_x = top_anchors[:, 3] + expand_length
                expanded_dim_z = top_anchors[:, 5] + expand_length

                expanded_anchors = tf.stack([
                    top_anchors[:, 0],
                    top_anchors[:, 1],
                    top_anchors[:, 2],
                    expanded_dim_x,
                    top_anchors[:, 4],
                    expanded_dim_z
                ], axis=1)

            avod_projection_in = expanded_anchors
        else:
            avod_projection_in = top_anchors

        with tf.variable_scope('bev'):
            # Project top anchors into bev and image spaces
            bev_proposal_boxes, bev_proposal_boxes_norm = \
                anchor_projector.project_to_bev(
                    avod_projection_in,
                    self.dataset.kitti_utils.bev_extents)

            # Reorder projected boxes into [y1, x1, y2, x2]
            bev_proposal_boxes_tf_order = \
                anchor_projector.reorder_projected_boxes(
                    bev_proposal_boxes)
            bev_proposal_boxes_norm_tf_order = \
                anchor_projector.reorder_projected_boxes(
                    bev_proposal_boxes_norm)

        with tf.variable_scope('img'):
            image_shape = tf.cast(tf.shape(
                rpn_model.placeholders[RpnModel.PL_IMG_INPUT])[0:2],
                tf.float32)
            img_proposal_boxes, img_proposal_boxes_norm = \
                anchor_projector.tf_project_to_image_space(
                    avod_projection_in,
                    rpn_model.placeholders[RpnModel.PL_CALIB_P2],
                    image_shape)
            # Only reorder the normalized img
            img_proposal_boxes_norm_tf_order = \
                anchor_projector.reorder_projected_boxes(
                    img_proposal_boxes_norm)

    bev_feature_maps = rpn_model.bev_feature_maps
    img_feature_maps = rpn_model.img_feature_maps

    # The RPN path-drop masks are applied unless both path drop
    # probabilities equal 1.0; in that case constant masks of 1.0 are
    # used so the fc layers still receive input weights.
    if not (self._path_drop_probabilities[0] ==
            self._path_drop_probabilities[1] == 1.0):
        with tf.variable_scope('avod_path_drop'):
            img_mask = rpn_model.img_path_drop_mask
            bev_mask = rpn_model.bev_path_drop_mask

            img_feature_maps = tf.multiply(img_feature_maps,
                                           img_mask)
            bev_feature_maps = tf.multiply(bev_feature_maps,
                                           bev_mask)
    else:
        bev_mask = tf.constant(1.0)
        img_mask = tf.constant(1.0)

    # ROI Pooling
    with tf.variable_scope('avod_roi_pooling'):
        def get_box_indices(boxes):
            # Returns a flat int32 vector in which every box of batch
            # entry i is assigned the index i.
            proposals_shape = boxes.get_shape().as_list()
            # Fall back to the dynamic shape when the static one is
            # not fully known
            if any(dim is None for dim in proposals_shape):
                proposals_shape = tf.shape(boxes)
            ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
            multiplier = tf.expand_dims(
                tf.range(start=0, limit=proposals_shape[0]), 1)
            return tf.reshape(ones_mat * multiplier, [-1])

        bev_boxes_norm_batches = tf.expand_dims(
            bev_proposal_boxes_norm, axis=0)

        # These should be all 0's since there is only 1 image
        tf_box_indices = get_box_indices(bev_boxes_norm_batches)

        # Do ROI Pooling on BEV
        bev_rois = tf.image.crop_and_resize(
            bev_feature_maps,
            bev_proposal_boxes_norm_tf_order,
            tf_box_indices,
            self._proposal_roi_crop_size,
            name='bev_rois')
        # Do ROI Pooling on image
        img_rois = tf.image.crop_and_resize(
            img_feature_maps,
            img_proposal_boxes_norm_tf_order,
            tf_box_indices,
            self._proposal_roi_crop_size,
            name='img_rois')

    # Fully connected layers (Box Predictor)
    avod_layers_config = self.model_config.layers_config.avod_config

    fc_output_layers = \
        avod_fc_layers_builder.build(
            layers_config=avod_layers_config,
            input_rois=[bev_rois, img_rois],
            input_weights=[bev_mask, img_mask],
            num_final_classes=self._num_final_classes,
            box_rep=self._box_rep,
            top_anchors=top_anchors,
            ground_plane=ground_plane,
            is_training=self._is_training)

    all_cls_logits = \
        fc_output_layers[avod_fc_layers_builder.KEY_CLS_LOGITS]
    all_offsets = fc_output_layers[avod_fc_layers_builder.KEY_OFFSETS]

    # This may be None
    all_angle_vectors = \
        fc_output_layers.get(avod_fc_layers_builder.KEY_ANGLE_VECTORS)

    with tf.variable_scope('softmax'):
        all_cls_softmax = tf.nn.softmax(
            all_cls_logits)

    ######################################################
    # Subsample mini_batch for the loss function
    ######################################################
    # Get the ground truth tensors
    anchors_gt = rpn_model.placeholders[RpnModel.PL_LABEL_ANCHORS]
    if self._box_rep in ['box_3d', 'box_4ca']:
        boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D]
        # Column 6 of the ground truth boxes_3d is used as orientation
        orientations_gt = boxes_3d_gt[:, 6]
    elif self._box_rep in ['box_8c', 'box_8co', 'box_4c']:
        boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D]
    else:
        raise NotImplementedError('Ground truth tensors not implemented')

    # Project anchor_gts to 2D bev
    with tf.variable_scope('avod_gt_projection'):
        bev_anchor_boxes_gt, _ = anchor_projector.project_to_bev(
            anchors_gt, self.dataset.kitti_utils.bev_extents)

        bev_anchor_boxes_gt_tf_order = \
            anchor_projector.reorder_projected_boxes(bev_anchor_boxes_gt)

    with tf.variable_scope('avod_box_list'):
        # Convert to box_list format
        anchor_box_list_gt = box_list.BoxList(bev_anchor_boxes_gt_tf_order)
        anchor_box_list = box_list.BoxList(bev_proposal_boxes_tf_order)

    # Get the mini-batch mask, the class label indices and the indices
    # of the matched ground truths
    mb_mask, mb_class_label_indices, mb_gt_indices = \
        self.sample_mini_batch(
            anchor_box_list_gt=anchor_box_list_gt,
            anchor_box_list=anchor_box_list,
            class_labels=class_labels)

    # Create classification one_hot vector
    # NOTE(review): the smoothing off_value is divided by
    # self.dataset.num_classes while the depth is
    # self._num_final_classes - confirm this difference is intended.
    with tf.variable_scope('avod_one_hot_classes'):
        mb_classification_gt = tf.one_hot(
            mb_class_label_indices,
            depth=self._num_final_classes,
            on_value=1.0 - self._config.label_smoothing_epsilon,
            off_value=(self._config.label_smoothing_epsilon /
                       self.dataset.num_classes))

    # TODO: Don't create a mini batch in test mode
    # Mask predictions
    with tf.variable_scope('avod_apply_mb_mask'):
        # Classification
        mb_classifications_logits = tf.boolean_mask(
            all_cls_logits, mb_mask)
        mb_classifications_softmax = tf.boolean_mask(
            all_cls_softmax, mb_mask)

        # Offsets
        mb_offsets = tf.boolean_mask(all_offsets, mb_mask)

        # Angle Vectors
        if all_angle_vectors is not None:
            mb_angle_vectors = tf.boolean_mask(all_angle_vectors, mb_mask)
        else:
            mb_angle_vectors = None

    # Encode anchor offsets
    with tf.variable_scope('avod_encode_mb_anchors'):
        # Only used by the 'box_3d' branch below
        mb_anchors = tf.boolean_mask(top_anchors, mb_mask)

        if self._box_rep == 'box_3d':
            # Gather corresponding ground truth anchors for each mb sample
            mb_anchors_gt = tf.gather(anchors_gt, mb_gt_indices)
            mb_offsets_gt = anchor_encoder.tf_anchor_to_offset(
                mb_anchors, mb_anchors_gt)

            # Gather corresponding ground truth orientation for each
            # mb sample
            mb_orientations_gt = tf.gather(orientations_gt,
                                           mb_gt_indices)

        elif self._box_rep in ['box_8c', 'box_8co']:
            # Get boxes_3d ground truth mini-batch and convert to box_8c
            mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices)

            if self._box_rep == 'box_8c':
                mb_boxes_8c_gt = \
                    box_8c_encoder.tf_box_3d_to_box_8c(mb_boxes_3d_gt)
            elif self._box_rep == 'box_8co':
                mb_boxes_8c_gt = \
                    box_8c_encoder.tf_box_3d_to_box_8co(mb_boxes_3d_gt)

            # Convert proposals: anchors -> box_3d -> box8c
            # NOTE(review): the proposals are encoded with
            # tf_box_3d_to_box_8c for both 'box_8c' and 'box_8co',
            # unlike the ground truth above - confirm this is intended.
            proposal_boxes_3d = \
                box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True)
            proposal_boxes_8c = \
                box_8c_encoder.tf_box_3d_to_box_8c(proposal_boxes_3d)

            # Get mini batch offsets
            mb_boxes_8c = tf.boolean_mask(proposal_boxes_8c, mb_mask)
            mb_offsets_gt = box_8c_encoder.tf_box_8c_to_offsets(
                mb_boxes_8c, mb_boxes_8c_gt)

            # Flatten the offsets to a (N x 24) vector
            mb_offsets_gt = tf.reshape(mb_offsets_gt, [-1, 24])

        elif self._box_rep in ['box_4c', 'box_4ca']:
            # Get ground plane for box_4c conversion
            # (re-reads the same placeholder fetched at the top)
            ground_plane = self._rpn_model.placeholders[
                self._rpn_model.PL_GROUND_PLANE]

            # Convert gt boxes_3d -> box_4c
            mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices)
            mb_boxes_4c_gt = box_4c_encoder.tf_box_3d_to_box_4c(
                mb_boxes_3d_gt, ground_plane)

            # Convert proposals: anchors -> box_3d -> box_4c
            proposal_boxes_3d = \
                box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True)
            proposal_boxes_4c = \
                box_4c_encoder.tf_box_3d_to_box_4c(proposal_boxes_3d,
                                                   ground_plane)

            # Get mini batch
            mb_boxes_4c = tf.boolean_mask(proposal_boxes_4c, mb_mask)
            mb_offsets_gt = box_4c_encoder.tf_box_4c_to_offsets(
                mb_boxes_4c, mb_boxes_4c_gt)

            if self._box_rep == 'box_4ca':
                # Gather corresponding ground truth orientation for each
                # mb sample
                mb_orientations_gt = tf.gather(orientations_gt,
                                               mb_gt_indices)

        else:
            raise NotImplementedError(
                'Anchor encoding not implemented for', self._box_rep)

    ######################################################
    # Final Predictions
    ######################################################
    # Get orientations from angle vectors
    # NOTE(review): all_orientations is only defined when
    # all_angle_vectors is not None, but it is gathered below for
    # 'box_3d' and 'box_4ca' - presumably the fc layers always return
    # angle vectors for those representations; confirm.
    if all_angle_vectors is not None:
        with tf.variable_scope('avod_orientation'):
            all_orientations = \
                orientation_encoder.tf_angle_vector_to_orientation(
                    all_angle_vectors)

    # Apply offsets to regress proposals
    with tf.variable_scope('avod_regression'):
        if self._box_rep == 'box_3d':
            prediction_anchors = \
                anchor_encoder.offset_to_anchor(top_anchors,
                                                all_offsets)

        elif self._box_rep in ['box_8c', 'box_8co']:
            # Reshape the 24-dim regressed offsets to (N x 3 x 8)
            reshaped_offsets = tf.reshape(all_offsets,
                                          [-1, 3, 8])
            # Given the offsets, get the boxes_8c
            prediction_boxes_8c = \
                box_8c_encoder.tf_offsets_to_box_8c(proposal_boxes_8c,
                                                    reshaped_offsets)
            # Convert corners back to box3D
            prediction_boxes_3d = \
                box_8c_encoder.box_8c_to_box_3d(prediction_boxes_8c)

            # Convert the box_3d to anchor format for nms
            prediction_anchors = \
                box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d)

        elif self._box_rep in ['box_4c', 'box_4ca']:
            # Convert predictions box_4c -> box_3d
            prediction_boxes_4c = \
                box_4c_encoder.tf_offsets_to_box_4c(proposal_boxes_4c,
                                                    all_offsets)

            prediction_boxes_3d = \
                box_4c_encoder.tf_box_4c_to_box_3d(prediction_boxes_4c,
                                                   ground_plane)

            # Convert to anchor format for nms
            prediction_anchors = \
                box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d)

        else:
            raise NotImplementedError('Regression not implemented for',
                                      self._box_rep)

    # Apply Non-oriented NMS in BEV
    with tf.variable_scope('avod_nms'):
        bev_extents = self.dataset.kitti_utils.bev_extents

        with tf.variable_scope('bev_projection'):
            # Project predictions into BEV
            avod_bev_boxes, _ = anchor_projector.project_to_bev(
                prediction_anchors, bev_extents)
            avod_bev_boxes_tf_order = \
                anchor_projector.reorder_projected_boxes(
                    avod_bev_boxes)

        # Get top score from second column onward
        # (presumably column 0 is the background class - confirm)
        all_top_scores = tf.reduce_max(all_cls_logits[:, 1:], axis=1)

        # Apply NMS in BEV
        nms_indices = tf.image.non_max_suppression(
            avod_bev_boxes_tf_order,
            all_top_scores,
            max_output_size=self._nms_size,
            iou_threshold=self._nms_iou_threshold)

        # Gather predictions from NMS indices
        top_classification_logits = tf.gather(all_cls_logits,
                                              nms_indices)
        top_classification_softmax = tf.gather(all_cls_softmax,
                                               nms_indices)
        top_prediction_anchors = tf.gather(prediction_anchors,
                                           nms_indices)

        if self._box_rep == 'box_3d':
            top_orientations = tf.gather(
                all_orientations, nms_indices)

        elif self._box_rep in ['box_8c', 'box_8co']:
            top_prediction_boxes_3d = tf.gather(
                prediction_boxes_3d, nms_indices)
            top_prediction_boxes_8c = tf.gather(
                prediction_boxes_8c, nms_indices)

        elif self._box_rep == 'box_4c':
            top_prediction_boxes_3d = tf.gather(
                prediction_boxes_3d, nms_indices)
            top_prediction_boxes_4c = tf.gather(
                prediction_boxes_4c, nms_indices)

        elif self._box_rep == 'box_4ca':
            top_prediction_boxes_3d = tf.gather(
                prediction_boxes_3d, nms_indices)
            top_prediction_boxes_4c = tf.gather(
                prediction_boxes_4c, nms_indices)
            top_orientations = tf.gather(
                all_orientations, nms_indices)

        else:
            raise NotImplementedError('NMS gather not implemented for',
                                      self._box_rep)

    if self._train_val_test in ['train', 'val']:
        # Additional entries are added to the shared prediction_dict
        # Mini batch predictions
        prediction_dict[self.PRED_MB_CLASSIFICATION_LOGITS] = \
            mb_classifications_logits
        prediction_dict[self.PRED_MB_CLASSIFICATION_SOFTMAX] = \
            mb_classifications_softmax
        prediction_dict[self.PRED_MB_OFFSETS] = mb_offsets

        # Mini batch ground truth
        prediction_dict[self.PRED_MB_CLASSIFICATIONS_GT] = \
            mb_classification_gt
        prediction_dict[self.PRED_MB_OFFSETS_GT] = mb_offsets_gt

        # Top NMS predictions
        prediction_dict[self.PRED_TOP_CLASSIFICATION_LOGITS] = \
            top_classification_logits
        prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \
            top_classification_softmax

        prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \
            top_prediction_anchors

        # Mini batch predictions (for debugging)
        prediction_dict[self.PRED_MB_MASK] = mb_mask
        # prediction_dict[self.PRED_MB_POS_MASK] = mb_pos_mask
        prediction_dict[self.PRED_MB_CLASS_INDICES_GT] = \
            mb_class_label_indices

        # All predictions (for debugging)
        prediction_dict[self.PRED_ALL_CLASSIFICATIONS] = \
            all_cls_logits
        prediction_dict[self.PRED_ALL_OFFSETS] = all_offsets

        # Path drop masks (for debugging)
        prediction_dict['bev_mask'] = bev_mask
        prediction_dict['img_mask'] = img_mask

    else:
        # self._train_val_test == 'test'
        prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \
            top_classification_softmax
        prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \
            top_prediction_anchors

    # Entries specific to the box representation; these are added
    # regardless of the train / val / test mode unless guarded below
    if self._box_rep == 'box_3d':
        prediction_dict[self.PRED_MB_ANCHORS_GT] = mb_anchors_gt
        prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = mb_orientations_gt
        prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors

        prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations

        # For debugging
        prediction_dict[self.PRED_ALL_ANGLE_VECTORS] = all_angle_vectors

    elif self._box_rep in ['box_8c', 'box_8co']:
        prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
            top_prediction_boxes_3d

        # Store the corners before converting for visualization purposes
        prediction_dict[self.PRED_TOP_BOXES_8C] = top_prediction_boxes_8c

    elif self._box_rep == 'box_4c':
        prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
            top_prediction_boxes_3d
        prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c

    elif self._box_rep == 'box_4ca':
        if self._train_val_test in ['train', 'val']:
            prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = \
                mb_orientations_gt
            prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors

        prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
            top_prediction_boxes_3d
        prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c
        prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations

    else:
        raise NotImplementedError('Prediction dict not implemented for',
                                  self._box_rep)

    # prediction_dict[self.PRED_MAX_IOUS] = max_ious
    # prediction_dict[self.PRED_ALL_IOUS] = all_ious

    return prediction_dict