Yolov4 with Efficientnet b0-b7 Backbone

shihyung

13 min read · Nov 10, 2020

* 目的:

這邊嘗試著以 EfficientNet b0-b7 將 Yolov4 的 backbone 做替換來看看會有怎樣的訓練趨勢。

* Backbone 替換

* yaml 檔修改

原始的 Yolov4_L yaml 檔案的 backbone:

修改後的 Yolov4_L_Efficientnet_b0 yaml:

修改後的 Yolov4_L_Efficientnet_b1 yaml:

修改後的 Yolov4_L_Efficientnet_b2 yaml:

修改後的 Yolov4_L_Efficientnet_b3 yaml:

修改後的 Yolov4_L_Efficientnet_b4 yaml:

修改後的 Yolov4_L_Efficientnet_b5 yaml:

修改後的 Yolov4_L_Efficientnet_b6 yaml:

修改後的 Yolov4_L_Efficientnet_b7 yaml:

* 程式修改

yolo.py, parse_model() 增加:

# Excerpt of additions to parse_model(); the surrounding if/elif chain is not shown here.
# Keep a copy of the layer's `n` so it can be forwarded to effLayer below
# (effLayer takes it as its third constructor argument, the number of blocks).
eff_n=n
# CBS: resolve the input channel count from `ch` and prepend it to the yaml args.
elif m is CBS:
    c1=ch[f if f<0 else f+1]  # negative `f` indexes `ch` directly; otherwise index f+1 is used
    c2=args[0]  # first yaml argument is the output channel count
    args=[c1,c2,*args[1:]]
# effLayer: same channel handling as CBS, with the saved `eff_n` inserted after c2.
elif m is effLayer:
    c1=ch[f if f<0 else f+1]
    c2=args[0]
    args=[c1,c2,eff_n,*args[1:]]

common.py 增加:

class CBS(nn.Module):
    """Convolution -> BatchNorm -> Swish, using a "same"-padding convolution.

    Args:
        c1: number of input channels.
        c2: number of output channels.
        k: convolution kernel size.
        s: convolution stride.
        s_count: when greater than 1, the nominal image size is divided by
            ``2 ** s_count`` before being handed to the padding helper;
            for 0 or 1 the nominal size is used unchanged.
        bnum: model key (``'b0'`` .. ``'b7'``) selecting the nominal image
            size. The default ``'b'`` is not a valid key and raises KeyError.
    """

    def __init__(self, c1, c2, k, s, s_count=0, bnum='b'):
        super().__init__()
        image_sizes = {'b0': 224, 'b1': 240, 'b2': 260, 'b3': 300,
                       'b4': 380, 'b5': 456, 'b6': 528, 'b7': 600}
        nominal_size = image_sizes[bnum]
        # NOTE(review): s_count == 1 intentionally (?) keeps the full size — confirm.
        image_size = int(nominal_size / (2 ** s_count)) if s_count > 1 else nominal_size
        same_pad_conv = get_same_padding_conv2d(image_size=image_size)
        self.conv = same_pad_conv(c1, c2, kernel_size=k, stride=s, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.swish = MemoryEfficientSwish()

    def forward(self, x):
        """Apply conv, batch norm and swish in sequence."""
        out = self.conv(x)
        out = self.bn(out)
        return self.swish(out)

class effBlock(nn.Module):
    """Inverted-bottleneck convolution block.

    Pipeline: optional 1x1 expansion -> depthwise conv -> optional
    squeeze-and-excitation -> 1x1 projection, plus a residual skip when the
    stride is 1 and the input/output channel counts match.

    Args:
        c1: number of input channels.
        c2: number of output channels.
        k: depthwise kernel size.
        s: depthwise stride.
        exp_r: expansion ratio; the hidden width is ``c1 * exp_r`` and the
            expansion conv is skipped when the ratio is 1.
        se_r: squeeze-and-excitation ratio; the SE branch is only built when
            ``0 < se_r <= 1``.
        image_size: input spatial size passed to ``get_same_padding_conv2d``.
    """

    def __init__(self, c1, c2, k=1, s=1, exp_r=1, se_r=None, image_size=0):
        super(effBlock, self).__init__()
        self.expansion_ratio = exp_r
        self.has_se = (se_r is not None) and (0 < se_r <= 1)
        self.ch_in = c1
        self.ch_tmp = c1 * exp_r
        self.ch_out = c2
        self.stride = s

        # Expansion phase (inverted bottleneck); omitted when there is nothing to expand.
        if self.expansion_ratio != 1:
            Conv2d = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = Conv2d(in_channels=self.ch_in, out_channels=self.ch_tmp,
                                       kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=self.ch_tmp)

        # Depthwise convolution phase (groups == channels).
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = Conv2d(in_channels=self.ch_tmp, out_channels=self.ch_tmp,
                                      groups=self.ch_tmp, kernel_size=k, stride=s, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=self.ch_tmp)
        # Later convs see the feature map after the depthwise stride.
        image_size = calculate_output_image_size(image_size, s)

        # Squeeze-and-excitation layer, if requested; it operates on a 1x1 pooled map.
        if self.has_se:
            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
            # The squeezed width is derived from the block's input channels.
            num_squeezed_channels = max(1, int(self.ch_in * se_r))
            self._se_reduce = Conv2d(in_channels=self.ch_tmp, out_channels=num_squeezed_channels,
                                     kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=self.ch_tmp,
                                     kernel_size=1)

        # Pointwise projection phase.
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = Conv2d(in_channels=self.ch_tmp, out_channels=self.ch_out,
                                    kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=self.ch_out)
        self._swish = MemoryEfficientSwish()

    def forward(self, inputs, drop_connect_rate=None):
        """Run the block.

        Args:
            inputs: input feature map.
            drop_connect_rate: drop-connect probability for the residual
                branch; ignored when falsy or when the module is in eval mode.

        Returns:
            The block output, with ``inputs`` added when the skip connection
            applies (stride 1 and equal input/output channels).
        """
        x = inputs

        # Expansion
        if self.expansion_ratio != 1:
            x = self._expand_conv(inputs)
            x = self._bn0(x)
            x = self._swish(x)

        # Depthwise convolution
        x = self._depthwise_conv(x)
        x = self._bn1(x)
        x = self._swish(x)

        # Squeeze and excitation: per-channel gate computed from the pooled map.
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_reduce(x_squeezed)
            x_squeezed = self._swish(x_squeezed)
            x_squeezed = self._se_expand(x_squeezed)
            x = torch.sigmoid(x_squeezed) * x

        # Pointwise projection (no activation afterwards)
        x = self._project_conv(x)
        x = self._bn2(x)

        # Skip connection and drop connect
        if self.stride == 1 and self.ch_in == self.ch_out:
            # Skip + drop connect together give stochastic depth. It must only
            # be active while training; at inference the block is deterministic.
            if drop_connect_rate and self.training:
                x = drop_connect(x, p=drop_connect_rate)
            x = x + inputs  # skip connection
        return x

class effLayer(nn.Module):
    """A stage of ``n`` stacked effBlock modules.

    The first block maps ``c1 -> c2`` with stride ``s``; the remaining
    ``n - 1`` blocks map ``c2 -> c2`` with stride 1.

    Args:
        c1: number of input channels.
        c2: number of output channels.
        n: number of blocks in the stage.
        k: depthwise kernel size for every block.
        s: stride of the first block.
        s_count: exponent used to derive the stage's input image size from
            the model's nominal image size (see the branch on ``s`` below).
        exp_r: expansion ratio forwarded to each block.
        se_r: squeeze-and-excitation ratio forwarded to each block.
        bnum: offset added to the in-stage block index when scaling the
            drop-connect rate.
        md: model key (``'b0'`` .. ``'b7'``). The default ``'b'`` is not a
            valid key and raises KeyError.
    """

    def __init__(self, c1, c2, n=1, k=1, s=1, s_count=0, exp_r=1, se_r=1, bnum=0, md='b'):
        super(effLayer, self).__init__()
        # (width_coefficient, depth_coefficient, image_size, dropout_rate) per model key.
        params_dict = {'b0': (1.0, 1.0, 224, 0.2),
                       'b1': (1.0, 1.1, 240, 0.2),
                       'b2': (1.1, 1.2, 260, 0.3),
                       'b3': (1.2, 1.4, 300, 0.3),
                       'b4': (1.4, 1.8, 380, 0.4),
                       'b5': (1.6, 2.2, 456, 0.4),
                       'b6': (1.8, 2.6, 528, 0.5),
                       'b7': (2.0, 3.1, 600, 0.5)}
        width_coef, depth_coef, image_size, dropout_rate = params_dict[md]

        self.global_params = GlobalParams(
            width_coefficient=width_coef,
            depth_coefficient=depth_coef,
            image_size=image_size,
            dropout_rate=dropout_rate,
            num_classes=1000,
            batch_norm_momentum=0.99,
            batch_norm_epsilon=1e-3,
            drop_connect_rate=0.2,
            depth_divisor=8,
            min_depth=None,
            include_top=True,
        )

        self.block_number = bnum
        # When this stage itself strides by 2, its input size uses one fewer
        # halving than s_count; otherwise s_count halvings are applied.
        if s == 2:
            image_size = calculate_output_image_size(self.global_params.image_size, 2 ** (s_count - 1))
        else:
            image_size = calculate_output_image_size(self.global_params.image_size, 2 ** s_count)

        self.blocks = nn.ModuleList([effBlock(c1, c2, k, s, exp_r, se_r, image_size)])
        # Blocks after the first see the feature map after the first block's stride.
        image_size = calculate_output_image_size(image_size, s)
        for _ in range(n - 1):
            self.blocks.append(effBlock(c2, c2, k, 1, exp_r, se_r, image_size))

    def forward(self, x):
        """Run every block in order, scaling the drop-connect rate per block.

        Drop connect is only requested while the module is in training mode;
        in eval mode each block receives ``drop_connect_rate=None``.
        """
        base_rate = self.global_params.drop_connect_rate
        for idx, block in enumerate(self.blocks):
            drop_connect_rate = None
            if self.training and base_rate:
                # NOTE(review): 16 is a hard-coded total block count; confirm it
                # is still the right denominator for the b1-b7 configurations.
                drop_connect_rate = base_rate * (float(idx + self.block_number) / 16)
            x = block(x, drop_connect_rate=drop_connect_rate)
        return x