1. Weight Initialization

Q: Code demonstration of gradient explosion?

In [6]:
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight.data)    # normal: mean=0, std=1

# a 100-layer network
layer_nums = 100
# 256 neurons per layer
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)
layer:0, std:15.959932327270508
layer:1, std:256.6237487792969
layer:2, std:4107.24560546875
layer:3, std:65576.8125
layer:4, std:1045011.875
layer:5, std:17110408.0
layer:6, std:275461408.0
layer:7, std:4402537984.0
layer:8, std:71323615232.0
layer:9, std:1148104736768.0
layer:10, std:17911758454784.0
layer:11, std:283574846619648.0
layer:12, std:4480599809064960.0
layer:13, std:7.196814275405414e+16
layer:14, std:1.1507761512626258e+18
layer:15, std:1.853110740188555e+19
layer:16, std:2.9677725826641455e+20
layer:17, std:4.780376223769898e+21
layer:18, std:7.613223480799065e+22
layer:19, std:1.2092652108825478e+24
layer:20, std:1.923257075956356e+25
layer:21, std:3.134467063655912e+26
layer:22, std:5.014437766285408e+27
layer:23, std:8.066615144249704e+28
layer:24, std:1.2392661553516338e+30
layer:25, std:1.9455688099759845e+31
layer:26, std:3.0238180658999113e+32
layer:27, std:4.950357571077011e+33
layer:28, std:8.150925520353362e+34
layer:29, std:1.322983152787379e+36
layer:30, std:2.0786820453988485e+37
layer:31, std:nan
output is nan in 31 layers
tensor([[        inf, -2.6817e+38,         inf,  ...,         inf,
                 inf,         inf],
        [       -inf,        -inf,  1.4387e+38,  ..., -1.3409e+38,
         -1.9659e+38,        -inf],
        [-1.5873e+37,         inf,        -inf,  ...,         inf,
                -inf,  1.1484e+38],
        ...,
        [ 2.7754e+38, -1.6783e+38, -1.5531e+38,  ...,         inf,
         -9.9440e+37, -2.5132e+38],
        [-7.7184e+37,        -inf,         inf,  ..., -2.6505e+38,
                 inf,         inf],
        [        inf,         inf,        -inf,  ...,        -inf,
                 inf,  1.7432e+38]], grad_fn=<MmBackward>)

Q: How can changing the weight initialization fix the gradient explosion in this fully connected network (no activation)?
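A quick derivation of why this works: for a single linear layer without bias, $$H_{j}=\sum_{i=1}^{n} X_{i} W_{i j}$$, and with independent zero-mean inputs and weights $$D\left(H_{j}\right)=n \cdot D\left(X_{i}\right) \cdot D\left(W_{i j}\right)$$. With std(W)=1 each layer multiplies the variance by n=256, i.e. the std grows by a factor of 16 per layer, which matches the printout above (≈16, 256, 4107, ...). Choosing std(W)=sqrt(1/n) instead gives D(H)=D(X), so the std stays close to 1 layer after layer, as the cell below shows.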

In [8]:
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # note the std used here
                nn.init.normal_(m.weight.data, std=np.sqrt(1/self.neural_num))    # normal: mean=0, std=sqrt(1/n)

# a 100-layer network
layer_nums = 100
# 256 neurons per layer
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)
layer:0, std:0.9974957704544067
layer:1, std:1.0024365186691284
layer:2, std:1.002745509147644
layer:3, std:1.0006227493286133
layer:4, std:0.9966009855270386
layer:5, std:1.019859790802002
layer:6, std:1.026173710823059
layer:7, std:1.0250457525253296
layer:8, std:1.0378952026367188
layer:9, std:1.0441951751708984
layer:10, std:1.0181655883789062
layer:11, std:1.0074602365493774
layer:12, std:0.9948930144309998
layer:13, std:0.9987586140632629
layer:14, std:0.9981392025947571
layer:15, std:1.0045733451843262
layer:16, std:1.0055204629898071
layer:17, std:1.0122840404510498
layer:18, std:1.0076017379760742
layer:19, std:1.000280737876892
layer:20, std:0.9943006038665771
layer:21, std:1.012800931930542
layer:22, std:1.012657642364502
layer:23, std:1.018149971961975
layer:24, std:0.9776086211204529
layer:25, std:0.9592394828796387
layer:26, std:0.9317858815193176
layer:27, std:0.9534041881561279
layer:28, std:0.9811319708824158
layer:29, std:0.9953019022941589
layer:30, std:0.9773916006088257
layer:31, std:0.9655940532684326
layer:32, std:0.9270440936088562
layer:33, std:0.9329946637153625
layer:34, std:0.9311841726303101
layer:35, std:0.9354336261749268
layer:36, std:0.9492132067680359
layer:37, std:0.9679954648017883
layer:38, std:0.9849981665611267
layer:39, std:0.9982335567474365
layer:40, std:0.9616852402687073
layer:41, std:0.9439758658409119
layer:42, std:0.9631161093711853
layer:43, std:0.958673894405365
layer:44, std:0.9675614237785339
layer:45, std:0.9837557077407837
layer:46, std:0.9867278337478638
layer:47, std:0.9920817017555237
layer:48, std:0.9650403261184692
layer:49, std:0.9991624355316162
layer:50, std:0.9946174025535583
layer:51, std:0.9662044048309326
layer:52, std:0.9827387928962708
layer:53, std:0.9887880086898804
layer:54, std:0.9932605624198914
layer:55, std:1.0237400531768799
layer:56, std:0.9702046513557434
layer:57, std:1.0045380592346191
layer:58, std:0.9943899512290955
layer:59, std:0.9900636076927185
layer:60, std:0.99446702003479
layer:61, std:0.9768352508544922
layer:62, std:0.9797843098640442
layer:63, std:0.9951220750808716
layer:64, std:0.9980446696281433
layer:65, std:1.0086933374404907
layer:66, std:1.0276142358779907
layer:67, std:1.0429234504699707
layer:68, std:1.0197855234146118
layer:69, std:1.0319130420684814
layer:70, std:1.0540012121200562
layer:71, std:1.026781439781189
layer:72, std:1.0331352949142456
layer:73, std:1.0666675567626953
layer:74, std:1.0413838624954224
layer:75, std:1.0733673572540283
layer:76, std:1.0404183864593506
layer:77, std:1.0344083309173584
layer:78, std:1.0022705793380737
layer:79, std:0.99835205078125
layer:80, std:0.9732587337493896
layer:81, std:0.9777462482452393
layer:82, std:0.9753198623657227
layer:83, std:0.9938382506370544
layer:84, std:0.9472599029541016
layer:85, std:0.9511011242866516
layer:86, std:0.9737769961357117
layer:87, std:1.005651831626892
layer:88, std:1.0043526887893677
layer:89, std:0.9889539480209351
layer:90, std:1.0130352973937988
layer:91, std:1.0030947923660278
layer:92, std:0.9993206262588501
layer:93, std:1.0342745780944824
layer:94, std:1.031973123550415
layer:95, std:1.0413124561309814
layer:96, std:1.0817031860351562
layer:97, std:1.128799557685852
layer:98, std:1.1617802381515503
layer:99, std:1.2215303182601929
tensor([[-1.0696, -1.1373,  0.5047,  ..., -0.4766,  1.5904, -0.1076],
        [ 0.4572,  1.6211,  1.9659,  ..., -0.3558, -1.1235,  0.0979],
        [ 0.3908, -0.9998, -0.8680,  ..., -2.4161,  0.5035,  0.2814],
        ...,
        [ 0.1876,  0.7971, -0.5918,  ...,  0.5395, -0.8932,  0.1211],
        [-0.0102, -1.5027, -2.6860,  ...,  0.6954, -0.1858, -0.8027],
        [-0.5871, -1.3739, -2.9027,  ...,  1.6734,  0.5094, -0.9986]],
       grad_fn=<MmBackward>)

Q: Code example of gradient vanishing after adding a tanh activation to the fully connected network

In [9]:
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # tanh activation added after each linear layer
            x = torch.tanh(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # note the std used here
                nn.init.normal_(m.weight.data, std=np.sqrt(1/self.neural_num))    # normal: mean=0, std=sqrt(1/n)

                # a = np.sqrt(6 / (self.neural_num + self.neural_num))
                #
                # tanh_gain = nn.init.calculate_gain('tanh')
                # a *= tanh_gain
                #
                # nn.init.uniform_(m.weight.data, -a, a)

                # nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)

                # nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
#                 nn.init.kaiming_normal_(m.weight.data)

# a 100-layer network
layer_nums = 100
# 256 neurons per layer
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)
layer:0, std:0.6273701786994934
layer:1, std:0.48910173773765564
layer:2, std:0.4099564850330353
layer:3, std:0.35637012124061584
layer:4, std:0.32117360830307007
layer:5, std:0.2981105148792267
layer:6, std:0.27730831503868103
layer:7, std:0.2589356303215027
layer:8, std:0.2468511462211609
layer:9, std:0.23721906542778015
layer:10, std:0.22171513736248016
layer:11, std:0.21079954504966736
layer:12, std:0.19820132851600647
layer:13, std:0.19069305062294006
layer:14, std:0.18555502593517303
layer:15, std:0.17953835427761078
layer:16, std:0.17485806345939636
layer:17, std:0.1702701896429062
layer:18, std:0.16508983075618744
layer:19, std:0.1591130942106247
layer:20, std:0.15480300784111023
layer:21, std:0.15263864398002625
layer:22, std:0.148549422621727
layer:23, std:0.14617665112018585
layer:24, std:0.13876432180404663
layer:25, std:0.13316625356674194
layer:26, std:0.12660598754882812
layer:27, std:0.12537942826747894
layer:28, std:0.12535445392131805
layer:29, std:0.12589804828166962
layer:30, std:0.11994210630655289
layer:31, std:0.11700887233018875
layer:32, std:0.11137297749519348
layer:33, std:0.11154612898826599
layer:34, std:0.10991233587265015
layer:35, std:0.10996390879154205
layer:36, std:0.10969001054763794
layer:37, std:0.10975216329097748
layer:38, std:0.11063200235366821
layer:39, std:0.11021336913108826
layer:40, std:0.10465587675571442
layer:41, std:0.10141163319349289
layer:42, std:0.1026025265455246
layer:43, std:0.10079070925712585
layer:44, std:0.10096712410449982
layer:45, std:0.10117629915475845
layer:46, std:0.10145658999681473
layer:47, std:0.09987485408782959
layer:48, std:0.09677786380052567
layer:49, std:0.099615179002285
layer:50, std:0.09867013245820999
layer:51, std:0.09398546814918518
layer:52, std:0.09388342499732971
layer:53, std:0.09352942556142807
layer:54, std:0.09336657077074051
layer:55, std:0.0948176234960556
layer:56, std:0.08856320381164551
layer:57, std:0.09024856984615326
layer:58, std:0.088644839823246
layer:59, std:0.08766943216323853
layer:60, std:0.08726289123296738
layer:61, std:0.08623495697975159
layer:62, std:0.08549778908491135
layer:63, std:0.0855521708726883
layer:64, std:0.0853666365146637
layer:65, std:0.08462794870138168
layer:66, std:0.0852193832397461
layer:67, std:0.08562126755714417
layer:68, std:0.08368431031703949
layer:69, std:0.08476374298334122
layer:70, std:0.0853630006313324
layer:71, std:0.08237560093402863
layer:72, std:0.08133518695831299
layer:73, std:0.08416958898305893
layer:74, std:0.08226992189884186
layer:75, std:0.08379074186086655
layer:76, std:0.08003697544336319
layer:77, std:0.07888862490653992
layer:78, std:0.07618380337953568
layer:79, std:0.07458437979221344
layer:80, std:0.07207276672124863
layer:81, std:0.07079190015792847
layer:82, std:0.0712786465883255
layer:83, std:0.07165777683258057
layer:84, std:0.06893909722566605
layer:85, std:0.0690247192978859
layer:86, std:0.07030878216028214
layer:87, std:0.07283661514520645
layer:88, std:0.07280214875936508
layer:89, std:0.07130246609449387
layer:90, std:0.07225215435028076
layer:91, std:0.0712454542517662
layer:92, std:0.07088854163885117
layer:93, std:0.0730612576007843
layer:94, std:0.07276967912912369
layer:95, std:0.07259567081928253
layer:96, std:0.07586522400379181
layer:97, std:0.07769150286912918
layer:98, std:0.07842090725898743
layer:99, std:0.08206238597631454
tensor([[-0.1103, -0.0739,  0.1278,  ..., -0.0508,  0.1544, -0.0107],
        [ 0.0807,  0.1208,  0.0030,  ..., -0.0385, -0.1887, -0.0294],
        [ 0.0321, -0.0833, -0.1482,  ..., -0.1133,  0.0206,  0.0155],
        ...,
        [ 0.0108,  0.0560, -0.1099,  ...,  0.0459, -0.0961, -0.0124],
        [ 0.0398, -0.0874, -0.2312,  ...,  0.0294, -0.0562, -0.0556],
        [-0.0234, -0.0297, -0.1155,  ...,  0.1143,  0.0083, -0.0675]],
       grad_fn=<TanhBackward>)

Q: Code example of using Xavier initialization with a tanh fully connected network to fix the vanishing gradient problem?
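Why these numbers: Xavier initialization asks that the variance be preserved in both the forward and the backward pass, which gives $$D(W)=\frac{2}{n_{in}+n_{out}}$$. For a uniform distribution $$W \sim U(-a, a)$$ the variance is $$D(W)=\frac{a^{2}}{3}$$, so $$a=\sqrt{\frac{6}{n_{in}+n_{out}}}$$, and for tanh the bound is further scaled by the tanh gain. This is exactly what the hand-computed lines a = np.sqrt(6 / (...)) and a *= tanh_gain do in the cell below.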

In [10]:
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # tanh activation added after each linear layer
            x = torch.tanh(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Xavier bound computed by hand
                a = np.sqrt(6 / (self.neural_num + self.neural_num))
                tanh_gain = nn.init.calculate_gain('tanh')
                a *= tanh_gain
                nn.init.uniform_(m.weight.data, -a, a)
                # PyTorch's built-in Xavier initialization, equivalent to the above:
                # tanh_gain = nn.init.calculate_gain('tanh')
                # nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)

                # nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
#                 nn.init.kaiming_normal_(m.weight.data)

# a 100-layer network
layer_nums = 100
# 256 neurons per layer
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)
layer:0, std:0.7571136355400085
layer:1, std:0.6924336552619934
layer:2, std:0.6677976846694946
layer:3, std:0.6551960110664368
layer:4, std:0.655646800994873
layer:5, std:0.6536089777946472
layer:6, std:0.6500504612922668
layer:7, std:0.6465446949005127
layer:8, std:0.6456685662269592
layer:9, std:0.6414617896080017
layer:10, std:0.6423627734184265
layer:11, std:0.6509683728218079
layer:12, std:0.6584846377372742
layer:13, std:0.6530249118804932
layer:14, std:0.6528729796409607
layer:15, std:0.6523412466049194
layer:16, std:0.6534921526908875
layer:17, std:0.6540238261222839
layer:18, std:0.6477403044700623
layer:19, std:0.6469652652740479
layer:20, std:0.6441705822944641
layer:21, std:0.6484488248825073
layer:22, std:0.6512865424156189
layer:23, std:0.6525684595108032
layer:24, std:0.6531476378440857
layer:25, std:0.6488809585571289
layer:26, std:0.6533839702606201
layer:27, std:0.6482065320014954
layer:28, std:0.6471589803695679
layer:29, std:0.6553042531013489
layer:30, std:0.6560811400413513
layer:31, std:0.6522760987281799
layer:32, std:0.6499098539352417
layer:33, std:0.6568747758865356
layer:34, std:0.6544532179832458
layer:35, std:0.6535674929618835
layer:36, std:0.6508696675300598
layer:37, std:0.6428772807121277
layer:38, std:0.6495102643966675
layer:39, std:0.6479291319847107
layer:40, std:0.6470604538917542
layer:41, std:0.6513484716415405
layer:42, std:0.6503545045852661
layer:43, std:0.6458993554115295
layer:44, std:0.6517387628555298
layer:45, std:0.6520006060600281
layer:46, std:0.6539937257766724
layer:47, std:0.6537032723426819
layer:48, std:0.6516646146774292
layer:49, std:0.6535552740097046
layer:50, std:0.6464877724647522
layer:51, std:0.6491119265556335
layer:52, std:0.6455202102661133
layer:53, std:0.6520237326622009
layer:54, std:0.6531855463981628
layer:55, std:0.6627183556556702
layer:56, std:0.6544181108474731
layer:57, std:0.6501768827438354
layer:58, std:0.6510448455810547
layer:59, std:0.6549468040466309
layer:60, std:0.6529951691627502
layer:61, std:0.6515748500823975
layer:62, std:0.6453633904457092
layer:63, std:0.644793689250946
layer:64, std:0.6489539742469788
layer:65, std:0.6553947925567627
layer:66, std:0.6535270810127258
layer:67, std:0.6528791785240173
layer:68, std:0.6492816209793091
layer:69, std:0.6596571207046509
layer:70, std:0.6536712646484375
layer:71, std:0.6498764157295227
layer:72, std:0.6538681387901306
layer:73, std:0.64595627784729
layer:74, std:0.6543275117874146
layer:75, std:0.6525828838348389
layer:76, std:0.6462088227272034
layer:77, std:0.6534948945045471
layer:78, std:0.6461930871009827
layer:79, std:0.6457878947257996
layer:80, std:0.6481245160102844
layer:81, std:0.6496317386627197
layer:82, std:0.6516988277435303
layer:83, std:0.6485154032707214
layer:84, std:0.6395408511161804
layer:85, std:0.6498249173164368
layer:86, std:0.6510564088821411
layer:87, std:0.6505221724510193
layer:88, std:0.6573457717895508
layer:89, std:0.6529723405838013
layer:90, std:0.6536353230476379
layer:91, std:0.6497699022293091
layer:92, std:0.6459059715270996
layer:93, std:0.6459072232246399
layer:94, std:0.6530925631523132
layer:95, std:0.6515892148017883
layer:96, std:0.6434286832809448
layer:97, std:0.6425578594207764
layer:98, std:0.6407340168952942
layer:99, std:0.6442393660545349
tensor([[ 0.1133,  0.1239,  0.8211,  ...,  0.9411, -0.6334,  0.5155],
        [-0.9585, -0.2371,  0.8548,  ..., -0.2339,  0.9326,  0.0114],
        [ 0.9487, -0.2279,  0.8735,  ..., -0.9593,  0.7922,  0.6263],
        ...,
        [ 0.7257,  0.0800, -0.4440,  ..., -0.9589,  0.2604,  0.5402],
        [-0.9572,  0.5179, -0.8041,  ..., -0.4298, -0.6087,  0.9679],
        [ 0.6105,  0.3994,  0.1072,  ...,  0.3904, -0.5274,  0.0776]],
       grad_fn=<TanhBackward>)
Q: Code example of using Kaiming (He) initialization with a ReLU fully connected network to fix the vanishing gradient problem
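Why std = sqrt(2/n): ReLU zeroes out roughly half of the activations, so the variance after ReLU is about half of the variance before it. Kaiming (He) initialization compensates by doubling the weight variance, $$D(W)=\frac{2}{n_{in}}$$, i.e. std(W) = sqrt(2/n_in). That is the np.sqrt(2 / self.neural_num) used by hand below, and also what nn.init.kaiming_normal_ computes.
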
In [13]:
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # ReLU activation added after each linear layer
            x = torch.relu(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Kaiming initialization computed by hand
                nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
                # PyTorch's built-in Kaiming initialization, equivalent to the above:
                # nn.init.kaiming_normal_(m.weight.data)

# a 100-layer network
layer_nums = 100
# 256 neurons per layer
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)
layer:0, std:0.826629638671875
layer:1, std:0.8786815404891968
layer:2, std:0.9134422540664673
layer:3, std:0.8892471194267273
layer:4, std:0.834428071975708
layer:5, std:0.874537467956543
layer:6, std:0.7926971316337585
layer:7, std:0.7806458473205566
layer:8, std:0.8684563636779785
layer:9, std:0.9434137344360352
layer:10, std:0.964215874671936
layer:11, std:0.8896796107292175
layer:12, std:0.8287257552146912
layer:13, std:0.8519769906997681
layer:14, std:0.8354345560073853
layer:15, std:0.802306056022644
layer:16, std:0.8613607287406921
layer:17, std:0.7583686709403992
layer:18, std:0.8120225071907043
layer:19, std:0.791111171245575
layer:20, std:0.7164372801780701
layer:21, std:0.778393030166626
layer:22, std:0.8672043085098267
layer:23, std:0.874812662601471
layer:24, std:0.9020991325378418
layer:25, std:0.8585715889930725
layer:26, std:0.7824353575706482
layer:27, std:0.7968912720680237
layer:28, std:0.8984369039535522
layer:29, std:0.8704465627670288
layer:30, std:0.9860473275184631
layer:31, std:0.9080777168273926
layer:32, std:0.9140636920928955
layer:33, std:1.009956955909729
layer:34, std:0.9909380674362183
layer:35, std:1.0253208875656128
layer:36, std:0.849043607711792
layer:37, std:0.703953742980957
layer:38, std:0.7186155319213867
layer:39, std:0.7250635027885437
layer:40, std:0.7030817270278931
layer:41, std:0.6325559020042419
layer:42, std:0.6623690724372864
layer:43, std:0.6960875988006592
layer:44, std:0.7140733003616333
layer:45, std:0.632905125617981
layer:46, std:0.6458898186683655
layer:47, std:0.7354375720024109
layer:48, std:0.6710687279701233
layer:49, std:0.6939153671264648
layer:50, std:0.6889258027076721
layer:51, std:0.6331773996353149
layer:52, std:0.6029313206672668
layer:53, std:0.6145528554916382
layer:54, std:0.6636686325073242
layer:55, std:0.7440094947814941
layer:56, std:0.7972175478935242
layer:57, std:0.7606149911880493
layer:58, std:0.696868360042572
layer:59, std:0.7306802272796631
layer:60, std:0.6875627636909485
layer:61, std:0.7171440720558167
layer:62, std:0.7646605372428894
layer:63, std:0.7965086698532104
layer:64, std:0.8833740949630737
layer:65, std:0.8592952489852905
layer:66, std:0.8092936873435974
layer:67, std:0.806481122970581
layer:68, std:0.6792410612106323
layer:69, std:0.6583346128463745
layer:70, std:0.5702278017997742
layer:71, std:0.5084435939788818
layer:72, std:0.4869326055049896
layer:73, std:0.46350404620170593
layer:74, std:0.4796811640262604
layer:75, std:0.47372108697891235
layer:76, std:0.45414549112319946
layer:77, std:0.4971912205219269
layer:78, std:0.492794930934906
layer:79, std:0.4422350823879242
layer:80, std:0.4802998900413513
layer:81, std:0.5579248666763306
layer:82, std:0.5283755660057068
layer:83, std:0.5451980829238892
layer:84, std:0.6203726530075073
layer:85, std:0.6571893095970154
layer:86, std:0.703682005405426
layer:87, std:0.7321067452430725
layer:88, std:0.6924356818199158
layer:89, std:0.6652532815933228
layer:90, std:0.6728308796882629
layer:91, std:0.6606621742248535
layer:92, std:0.6094604730606079
layer:93, std:0.6019102334976196
layer:94, std:0.595421552658081
layer:95, std:0.6624555587768555
layer:96, std:0.6377885341644287
layer:97, std:0.6079285740852356
layer:98, std:0.6579315066337585
layer:99, std:0.6668476462364197
tensor([[0.0000, 1.3437, 0.0000,  ..., 0.0000, 0.6444, 1.1867],
        [0.0000, 0.9757, 0.0000,  ..., 0.0000, 0.4645, 0.8594],
        [0.0000, 1.0023, 0.0000,  ..., 0.0000, 0.5148, 0.9196],
        ...,
        [0.0000, 1.2873, 0.0000,  ..., 0.0000, 0.6454, 1.1411],
        [0.0000, 1.3589, 0.0000,  ..., 0.0000, 0.6749, 1.2438],
        [0.0000, 1.1807, 0.0000,  ..., 0.0000, 0.5668, 1.0600]],
       grad_fn=<ReluBackward0>)

Q: How to compute the gain (change in standard deviation) introduced by an activation function?

  • torch.nn.init.calculate_gain(nonlinearity, param=None)
  • nonlinearity: name of the activation function
  • param: parameter of the activation function, e.g. the negative_slope of Leaky ReLU

Q: calculate_gain code example?

In [1]:
import torch
import torch.nn as nn

# data
x = torch.randn(10000)
out = torch.tanh(x)

# empirical change in standard deviation, i.e. the scaling factor
gain = x.std() / out.std()
print('gain:{}'.format(gain))

# PyTorch's gain for tanh (the constant 5/3)
tanh_gain = nn.init.calculate_gain('tanh')
print('tanh_gain in PyTorch:', tanh_gain)
gain:1.5827221870422363
tanh_gain in PyTorch: 1.6666666666666667

2. Loss Functions (I)

Q: How to compute the cross-entropy loss?

  • torch.nn.CrossEntropyLoss(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean')
  • Combines nn.LogSoftmax() with nn.NLLLoss() (see the sketch after this list)
  • weight: per-class weight applied to the loss
  • ignore_index: class index to ignore
  • reduction: reduction mode, one of none/sum/mean
    • none: per-element loss
    • sum: sum over all elements, returns a scalar
    • mean: weighted mean, returns a scalar
  • size_average and reduce are deprecated and will be removed; ignore them
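
A minimal sketch (my addition, reusing the same fake data as the cells below) showing that nn.CrossEntropyLoss is just nn.LogSoftmax followed by nn.NLLLoss:

In [ ]:
import torch
import torch.nn as nn

logits = torch.tensor([[1., 2.], [1., 3.], [1., 3.]])
target = torch.tensor([0, 1, 1])

# one-step cross-entropy
ce = nn.CrossEntropyLoss(reduction='none')(logits, target)

# the same thing in two explicit steps: log-softmax, then pick -log p[target]
log_prob = nn.LogSoftmax(dim=1)(logits)
nll = nn.NLLLoss(reduction='none')(log_prob, target)

print(ce)   # tensor([1.3133, 0.1269, 0.1269])
print(nll)  # identical values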

Q: RMB classification model training demo

In [15]:
"""
# @file name  : 1_split_dataset.py
# @author     : TingsongYu https://github.com/TingsongYu
# @date       : 2020-07-24 10:08:00
# @brief      : split the dataset into train / valid / test sets
"""

import os
import random
import shutil


def makedir(new_dir):
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)


if __name__ == '__main__':

    dataset_dir = os.path.abspath(os.path.join("data", "RMB_data"))
    split_dir = os.path.abspath(os.path.join("data", "rmb_split"))
    train_dir = os.path.join(split_dir, "train")
    valid_dir = os.path.join(split_dir, "valid")
    test_dir = os.path.join(split_dir, "test")

    if not os.path.exists(dataset_dir):
        raise Exception("\n{} 不存在,请下载 02-01-数据-RMB_data.rar 放到\n{} 下,并解压即可".format(
            dataset_dir, os.path.dirname(dataset_dir)))

    train_pct = 0.8
    valid_pct = 0.1
    test_pct = 0.1

    for root, dirs, files in os.walk(dataset_dir):
        for sub_dir in dirs:

            imgs = os.listdir(os.path.join(root, sub_dir))
            imgs = list(filter(lambda x: x.endswith('.jpg'), imgs))
            random.shuffle(imgs)
            img_count = len(imgs)

            train_point = int(img_count * train_pct)
            valid_point = int(img_count * (train_pct + valid_pct))

            for i in range(img_count):
                if i < train_point:
                    out_dir = os.path.join(train_dir, sub_dir)
                elif i < valid_point:
                    out_dir = os.path.join(valid_dir, sub_dir)
                else:
                    out_dir = os.path.join(test_dir, sub_dir)

                makedir(out_dir)

                target_path = os.path.join(out_dir, imgs[i])
                src_path = os.path.join(dataset_dir, sub_dir, imgs[i])

                shutil.copy(src_path, target_path)

            print('Class:{}, train:{}, valid:{}, test:{}'.format(sub_dir, train_point, valid_point-train_point,
                                                                 img_count-valid_point))
            print("已在 {} 创建划分好的数据\n".format(out_dir))
Class:1, train:80, valid:10, test:10
已在 /Volumes/code/GitHub/Learn-AI/pytorch_deepshare/data/rmb_split/test/1 创建划分好的数据

Class:100, train:80, valid:10, test:10
已在 /Volumes/code/GitHub/Learn-AI/pytorch_deepshare/data/rmb_split/test/100 创建划分好的数据

In [16]:
"""
# @file name  : ce_loss.py
# @author     : TingsongYu https://github.com/TingsongYu
# @date       : 2019-10-07 10:08:00
# @brief      : RMB classification model training
"""
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
from PIL import Image
from matplotlib import pyplot as plt

from utils.lenet import LeNet
from utils.my_dataset import RMBDataset
from utils.common_tools import transform_invert, set_seed


set_seed(1)  # 设置随机种子
rmb_label = {"1": 0, "100": 1}

# hyper-parameters
MAX_EPOCH = 10
BATCH_SIZE = 16
LR = 0.01
log_interval = 10
val_interval = 1

# ============================ step 1/5 Data ============================

split_dir = os.path.abspath(os.path.join("data", "rmb_split"))
if not os.path.exists(split_dir):
    raise Exception(r"数据 {} 不存在, 回到lesson-06\1_split_dataset.py生成数据".format(split_dir))
train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "valid")

norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomGrayscale(p=0.8),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# build the Dataset instances
train_data = RMBDataset(data_dir=train_dir, transform=train_transform)
valid_data = RMBDataset(data_dir=valid_dir, transform=valid_transform)

# build the DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

# ============================ step 2/5 Model ============================

net = LeNet(classes=2)
net.initialize_weights()

# ============================ step 3/5 Loss function ============================
loss_function = nn.CrossEntropyLoss()                                                   # choose the loss function

# ============================ step 4/5 Optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)                        # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)     # learning-rate decay schedule

# ============================ step 5/5 Training ============================
train_curve = list()
valid_curve = list()

for epoch in range(MAX_EPOCH):

    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):

        # forward
        inputs, labels = data
        outputs = net(inputs)

        # backward
        optimizer.zero_grad()
        loss = loss_function(outputs, labels)
        loss.backward()

        # update weights
        optimizer.step()

        # classification statistics
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).squeeze().sum().numpy()

        # print training info
        loss_mean += loss.item()
        train_curve.append(loss.item())
        if (i+1) % log_interval == 0:
            loss_mean = loss_mean / log_interval
            print("Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, i+1, len(train_loader), loss_mean, correct / total))
            loss_mean = 0.

    scheduler.step()  # update the learning rate

    # validate the model
    if (epoch+1) % val_interval == 0:

        correct_val = 0.
        total_val = 0.
        loss_val = 0.
        net.eval()
        with torch.no_grad():
            for j, data in enumerate(valid_loader):
                inputs, labels = data
                outputs = net(inputs)
                loss = loss_function(outputs, labels)

                _, predicted = torch.max(outputs.data, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).squeeze().sum().numpy()

                loss_val += loss.item()

            valid_curve.append(loss_val)
            print("Valid:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, j+1, len(valid_loader), loss_val, correct / total))


train_x = range(len(train_curve))
train_y = train_curve

train_iters = len(train_loader)
valid_x = np.arange(1, len(valid_curve)+1) * train_iters*val_interval  # valid_curve records one loss per epoch, so convert epoch indices to iteration counts
valid_y = valid_curve

plt.plot(train_x, train_y, label='Train')
plt.plot(valid_x, valid_y, label='Valid')

plt.legend(loc='upper right')
plt.ylabel('loss value')
plt.xlabel('Iteration')
plt.show()

# ============================ inference ============================

test_dir = "test_data"

test_data = RMBDataset(data_dir=test_dir, transform=valid_transform)
valid_loader = DataLoader(dataset=test_data, batch_size=1)

for i, data in enumerate(valid_loader):
    # forward
    inputs, labels = data
    outputs = net(inputs)
    _, predicted = torch.max(outputs.data, 1)

    rmb = 1 if predicted.numpy()[0] == 0 else 100

    img_tensor = inputs[0, ...]  # C H W
    img = transform_invert(img_tensor, train_transform)
    plt.imshow(img)
    plt.title("LeNet got {} Yuan".format(rmb))
    plt.show()
    plt.pause(0.5)
    plt.close()
Training:Epoch[000/010] Iteration[010/010] Loss: 0.6654 Acc:58.75%
Valid:	 Epoch[000/010] Iteration[002/002] Loss: 0.8840 Acc:58.75%
Training:Epoch[001/010] Iteration[010/010] Loss: 0.4679 Acc:84.38%
Valid:	 Epoch[001/010] Iteration[002/002] Loss: 0.2392 Acc:84.38%
Training:Epoch[002/010] Iteration[010/010] Loss: 0.3967 Acc:80.62%
Valid:	 Epoch[002/010] Iteration[002/002] Loss: 0.1648 Acc:80.62%
Training:Epoch[003/010] Iteration[010/010] Loss: 0.1178 Acc:96.88%
Valid:	 Epoch[003/010] Iteration[002/002] Loss: 0.0284 Acc:96.88%
Training:Epoch[004/010] Iteration[010/010] Loss: 0.0138 Acc:100.00%
Valid:	 Epoch[004/010] Iteration[002/002] Loss: 0.1566 Acc:100.00%
Training:Epoch[005/010] Iteration[010/010] Loss: 0.0511 Acc:98.75%
Valid:	 Epoch[005/010] Iteration[002/002] Loss: 0.0001 Acc:98.75%
Training:Epoch[006/010] Iteration[010/010] Loss: 0.0033 Acc:100.00%
Valid:	 Epoch[006/010] Iteration[002/002] Loss: 0.0002 Acc:100.00%
Training:Epoch[007/010] Iteration[010/010] Loss: 0.0440 Acc:98.12%
Valid:	 Epoch[007/010] Iteration[002/002] Loss: 0.0002 Acc:98.12%
Training:Epoch[008/010] Iteration[010/010] Loss: 0.0173 Acc:99.38%
Valid:	 Epoch[008/010] Iteration[002/002] Loss: 0.0004 Acc:99.38%
Training:Epoch[009/010] Iteration[010/010] Loss: 0.0228 Acc:99.38%
Valid:	 Epoch[009/010] Iteration[002/002] Loss: 0.0006 Acc:99.38%

Q: Cross-entropy code demonstration?

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

# ----------------------------------- CrossEntropy loss: reduction -----------------------------------

# def loss function
loss_f_none = nn.CrossEntropyLoss(weight=None, reduction='none')
loss_f_sum = nn.CrossEntropyLoss(weight=None, reduction='sum')
loss_f_mean = nn.CrossEntropyLoss(weight=None, reduction='mean')

# forward
loss_none = loss_f_none(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)

# view
print("Cross Entropy Loss:\n ", loss_none, loss_sum, loss_mean)
Cross Entropy Loss:
  tensor([1.3133, 0.1269, 0.1269]) tensor(1.5671) tensor(0.5224)

Q: Hand calculation to verify PyTorch's cross-entropy result?

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

# --------------------------------- compute by hand

idx = 0

input_1 = inputs.detach().numpy()[idx]      # [1, 2]
target_1 = target.numpy()[idx]              # [0]

# first term: the logit of the target class
x_class = input_1[target_1]

# second term: log of the sum of exponentials over all classes
sigma_exp_x = np.sum(list(map(np.exp, input_1)))
log_sigma_exp_x = np.log(sigma_exp_x)

# loss of the first sample
loss_1 = -x_class + log_sigma_exp_x

print("loss of the first sample: ", loss_1)
loss of the first sample:  1.3132617

Q: Code example showing the effect of the weight argument in cross-entropy?

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

# ----------------------------------- weight -----------------------------------
# def loss function
weights = torch.tensor([1, 2], dtype=torch.float)
# weights = torch.tensor([0.7, 0.3], dtype=torch.float)

loss_f_none_w = nn.CrossEntropyLoss(weight=weights, reduction='none')
loss_f_sum = nn.CrossEntropyLoss(weight=weights, reduction='sum')
loss_f_mean = nn.CrossEntropyLoss(weight=weights, reduction='mean')

# forward
loss_none_w = loss_f_none_w(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)

# view
print("weights: ", weights)
print(loss_none_w, loss_sum, loss_mean)
weights:  tensor([1., 2.])
tensor([1.3133, 0.2539, 0.2539]) tensor(1.8210) tensor(0.3642)

Q: Hand calculation to verify the weight argument of cross-entropy?

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

weights = torch.tensor([1, 2], dtype=torch.float)
weights_all = np.sum(list(map(lambda x: weights.numpy()[x], target.numpy())))  # target [0, 1, 1] -> weights [1, 2, 2], so weights_all = 5

# per-sample unweighted cross-entropy (the loss_none tensor from the reduction example above)
loss_none = nn.CrossEntropyLoss(weight=None, reduction='none')(inputs, target)

mean = 0
loss_sep = loss_none.detach().numpy()
for i in range(target.shape[0]):

    x_class = target.numpy()[i]
    tmp = loss_sep[i] * (weights.numpy()[x_class] / weights_all)
    mean += tmp

print(mean)
0.3641947731375694

Q: How to implement just the negative-sign step of the negative log-likelihood (NLLLoss)?

  • $$\ell(x, y)=L=\left\{l_{1}, \ldots, l_{N}\right\}^{\prime}, \quad l_{n}=-w_{y_{n}} x_{n, y_{n}}$$
  • torch.nn.NLLLoss(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean')
  • weight: per-class weight applied to the loss
  • ignore_index: class index to ignore
  • reduction: reduction mode, one of none/sum/mean
    • none: per-element loss
    • sum: sum over all elements, returns a scalar
    • mean: weighted mean, returns a scalar

Q: NLLLoss code example?

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

# ----------------------------------- 2 NLLLoss -----------------------------------
weights = torch.tensor([1, 1], dtype=torch.float)

loss_f_none_w = nn.NLLLoss(weight=weights, reduction='none')
loss_f_sum = nn.NLLLoss(weight=weights, reduction='sum')
loss_f_mean = nn.NLLLoss(weight=weights, reduction='mean')

# forward
loss_none_w = loss_f_none_w(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)

# view
print("weights: ", weights)
print("NLL Loss", loss_none_w, loss_sum, loss_mean)
weights:  tensor([1., 1.])
NLL Loss tensor([-1., -3., -3.]) tensor(-7.) tensor(-2.3333)
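
A quick hand check (my addition): with reduction='none', NLLLoss simply returns -w_{y_n} * x_{n, y_n} for each sample, so with these raw (non-log-probability) inputs the values are just the negated target-class entries:

In [ ]:
import torch

inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
weights = torch.tensor([1, 1], dtype=torch.float)

# sample 0 (class 0): -1 * 1 = -1; samples 1 and 2 (class 1): -1 * 3 = -3
for i in range(target.shape[0]):
    print(-weights[target[i]] * inputs[i, target[i]])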

Q: What does the binary cross-entropy loss look like?

  • $$l_{n}=-w_{n}\left[y_{n} \cdot \log x_{n}+\left(1-y_{n}\right) \cdot \log \left(1-x_{n}\right)\right]$$
  • torch.nn.BCELoss(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean')
  • weight: rescaling weight applied element-wise to the loss
  • reduction: reduction mode, one of none/sum/mean
    • none: per-element loss
    • sum: sum over all elements, returns a scalar
    • mean: weighted mean, returns a scalar
  • Note: the input values must lie in [0, 1] (apply a sigmoid first)

Q: BCELoss code example?

In [27]:
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target

# BCELoss expects probabilities, so squash the logits with sigmoid first
inputs = torch.sigmoid(inputs)

weights = torch.tensor([1, 1], dtype=torch.float)

loss_f_none_w = nn.BCELoss(weight=weights, reduction='none')
loss_f_sum = nn.BCELoss(weight=weights, reduction='sum')
loss_f_mean = nn.BCELoss(weight=weights, reduction='mean')

# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)

# view
print("weights: ", weights)
print("BCE Loss", loss_none_w, loss_sum, loss_mean)
weights:  tensor([1., 1.])
BCE Loss tensor([[0.3133, 2.1269],
        [0.1269, 2.1269],
        [3.0486, 0.0181],
        [4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)

Q: BCELoss hand-check code example?

In [31]:
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

inputs = torch.sigmoid(inputs)

idx = 0

x_i = inputs.detach().numpy()[idx, idx]
y_i = target.numpy()[idx, idx]              #

# loss
# l_i = -[ y_i * np.log(x_i) + (1-y_i) * np.log(1-y_i) ]      # np.log(0) = nan
l_i = -y_i * np.log(x_i) if y_i else -(1-y_i) * np.log(1-x_i)

# print the loss
print("BCE inputs: ", inputs)
print("loss of the first element: ", l_i)
BCE inputs:  tensor([[0.7311, 0.8808],
        [0.8808, 0.8808],
        [0.9526, 0.9820],
        [0.9820, 0.9933]])
loss of the first element:  0.31326166

Q: Combining sigmoid with binary cross-entropy (BCEWithLogitsLoss)

  • $$l_{n}=-w_{n}\left[y_{n} \cdot \log \sigma\left(x_{n}\right)+\left(1-y_{n}\right) \cdot \log \left(1-\sigma\left(x_{n}\right)\right)\right]$$
  • torch.nn.BCEWithLogitsLoss(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', pos_weight: Optional[torch.Tensor] = None)
  • pos_weight: weight applied to the positive samples, used to balance positives and negatives
  • weight: rescaling weight applied element-wise to the loss
  • reduction: reduction mode, one of none/sum/mean
    • none: per-element loss
    • sum: sum over all elements, returns a scalar
    • mean: weighted mean, returns a scalar
  • Note: do not add a sigmoid at the end of the network; the loss applies it internally

Q: BCEWithLogitsLoss code example?

In [33]:
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target

# no sigmoid needed; BCEWithLogitsLoss applies it internally
# inputs = torch.sigmoid(inputs)

weights = torch.tensor([1, 1], dtype=torch.float)

loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none')
loss_f_sum = nn.BCEWithLogitsLoss(weight=weights, reduction='sum')
loss_f_mean = nn.BCEWithLogitsLoss(weight=weights, reduction='mean')

# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)

# view
print("weights: ", weights)
print(loss_none_w, loss_sum, loss_mean)
weights:  tensor([1., 1.])
tensor([[0.3133, 2.1269],
        [0.1269, 2.1269],
        [3.0486, 0.0181],
        [4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)

Q: Code example for the pos_weight argument of BCEWithLogitsLoss?

In [36]:
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target

# no sigmoid needed; BCEWithLogitsLoss applies it internally
# inputs = torch.sigmoid(inputs)

weights = torch.tensor([1], dtype=torch.float)
pos_w = torch.tensor([3], dtype=torch.float)        # 3

loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none', pos_weight=pos_w)
loss_f_sum = nn.BCEWithLogitsLoss(weight=weights, reduction='sum', pos_weight=pos_w)
loss_f_mean = nn.BCEWithLogitsLoss(weight=weights, reduction='mean', pos_weight=pos_w)

# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)

# view
print("pos_weights: ", pos_w)
print(loss_none_w, loss_sum, loss_mean)
pos_weights:  tensor([3.])
tensor([[0.9398, 2.1269],
        [0.3808, 2.1269],
        [3.0486, 0.0544],
        [4.0181, 0.0201]]) tensor(12.7158) tensor(1.5895)
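
A quick hand check of pos_weight (my addition): wherever the target is 1, the element-wise loss is multiplied by pos_weight, and elements with target 0 are unchanged, e.g. 0.3133 * 3 = 0.9398 for the first element:

In [ ]:
import torch
import torch.nn as nn

inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

# per-element loss without pos_weight, then scale the positive positions by 3
plain = nn.BCEWithLogitsLoss(reduction='none')(inputs, target)
scaled = torch.where(target == 1, plain * 3, plain)
print(scaled)   # should match the pos_weight=3 output above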

3. Loss Functions (II)

Q: How to compute the absolute difference between inputs and target?

  • $$l_{n}=\left|x_{n}-y_{n}\right|$$
  • torch.nn.L1Loss(size_average=None, reduce=None, reduction: str = 'mean')

Q: How to compute the squared difference between inputs and target?

  • $$l_{n}=\left(x_{n}-y_{n}\right)^{2}$$
  • torch.nn.MSELoss(size_average=None, reduce=None, reduction: str = 'mean')

Q: L1Loss and MSELoss code example?

In [38]:
import torch
import torch.nn as nn
import numpy as np
from utils.common_tools import set_seed
set_seed(1)  # 设置随机种子

inputs = torch.ones((2, 2))
target = torch.ones((2, 2)) * 3

loss_f = nn.L1Loss(reduction='none')
loss = loss_f(inputs, target)

print("input:{}\ntarget:{}\nL1 loss:{}".format(inputs, target, loss))

loss_f_mse = nn.MSELoss(reduction='none')
loss_mse = loss_f_mse(inputs, target)

print("MSE loss:{}".format(loss_mse))
input:tensor([[1., 1.],
        [1., 1.]])
target:tensor([[3., 3.],
        [3., 3.]])
L1 loss:tensor([[2., 2.],
        [2., 2.]])
MSE loss:tensor([[4., 4.],
        [4., 4.]])

Q: What does the smoothed L1 loss look like?

  • $$\operatorname{loss}(x, y)=\frac{1}{n} \sum_{i} z_{i}$$
  • $$z_{i}=\left\{\begin{array}{ll} 0.5\left(x_{i}-y_{i}\right)^{2}, & \text { if }\left|x_{i}-y_{i}\right|<1 \\ \left|x_{i}-y_{i}\right|-0.5, & \text { otherwise } \end{array}\right.$$
  • torch.nn.SmoothL1Loss(size_average=None, reduce=None, reduction: str = 'mean')

Q: SmoothL1Loss code example

In [40]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from utils.common_tools import set_seed
set_seed(1)  # 设置随机种子

inputs = torch.linspace(-3, 3, steps=500)
target = torch.zeros_like(inputs)

loss_f = nn.SmoothL1Loss(reduction='none')

loss_smooth = loss_f(inputs, target)

loss_l1 = np.abs(inputs.numpy())

plt.plot(inputs.numpy(), loss_smooth.numpy(), label='Smooth L1 Loss')
plt.plot(inputs.numpy(), loss_l1, label='L1 loss')
plt.xlabel('x_i - y_i')
plt.ylabel('loss value')
plt.legend()
plt.grid()
plt.show()

Q: What does the negative log-likelihood loss for a Poisson distribution look like?

  • torch.nn.PoissonNLLLoss(log_input: bool = True, full: bool = False, size_average=None, eps: float = 1e-08, reduce=None, reduction: str = 'mean')
  • log_input: whether the input is already in log space; this determines which formula is used
  • full: whether to compute the full loss including the Stirling approximation term, default False
  • eps: small constant added to avoid log(input) being nan when log_input=False
  • when log_input = True:  loss(input, target) = exp(input) - target * input
  • when log_input = False: loss(input, target) = input - target * log(input + eps)

Q: PoissonNLLLoss code example with hand verification

In [42]:
inputs = torch.randn((2, 2))
target = torch.randn((2, 2))

loss_f = nn.PoissonNLLLoss(log_input=True, full=False, reduction='none')
loss = loss_f(inputs, target)
print("input:{}\ntarget:{}\nPoisson NLL loss:{}".format(inputs, target, loss))

idx = 0
loss_1 = torch.exp(inputs[idx, idx]) - target[idx, idx]*inputs[idx, idx]
print("第一个元素loss:", loss_1)
input:tensor([[-1.0276, -0.5631],
        [-0.8923, -0.0583]])
target:tensor([[-0.1955, -0.9656],
        [ 0.4224,  0.2673]])
Poisson NLL loss:tensor([[0.1570, 0.0258],
        [0.7866, 0.9590]])
loss of the first element: tensor(0.1570)
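
The cell above checks the log_input=True formula; a short sketch (my addition) checking the log_input=False branch as well:

In [ ]:
import torch
import torch.nn as nn

# when log_input=False, loss = input - target * log(input + eps), with eps defaulting to 1e-8
inputs = torch.rand((2, 2)) + 0.1          # keep the rates strictly positive
target = torch.randn((2, 2)).abs()

loss_f = nn.PoissonNLLLoss(log_input=False, full=False, reduction='none')
loss = loss_f(inputs, target)

idx = 0
loss_1 = inputs[idx, idx] - target[idx, idx] * torch.log(inputs[idx, idx] + 1e-8)
print(loss[idx, idx], loss_1)              # the two values should agree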

4. Optimizers (I)

Q: What does the optimizer class look like in PyTorch?

  • class Optimizer(object):
      def __init__(self, params, defaults):
          self.defaults = defaults
          self.state = defaultdict(dict)
          self.param_groups = []
    
          self.param_groups = [{'params': param_groups}]
    
  • defaults: the optimizer's hyper-parameters
  • state: per-parameter caches, e.g. momentum buffers
  • param_groups: the parameter groups being managed
  • _step_count: number of update steps taken, used by learning-rate schedulers

Q: What methods does a PyTorch optimizer provide?

  • zero_grad(): clear the gradients of all managed parameters
  • step(): perform a single update step
  • add_param_group(): add a parameter group
  • state_dict(): get a dict holding the optimizer's current state
  • load_state_dict(): load a state dict

Q: Code example for the optimizer's step and zero_grad

In [44]:
import torch
import torch.optim as optim
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

print("weight before step:{}".format(weight.data))
optimizer.step()        # try lr=1 vs lr=0.1 and compare the results
print("weight after step:{}".format(weight.data))

print("weight in optimizer:{}\nweight in weight:{}\n".format(id(optimizer.param_groups[0]['params'][0]), id(weight)))

print("weight.grad is {}\n".format(weight.grad))
optimizer.zero_grad()
print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))
weight before step:tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]])
weight after step:tensor([[ 0.5614,  0.1669],
        [-0.0383,  0.5213]])
weight in optimizer:140728466157072
weight in weight:140728466157072

weight.grad is tensor([[1., 1.],
        [1., 1.]])

after optimizer.zero_grad(), weight.grad is
tensor([[0., 0.],
        [0., 0.]])

Q: Code example for the optimizer's add_param_group

In [46]:
import torch
import torch.optim as optim
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

w2 = torch.randn((3, 3), requires_grad=True)

optimizer.add_param_group({"params": w2, 'lr': 0.0001})

print("\noptimizer.param_groups is\n{}".format(optimizer.param_groups))
optimizer.param_groups is
[{'params': [tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]

optimizer.param_groups is
[{'params': [tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}, {'params': [tensor([[-0.4519, -0.1661, -1.5228],
        [ 0.3817, -1.0276, -0.5631],
        [-0.8923, -0.0583, -0.1955]], requires_grad=True)], 'lr': 0.0001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]

Q: Code example for the optimizer's state_dict and load_state_dict

In [48]:
import torch
import torch.optim as optim
from utils.common_tools import set_seed

set_seed(1)  # 设置随机种子

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

# ----------------------------------- state_dict -----------------------------------

optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
opt_state_dict = optimizer.state_dict()

print("state_dict before step:\n", opt_state_dict)

for i in range(10):
    optimizer.step()

print("state_dict after step:\n", optimizer.state_dict())

torch.save(optimizer.state_dict(), "optimizer_state_dict.pkl")

# -----------------------------------load state_dict -----------------------------------

optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
state_dict = torch.load("optimizer_state_dict.pkl")

print("\nstate_dict before load state:\n", optimizer.state_dict())
optimizer.load_state_dict(state_dict)
print("state_dict after load state:\n", optimizer.state_dict())
state_dict before step:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]}
state_dict after step:
 {'state': {140728466498736: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]}

state_dict before load state:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]}
state_dict after load state:
 {'state': {140728466498736: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]}

5. Optimizers (II)

Q: What is optim.SGD?

  • torch.optim.SGD(params, lr=<required parameter>, momentum=0, dampening=0, weight_decay=0, nesterov=False)
  • params: the parameters (or parameter groups) to manage
  • lr: initial learning rate
  • momentum: momentum coefficient (the beta term)
  • weight_decay: L2 regularization coefficient
  • nesterov: whether to use Nesterov accelerated gradient (NAG); see the usage sketch below
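
A minimal usage sketch (my addition) of these arguments on a toy parameter; with momentum the update is v ← momentum·v + g followed by w ← w − lr·v:

In [ ]:
import torch
import torch.optim as optim

w = torch.randn((2, 2), requires_grad=True)

# SGD with momentum and L2 weight decay, Nesterov disabled
optimizer = optim.SGD([w], lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=False)

for step in range(3):
    loss = (w ** 2).sum()       # toy loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(step, loss.item())    # the loss shrinks as w is driven toward zero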