深度学习中的异或问题

深度前馈网络（deep feedforward network），也叫作前馈神经网络（feedforward neural network）或者多层感知机（multilayer perceptron，MLP），是典型的深度学习模型。前馈网络的目标是近似某个目标函数f*。例如，对于分类器，y=f*(x)将输入x映射到一个类别y。前馈网络定义了一个映射y=f(x;θ)，并学习参数θ的值，使它能够得到对f*的最佳近似。
下面我们以“异或问题”为例，对深度前馈网络中的前向传播和反向传播进行说明。

上图反映了一个有趣的事实：无法用一条直线将异或的两类结果分开。不过，这只说明数据在原始输入空间中线性不可分；如果将数据映射到另一个特征空间，它就变得线性可分了。

基于以上事实，我们可以编写如下代码，用一个带隐藏层的前馈网络来求解异或问题：

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def s_prime(z):
    """Return the sigmoid derivative expressed through the sigmoid output.

    ``z`` is the already-activated value (the sigmoid's output), so the
    derivative is ``z * (1 - z)``.
    """
    complement = 1.0 - z
    # np.multiply keeps the product element-wise even for np.matrix inputs,
    # where the ``*`` operator would mean a matrix product.
    return np.multiply(z, complement)

def init_weights(layers, epsilon):
    """Create randomly initialised weight matrices for a feedforward network.

    Args:
        layers: Sequence of layer sizes, e.g. ``[2, 2, 1]`` for 2 inputs,
            2 hidden units and 1 output.
        epsilon: Half-width of the initialisation interval; every weight is
            drawn uniformly from ``[-epsilon, epsilon)``.

    Returns:
        A list with ``len(layers) - 1`` ``np.matrix`` objects. The matrix
        connecting layer ``i`` to layer ``i + 1`` has shape
        ``(layers[i + 1], layers[i] + 1)``; the extra column holds the bias
        weights.
    """
    weights = []
    for n_in, n_out in zip(layers[:-1], layers[1:]):
        # e.g. for layers [2, 2, 1]: a 2x3 matrix (input -> hidden) followed
        # by a 1x3 matrix (hidden -> output).
        w = np.random.rand(n_out, n_in + 1)
        # Rescale the samples from [0, 1) to [-epsilon, epsilon).
        w = w * 2 * epsilon - epsilon
        # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
        weights.append(np.asmatrix(w))
    return weights

def fit(X, Y, w):
    """Run one forward/backward pass over all samples.

    The weights are not modified; the caller applies the returned gradients.

    Args:
        X: 2-D sample matrix with one sample per row (4x2 for XOR).
        Y: 2-D label container of shape (1, m); ``Y[0, i]`` is the label of
            sample ``i``.
        w: List of weight matrices as produced by ``init_weights``; each
            matrix carries a leading bias column.

    Returns:
        A dict with:
            'w_grad': list of gradient matrices, one per entry of ``w`` and
                of the same shape, averaged over the samples.
            'J': the mean cross-entropy cost over the samples.
            'h': (m, 1) array with the network output for every sample.

    Note:
        A single output unit is assumed: only the first output activation is
        recorded in 'h'.
    """
    # One zero matrix per weight matrix, used to accumulate the gradients.
    w_grad = [np.asmatrix(np.zeros(np.shape(w_l))) for w_l in w]
    m = X.shape[0]
    h_total = np.zeros((m, 1))  # predicted probability for every sample
    for i in range(m):
        x = X[i]
        y = Y[0, i]

        # Forward propagation.
        a = x
        a_s = []  # bias-augmented input of every layer, kept for backprop
        for w_l in w:
            # Prepend the bias unit and turn the activation into a column.
            a = np.asmatrix(np.append(1, a)).T
            a_s.append(a)
            a = sigmoid(w_l * a)
        # Extract the scalar explicitly instead of assigning a 1x1 matrix.
        h_total[i, 0] = a[0, 0]

        # Back propagation.
        delta = a - y  # output-layer error
        w_grad[-1] += delta * a_s[-1].T
        # Walk backwards over the hidden layers (input layer excluded).
        for j in reversed(range(1, len(w))):
            # s_prime receives the activation a, not the pre-activation z.
            # The first row belongs to the bias unit, which has no incoming
            # weights, so it is dropped before the error is used or passed
            # further down; keeping it breaks networks with more than one
            # hidden layer.
            delta = np.multiply(w[j].T * delta, s_prime(a_s[j]))[1:]
            w_grad[j - 1] += delta * a_s[j - 1].T
    w_grad = [g / m for g in w_grad]

    # Cross-entropy cost, computed element-wise on plain arrays so it does
    # not depend on np.matrix multiplication semantics.
    y_col = np.asarray(Y, dtype=float).reshape(-1, 1)
    J = (1.0 / m) * np.sum(
        -y_col * np.log(h_total) - (1.0 - y_col) * np.log(1.0 - h_total)
    )
    return {'w_grad': w_grad, 'J': J, 'h': h_total}


# XOR truth table: every row of X is one input pair and Y holds the matching
# labels. np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
X = np.asmatrix([[0, 0],
                 [0, 1],
                 [1, 0],
                 [1, 1]])
Y = np.asmatrix([0, 1, 1, 0])
layers = [2, 2, 1]  # 2 inputs, 2 hidden units, 1 output
epochs = 10000
alpha = 0.5  # learning rate
w = init_weights(layers, 1)
result = {'J': [], 'h': []}  # cost and predictions recorded at every epoch
w_s = {}  # gradients of the first and the last epoch, for inspection
for i in range(epochs):
    fit_result = fit(X, Y, w)
    w_grad = fit_result.get('w_grad')
    J = fit_result.get('J')
    h_current = fit_result.get('h')
    result['J'].append(J)
    result['h'].append(h_current)
    # Gradient-descent step on every weight matrix.
    for j in range(len(w)):
        w[j] -= alpha * w_grad[j]
    if i == 0 or i == (epochs - 1):
        w_s['w_' + str(i)] = w_grad[:]

# Plot the cost curve, then print the stored gradients and the predictions
# of the first and the last epoch.
plt.plot(result.get('J'))
plt.show()
print(w_s)
print(result.get('h')[0], result.get('h')[-1])

对以上代码的分析参见下图

有关反向传播中涉及的权重更新可以参照下图的计算过程