行列演算でニューラルネット

巷では何でもできると評判のニューラルネットワーク。
（少なくとも身内の中では）
でも学習に時間がかかる、局所解に陥るなどSVMと比べると欠点も多くて、最近不人気みたい。

ニューラルネットワークは万能だと洗脳されている僕にとっては、もっと人気が出てほしいのでソースコードを晒しておく。

とりあえず、サンプルとしてXOR問題を解くプログラムで、変数名はPRMLに合わせてある。

main.cpp

int main() 
{
	// 入力
	VectorXd x[4];
	for (int i = 0; i < 4; ++i) {
		x[i] = VectorXd::Zero(2);
	}
	x[0] << 0.0, 0.0;
	x[1] << 0.0, 1.0;
	x[2] << 1.0, 0.0;
	x[3] << 1.0, 1.0;

        // 教師信号
	VectorXd t[4];
	for (int i = 0; i < 4; ++i) {
		t[i] = VectorXd::Zero(1);
	}
	t[0] << 0.0;
	t[1] << 1.0;
	t[2] << 1.0;
	t[3] << 0.0;

	MatrixXd w0 = MatrixXd::Random(kNumHidden, 1 + kNumInput);
	MatrixXd w1 = MatrixXd::Random(kNumOutput, 1 + kNumHidden);

        // 学習回数300回
	for (int n = 0; n < 300; ++n) {
		for (int i = 0; i < 4; ++i) {
			learn(x[i], t[i], w0, w1);
		}
	}

        // 出力結果
	for (int i = 0; i < 4; ++i) {
		std::cout << feedFoward(x[i], w0, w1) << std::endl;
	}

	//std::cout << "w0\n" << w0 << std::endl;
	//std::cout << "w1\n" << w1 << std::endl;

	return 0;
}

main.cppのニューラルネット部

#include <Eigen/Dense>
using namespace Eigen;

// 活性化関数
//auto lambda_act = [](const double x) { return 1.7159*tanh(0.6667*x); };
//auto lambda_act_p = [](const double x) { return 1.7159*0.6667*(1.0 - 1.0/(1.7159*1.7159)*x*x); };

auto lambda_act = [](const double x) { return tanh(x); };
auto lambda_act_p = [](const double x) { return 1.0 - x*x; };

//auto lambda_act = [](const double x) { return 1.0 / (1+exp(-x)); };
//auto lambda_act_p = [](const double x) { return x*(1.0 - x); };

const int kNumInput = 2; // 入力数
const int kNumHidden = 3; // 隠れ層の大きさ
const int kNumOutput = 1; // 出力数
const double kEta = 0.3; // 学習係数

// softmax関数
void Softmax(VectorXd& in)
{
	VectorXd vec_exp = in.array().exp();
	const double sum_exp = vec_exp.sum();

	in = vec_exp.array() / sum_exp;
}

// 学習用
// 2-layer NN
void learn(const VectorXd& in, const VectorXd& tk, MatrixXd& w0, MatrixXd& w1)
{
	// Feedfoward
	VectorXd xi(1 + kNumInput);
	xi(0) = 1.0;
	xi.tail(kNumInput) = in; 
	VectorXd aj = w0 * xi;
  
	VectorXd zj(1 + kNumHidden);
	zj(0) = 1.0;
	zj.tail(kNumHidden) = aj.unaryExpr(lambda_act);

	VectorXd yk = w1 * zj;

	//Softmax(yk);

	// Error back propagate
	VectorXd delta_k = yk - tk;
	VectorXd delta_j = w1.transpose() * delta_k;

	delta_j = delta_j.array() * zj.unaryExpr(lambda_act_p).array();

	w1.array() -= kEta * (delta_k * zj.transpose()).array(); // TODO noalias()付ける
	w0.array() -= kEta * (delta_j.tail(kNumHidden) * xi.transpose()).array();
}

// 識別用
// learn()の上半分と同じ内容
VectorXd feedFoward(const VectorXd& in, const MatrixXd& w0, const MatrixXd& w1)
{
	VectorXd xi(1 + kNumInput);
	xi(0) = 1.0;
	xi.tail(kNumInput) = in; 

	VectorXd aj = w0 * xi;
	VectorXd zj(1 + kNumHidden);
	zj(0) = 1.0;
	zj.tail(kNumHidden) = aj.unaryExpr(lambda_act);

	VectorXd yk = w1 * zj;

	//Softmax(yk);

	return yk;
}

主力結果が

となれば学習成功。
行列演算を使って実装しているのでコードの見通しもいいし、謎の行列積アルゴリズムのおかげで速いと思う。（未確認）
ただ学習係数が固定なので可変にした方がいいし、weightの保存ができないので、実際使おうとすると面倒くさい。
学習係数に関しては自動的に値を更新する、iRPROP-のコードがあるので載せておく。（覚えていないので間違っているかも）
weight.h

#include <random>
#include <time.h>
#include <Eigen/Dense>
using namespace Eigen;

static const double kEta = 0.0025; // for sgm

static const double kEta0 = 0.0125;
static const double kEtaPlus = 1.2;
static const double kEtaNeg = 0.5;
static const double kEtaMax = 50;
static const double kEtaMin = 0.000001;

class Weight {
public:
	Weight(int rows, int cols) : rows_(rows), cols_(cols)
	{
		std::mt19937 engine( static_cast<unsigned long>(time(0)) );	
		std::uniform_real_distribution<double> dist(-0.005, 0.005);
		
		weight_ = MatrixXd::Zero(rows_, cols_);
		weight_ = weight_.unaryExpr([&](const double x) { return dist(engine); });

		pre_weight_ = weight_;
		pre_deriv_weight_ = MatrixXd::Zero(rows_, cols_);
		deriv_weight_ = MatrixXd::Zero(rows_, cols_);
		pre_step_size_ = MatrixXd::Constant(rows_, cols_, kEta0);
                step_size_ = pre_step_size;
	}

	const MatrixXd& weight() const {
		return weight_;
	}

	MatrixXd& deriv_weight() {
		return deriv_weight_;
	}

	void SGM() {
		weight_ -= kEta * deriv_weight_;
		deriv_weight_.setZero();
	}

	// iRprop-
	void iRprop_neg() {
		for ( int y = 0; y < rows_; ++y ) {
			for ( int x = 0; x < cols_; ++x ) {
				double has_sign_changed = pre_deriv_weight_(y, x) * deriv_weight_(y, x);

				if ( has_sign_changed > 0.0 ) {
					step_size_(y, x) = std::min(pre_step_size_(y, x)*kEtaPlus, kEtaMax);
				} else if ( has_sign_changed < 0.0 ) {
					step_size_(y, x) = std::max(pre_step_size_(y, x)*kEtaNeg, kEtaMin);
					deriv_weight_(y, x) = 0.0;
				}
				weight_(y, x) = pre_weight_(y, x) - sign(deriv_weight_(y, x)) * step_size_(y,x);
			}
		}

		pre_deriv_weight_ = deriv_weight_;
		pre_weight_ = weight_;
		pre_step_size_ = step_size_;
		deriv_weight_.setZero();
	}

private:
	const int rows_;
	const int cols_;
	MatrixXd pre_weight_;
	MatrixXd pre_deriv_weight_;	
	MatrixXd pre_step_size_;
	MatrixXd weight_;
	MatrixXd step_size_;
	MatrixXd deriv_weight_;

	double sign(const double x) {
		if ( x > 0.0 ) {
			return 1.0;
		} else if ( x < 0.0 ) {
			return -1.0;
		} else {
			return 0.0;
		}
	}
};