I've been working on a neural net class that I can later turn into a library of my own. I'm doing this primarily to get a good understanding of nets, and I've been taking all the formulas from pure maths lectures, so I might have a few small details wrong. (I had no idea how to do any of this before I started.)
In this net I have coded a normal SGD algorithm and then a momentum algorithm (or at least what I think is one).
When I run the net on my simple data set using SGD, it works perfectly with no problems at all. But if I try SGD with momentum, the net does not learn at all; even after 10000 iterations the loss stays around 0.7.
I have been back and forth, checking the formula against many references, and while I still doubt I completely understand it, I feel the problem is definitely somewhere in my code, but I can't figure out where. I have tried many combinations of alpha and lambda values, and many reasonable combinations of layers and neurons (specifically more than one hidden layer with the momentum formula, but it doesn't work with one hidden layer either).
I am going to post the code for the full net, so if anyone is willing to quickly scan through it and see if anything seems obviously wrong, that would be much appreciated. I feel the fault might lie in the Updateweights() function, since that is where most of the calculation happens, but it could also be in the calcema() function.
I have tried changing the weight update formula from W = W - (alpha * partial derivative) to W = W + (alpha * PD) (keeping the PD positive instead of negating it), and I have also tried removing the regularizer from the momentum update formula, but none of it has made a difference.
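For reference, this is a minimal sketch of what I understand the two updates to be, on a single weight vector, written separately from my actual class. The names grads, velocity and beta here are placeholders I made up for the sketch, not members of the Net code below:

#include <vector>

// Plain SGD on one weight vector: W = W - alpha * dL/dW
void sgdstep(std::vector<double>& weights, const std::vector<double>& grads, double alpha)
{
    for (size_t i = 0; i < weights.size(); i++)
        weights[i] -= alpha * grads[i];
}

// SGD with momentum: keep a running, exponentially decaying velocity per weight,
// v = beta * v + alpha * dL/dW, then step with W = W - v
void momentumstep(std::vector<double>& weights, std::vector<double>& velocity,
                  const std::vector<double>& grads, double alpha, double beta)
{
    for (size_t i = 0; i < weights.size(); i++)
    {
        velocity[i] = beta * velocity[i] + alpha * grads[i];
        weights[i] -= velocity[i];
    }
}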
I am still very new to this and trying my best, so any feedback is appreciated.
Here is a sample from the input file:
in: 0.6 0.34 0.32 0.78
out: 1.0 0.0 0.0
in: 0.36 0.52 0.75 0.67
out: 1.0 0.0 0.0
in: 0.29 0.034 0.79 0.5
out: 0.0 1.0 0.0
in: 0.21 0.29 0.47 0.62
out: 0.0 1.0 0.0
in: 0.67 0.57 0.42 0.19
out: 0.0 1.0 0.0
in: 0.48 0.22 0.79 0.0096
out: 0.0 1.0 0.0
in: 0.75 0.48 0.61 0.67
out: 1.0 0.0 0.0
in: 0.41 0.96 0.65 0.074
out: 1.0 0.0 0.0
in: 0.19 0.88 0.68 0.1
out: 0.0 1.0 0.0
in: 0.9 0.89 0.95 0.45
out: 1.0 0.0 0.0
in: 0.71 0.58 0.95 0.013
out: 1.0 0.0 0.0
in: 0.66 0.043 0.073 0.98
out: 0.0 1.0 0.0
in: 0.12 0.37 0.2 0.22
out: 0.0 0.0 1.0
in: 0.11 0.38 0.54 0.64
out: 0.0 1.0 0.0
in: 0.42 0.81 0.94 0.98
out: 1.0 0.0 0.0
If anyone would like the full input file, let me know; I just don't know how to post files on here, but I will find a way.
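The file is just alternating "in:" and "out:" lines, each followed by space-separated values. As a rough sketch of the layout (this is not my actual loading code, which isn't posted here), it can be read like this:

#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Rough sketch of reading the "in:" / "out:" format above, just to show the file layout.
struct Sample
{
    std::vector<double> in;
    std::vector<double> out;
};

std::vector<Sample> loadsamples(const std::string& path)
{
    std::vector<Sample> samples;
    std::ifstream file(path);
    std::string line;
    while (std::getline(file, line))
    {
        std::istringstream ss(line);
        std::string tag;
        ss >> tag; // "in:" or "out:"
        std::vector<double> vals;
        for (double v; ss >> v;) vals.push_back(v);
        if (tag == "in:")
        {
            samples.push_back(Sample{});
            samples.back().in = vals;
        }
        else if (tag == "out:" && !samples.empty())
        {
            samples.back().out = vals;
        }
    }
    return samples;
}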
So, specifically, my problem is that when I use SGD with momentum (or what I think is SGD with momentum), my net does not learn at all and gets stuck at a loss of around 0.7, but if I use normal SGD it works perfectly.
The code:
#include <iostream>
#include <vector>
#include <iomanip>
#include <cmath>
#include <random>
#include <fstream>
#include <chrono>
#include <sstream>
#include <string>
#include <assert.h>
// Activation function: note that for val < 0 this is really an ELU-style curve, 0.01 * (exp(val) - 1), not a plain ReLU
double Relu(double val)
{
if (val < 0) return 0.01 * (exp(val) - 1);
else return val;
}
// Derivative of the activation above: 0.01 * exp(val) for val < 0 (which equals Relu(val) + 0.01), 1 otherwise
double Reluderiv(double val)
{
if (val < 0) return Relu(val) + 0.01;
else return 1;
}
// Xavier/Glorot-style init scale: sqrt(2 / (fan_in + fan_out))
double randdist(double x, double y)
{
return sqrt(2.0 / (x + y));
}
int randomt(int x, int y)
{
std::random_device rd;
std::mt19937 mt(rd());
std::uniform_real_distribution<double> dist(x, y);
return round(dist(mt));
}
class INneuron
{
public:
double val{};
std::vector <double> weights{};
std::vector <double> weightderivs{};
std::vector <double> emavals{};
};
class HIDneuron
{
public:
double preactval{};
double actval{};
double actvalPD{};
double preactvalPD{};
std::vector <double> weights{};
std::vector <double> weightderivs{};
std::vector <double> emavals{};
double bias{};
double biasderiv{};
double biasema{};
};
class OUTneuron
{
public:
double preactval{};
double actval{};
double preactvalPD{};
double bias{};
double biasderiv{};
double biasema{};
};
class Net
{
public:
Net(int netdimensions, int hidlayers, int hidneurons, int outneurons, int inneurons, double lambda, double alpha)
{
NETDIMENSIONS = netdimensions; HIDLAYERS = hidlayers; HIDNEURONS = hidneurons; OUTNEURONS = outneurons; INNEURONS = inneurons; Lambda = lambda; Alpha = alpha;
}
void defineoptimizer(std::string optimizer);
void Feedforward(const std::vector <double>& invec);
void Backprop(const std::vector <double>& targets);
void Updateweights();
void printvalues(double totalloss);
void Initweights();
void softmax();
double regularize(double weight,std::string type);
double lossfunc(const std::vector <double>& target);
void calcema(int Layernum, int neuron, int weight, std::string layer, std::string BorW);
private:
INneuron Inn;
HIDneuron Hidn;
OUTneuron Outn;
std::vector <std::vector <HIDneuron>> Hidlayers{};
std::vector <INneuron> Inlayer{};
std::vector <OUTneuron> Outlayer{};
double NETDIMENSIONS{};
double HIDLAYERS{};
double HIDNEURONS{};
double OUTNEURONS{};
double INNEURONS{};
double Lambda{};
double Alpha{};
double loss{};
int optimizerformula{};
};
void Net::defineoptimizer(std::string optimizer)
{
if (optimizer == "ExpAvrg")
{
optimizerformula = 1;
}
else if (optimizer == "SGD")
{
optimizerformula = 2;
}
else if (optimizer == "Adam")
{
optimizerformula = 3;
}
else if (optimizer == "MinibatchSGD")
{
optimizerformula = 4;
}
else {
std::cout << "no optimizer matching description" << '
';
abort();
}
}
double Net::regularize(double weight,std::string type)
{
if (type == "L1")
{
double absval{ weight };
/*if (weight < 0) absval = weight * -1;
else if (weight > 0 || weight == 0) absval = weight;
else;*/
if (absval > 0.0) return 1.0;
else if (absval < 0.0) return -1.0;
else if (absval == 0.0) return 0.0;
else return 2;
}
else if (type == "l2")
{
double absval{};
if (weight < 0.0) absval = weight * -1.0;
else absval = weight;
return (2.0 * absval);
}
else { std::cout << "no regularizer recognized" << '
'; abort(); }
}
void Net::softmax()
{
double sum{};
for (size_t Osize = 0; Osize < Outlayer.size(); Osize++)
{
sum += exp(Outlayer[Osize].preactval);
}
for (size_t Osize = 0; Osize < Outlayer.size(); Osize++)
{
Outlayer[Osize].actval = exp(Outlayer[Osize].preactval) / sum;
}
}
void Net::Initweights()
{
unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
std::default_random_engine generator(seed);
std::normal_distribution<double> distribution(0.0, 1.0);
for (int WD = 0; WD < HIDLAYERS + 1; WD++)
{
if (WD == 0)
{
for (int WL = 0; WL < INNEURONS; WL++)
{
Inlayer.push_back(Inn);
for (int WK = 0; WK < HIDNEURONS; WK++)
{
double val = distribution(generator) * randdist(INNEURONS, HIDNEURONS);
Inlayer.back().weights.push_back(val);
Inlayer.back().weightderivs.push_back(0.0);
Inlayer.back().emavals.push_back(0.0);
}
}
}
else if (WD < HIDLAYERS && WD != 0)
{
Hidlayers.push_back(std::vector <HIDneuron>());
for (int WL = 0; WL < HIDNEURONS; WL++)
{
Hidlayers.back().push_back(Hidn);
for (int WK = 0; WK < HIDNEURONS; WK++)
{
double val = distribution(generator) * randdist(HIDNEURONS, HIDNEURONS);
Hidlayers.back().back().weights.push_back(val);
Hidlayers.back().back().weightderivs.push_back(0.0);
Hidlayers.back().back().emavals.push_back(0.0);
}
Hidlayers.back().back().bias = 0.0;
Hidlayers.back().back().biasderiv = 0.0;
Hidlayers.back().back().biasema = 0.0;
}
}
else if (WD == HIDLAYERS)
{
Hidlayers.push_back(std::vector <HIDneuron>());
for (int WL = 0; WL < HIDNEURONS; WL++)
{
Hidlayers.back().push_back(Hidn);
for (int WK = 0; WK < OUTNEURONS; WK++)
{
double val = distribution(generator) * randdist(HIDNEURONS, OUTNEURONS);
Hidlayers.back().back().weights.push_back(val);
Hidlayers.back().back().weightderivs.push_back(0.0);
Hidlayers.back().back().emavals.push_back(0.0);
}
Hidlayers.back().back().bias = 0.0;
Hidlayers.back().back().biasderiv = 0.0;
Hidlayers.back().back().biasema = 0.0;
}
}
}
for (int i = 0; i < OUTNEURONS; i++)
{
Outlayer.push_back(Outn);
Outlayer.back().bias = 0.0;
Outlayer.back().biasderiv = 0.0;
Outlayer.back().biasema = 0.0;
}
}
void Net::Feedforward(const std::vector <double>& invec)
{
for (size_t I = 0; I < Inlayer.size(); I++)
{
Inlayer[I].val = invec[I];
}
for (size_t h = 0; h < Hidlayers[0].size(); h++)
{
double preval = Hidlayers[0][h].bias;
for (size_t I = 0;I < Inlayer.size(); I++)
{
preval += Inlayer[I].val * Inlayer[I].weights[h];
}
Hidlayers[0][h].preactval = preval;
Hidlayers[0][h].actval = Relu(preval);
}
for (size_t H = 1; H < Hidlayers.size();H++)
{
size_t prevh = H - 1;
for (size_t h = 0; h < Hidlayers[H].size(); h++)
{
double preval = Hidlayers[H][h].bias;
for (size_t p = 0; p < Hidlayers[prevh].size(); p++)
{
preval += Hidlayers[prevh][p].actval * Hidlayers[prevh][p].weights[h];
}
Hidlayers[H][h].preactval = preval;
Hidlayers[H][h].actval = Relu(preval);
}
}
for (size_t O = 0; O < Outlayer.size(); O++)
{
size_t lhid = Hidlayers.size() - 1;
double preval = Outlayer[O].bias;
for (size_t h = 0; h < Hidlayers[lhid].size(); h++)
{
preval += Hidlayers[lhid][h].actval * Hidlayers[lhid][h].weights[O];
}
Outlayer[O].preactval = preval;
}
}
void Net::Backprop(const std::vector <double>& targets)
{
for (size_t O = 0; O < Outlayer.size(); O++)
{
double PDval{};
PDval = targets[O] - Outlayer[O].actval;
PDval = PDval * -1.0;
Outlayer[O].preactvalPD = PDval;
}
for (size_t H = Hidlayers.size(); H > 0; H--)
{
size_t Top = H;
size_t Current = H - 1;
for (size_t h = 0; h < Hidlayers[Current].size(); h++)
{
double actPD{};
double PreactPD{};
double biasPD{