@@ -25,6 +25,7 @@ public class LSTMCell : SimpleCell
        public double netCellState;
        public double previousCellState;
        public double cellState;
+       public double yCellState;

        //internal weights and deltas
        public double wCellIn;
@@ -66,8 +67,8 @@ public class LSTMRNN : RNN
        private new Vector4 vecMaxGrad;
        private new Vector4 vecMinGrad;

-       private new Vector3 vecMaxGrad3;
-       private new Vector3 vecMinGrad3;
+       private Vector3 vecMaxGrad3;
+       private Vector3 vecMinGrad3;

        public LSTMRNN()
        {
@@ -453,31 +454,24 @@ public override void initWeights()
            }
        }

-       public void LSTMCellInit(LSTMCell c)
+       public void LSTMCellInit(LSTMCell c, bool bBias = false)
        {
-           //input gate
-           c.netIn = 0;
-           c.yIn = 0;
-
-           //forget gate
-           c.netForget = 0;
-           c.yForget = 0;
-
-           //cell state
-           c.netCellState = 0;
-           c.previousCellState = 0; //this is important
+           c.previousCellState = 0;
            c.cellState = 0;

            //partial derivatives
            c.dSWCellIn = 0;
            c.dSWCellForget = 0;

-           //output gate
-           c.netOut = 0;
-           c.yOut = 0;
-
-           //cell output
-           c.cellOutput = 0;
+           if (bBias == false)
+           {
+               //cell output
+               c.cellOutput = 0;
+           }
+           else
+           {
+               c.cellOutput = 1.0;
+           }
        }

        public override void CleanStatus()
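LSTMCellInit now takes a bBias flag, and the per-gate resets (netIn, yIn, netForget, yForget, netOut, yOut) are dropped from the initializer. As the later hunks show, callers pass i == L1 - 1, so the last hidden cell keeps cellOutput pinned at 1.0 and appears to act as a constant bias unit for the layer above; the training and forward loops are narrowed to L1 - 1 cells accordingly. A minimal sketch of the resulting invariant, using the neuHidden and L1 members from the hunks below:

```csharp
// Sketch: after initialization, the last cell behaves as a bias unit.
for (int i = 0; i < L1; i++)
{
    neuHidden[i] = new LSTMCell();
    LSTMCellInit(neuHidden[i], i == L1 - 1); // bBias is true only for the last cell
}
// Invariant: neuHidden[L1 - 1].cellOutput == 1.0, and the Parallel.For loops
// below run over [0, L1 - 1), so the bias cell's output is never overwritten.
```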
@@ -544,7 +538,7 @@ private void CreateCell(BinaryReader br)
            for (int i = 0; i < L1; i++)
            {
                neuHidden[i] = new LSTMCell();
-               LSTMCellInit(neuHidden[i]);
+               LSTMCellInit(neuHidden[i], i == L1 - 1);
            }

            if (br != null)
@@ -626,27 +620,26 @@ public override void LearnNet(State state, int numStates, int curState)
            int sparseFeatureSize = sparse.Count;

            //put variables for derivatives in weight class and cell class
-           Parallel.For(0, L1, parallelOption, i =>
+           Parallel.For(0, L1 - 1, parallelOption, i =>
            {
                LSTMCell c = neuHidden[i];

                //using the error find the gradient of the output gate
-               var gradientOutputGate = (float)(SigmoidDerivative(c.netOut) * c.cellState * c.er);
+               var gradientOutputGate = (float)(SigmoidDerivative(c.netOut) * TanH(c.cellState) * c.er);

                //internal cell state error
-               var cellStateError = (float)(c.yOut * c.er);
+               var cellStateError = (float)(c.er);

                Vector4 vecErr = new Vector4(cellStateError, cellStateError, cellStateError, gradientOutputGate);
-               vecErr = Vector4.Clamp(vecErr, vecMinGrad, vecMaxGrad);

                var Sigmoid2Derivative_ci_netCellState_mul_ci_yIn = TanHDerivative(c.netCellState) * c.yIn;
                var Sigmoid2_ci_netCellState_mul_SigmoidDerivative_ci_netIn = TanH(c.netCellState) * SigmoidDerivative(c.netIn);
                var ci_previousCellState_mul_SigmoidDerivative_ci_netForget = c.previousCellState * SigmoidDerivative(c.netForget);

                Vector3 vecDerivate = new Vector3(
-                   (float)Sigmoid2_ci_netCellState_mul_SigmoidDerivative_ci_netIn,
-                   (float)ci_previousCellState_mul_SigmoidDerivative_ci_netForget,
-                   (float)Sigmoid2Derivative_ci_netCellState_mul_ci_yIn);
+                   (float)(Sigmoid2_ci_netCellState_mul_SigmoidDerivative_ci_netIn * c.yOut),
+                   (float)(ci_previousCellState_mul_SigmoidDerivative_ci_netForget * c.yOut),
+                   (float)(Sigmoid2Derivative_ci_netCellState_mul_ci_yIn * c.yOut));
                float c_yForget = (float)c.yForget;
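These deltas track the new forward definition cellOutput = Sigmoid(netOut) * TanH(cellState) (see the computeHiddenLayer hunks below). A sketch of the chain rule the new lines implement, written as comments; err stands for c.er:

```csharp
// Forward:  cellOutput = Sigmoid(netOut) * TanH(cellState)
//
// Output gate: d(cellOutput)/d(netOut) = SigmoidDerivative(netOut) * TanH(cellState)
//   => gradientOutputGate = SigmoidDerivative(netOut) * TanH(cellState) * err
//
// Cell state paths (cellState = yForget * previousCellState + yIn * TanH(netCellState)):
//   d(cellState)/d(netIn)        = TanH(netCellState) * SigmoidDerivative(netIn)
//   d(cellState)/d(netForget)    = previousCellState * SigmoidDerivative(netForget)
//   d(cellState)/d(netCellState) = TanHDerivative(netCellState) * yIn
//
// The yOut factor that previously sat in cellStateError is now folded into each
// component of vecDerivate, so cellStateError reduces to the raw error term err.
```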
@@ -668,8 +661,9 @@ public override void LearnNet(State state, int numStates, int curState)
                    //Computing final err delta
                    Vector4 vecDelta = new Vector4(wd, entry.Value);
                    vecDelta = vecErr * vecDelta;
+                   vecDelta = Vector4.Clamp(vecDelta, vecMinGrad, vecMaxGrad);

-                   //Computing actual learning rate
+                   //Computing actual learning rate
                    Vector4 vecLearningRate = ComputeLearningRate(vecDelta, ref wlr_i[entry.Key]);
                    w_i[entry.Key] += vecLearningRate * vecDelta;
                }
@@ -693,6 +687,7 @@ public override void LearnNet(State state, int numStates, int curState)

                    Vector4 vecDelta = new Vector4(wd, feature);
                    vecDelta = vecErr * vecDelta;
+                   vecDelta = Vector4.Clamp(vecDelta, vecMinGrad, vecMaxGrad);

                    //Computing actual learning rate
                    Vector4 vecLearningRate = ComputeLearningRate(vecDelta, ref wlr_i[j]);
@@ -709,11 +704,10 @@ public override void LearnNet(State state, int numStates, int curState)
                //update internal weights
                Vector3 vecCellDelta = new Vector3((float)c.dSWCellIn, (float)c.dSWCellForget, (float)c.cellState);
                Vector3 vecCellErr = new Vector3(cellStateError, cellStateError, gradientOutputGate);
+               vecCellDelta = vecCellErr * vecCellDelta;

                //Normalize err by gradient cut-off
-               vecCellErr = Vector3.Clamp(vecCellErr, vecMinGrad3, vecMaxGrad3);
-
-               vecCellDelta = vecCellErr * vecCellDelta;
+               vecCellDelta = Vector3.Clamp(vecCellDelta, vecMinGrad3, vecMaxGrad3);

                //Computing actual learning rate
                Vector3 vecCellLearningRate = ComputeLearningRate(vecCellDelta, ref CellLearningRate[i]);
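In all three update paths the gradient cut-off moves from the raw error to the final delta, so the clamp now bounds the quantity that is actually scaled by the learning rate and added to the weights. A hedged sketch of the before/after pattern; vecInput, weights, and wlr are illustrative names, not identifiers from this file:

```csharp
// Before: the error was clamped once up front, but the delta (error * input)
// could still exceed the cut-off when the input activation was large.
// vecErr   = Vector4.Clamp(vecErr, vecMinGrad, vecMaxGrad);
// vecDelta = vecErr * vecInput;

// After: clamp the delta itself, immediately before the weight update.
Vector4 vecDelta = vecErr * vecInput;                        // vecInput: illustrative
vecDelta = Vector4.Clamp(vecDelta, vecMinGrad, vecMaxGrad);
Vector4 vecLearningRate = ComputeLearningRate(vecDelta, ref wlr);
weights += vecLearningRate * vecDelta;                       // weights: illustrative
```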
@@ -737,7 +731,7 @@ public override void computeHiddenLayer(State state, bool isTrain = true)
            var sparse = state.SparseData;
            int sparseFeatureSize = sparse.Count;

-           Parallel.For(0, L1, parallelOption, j =>
+           Parallel.For(0, L1 - 1, parallelOption, j =>
            {
                LSTMCell cell_j = neuHidden[j];
@@ -780,14 +774,15 @@ public override void computeHiddenLayer(State state, bool isTrain = true)
                cell_j.netForget += cell_j.previousCellState * cell_j.wCellForget;
                cell_j.yForget = Sigmoid(cell_j.netForget);

+               cell_j.yCellState = TanH(cell_j.netCellState);
                if (cell_j.mask == true)
                {
                    cell_j.cellState = 0;
                }
                else
                {
                    //cell state equals the previous cell state multiplied by the forget gate, plus the cell input multiplied by the input gate
-                   cell_j.cellState = cell_j.yForget * cell_j.previousCellState + cell_j.yIn * TanH(cell_j.netCellState);
+                   cell_j.cellState = cell_j.yForget * cell_j.previousCellState + cell_j.yIn * cell_j.yCellState;
                }

                if (isTrain == false)
@@ -801,7 +796,7 @@ public override void computeHiddenLayer(State state, bool isTrain = true)
                //squash output gate
                cell_j.yOut = Sigmoid(cell_j.netOut);

-               cell_j.cellOutput = cell_j.cellState * cell_j.yOut;
+               cell_j.cellOutput = TanH(cell_j.cellState) * cell_j.yOut;

                neuHidden[j] = cell_j;
            });
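With TanH applied to the cell state, the forward pass now matches the standard LSTM formulation h = o * tanh(c), so cellOutput is bounded to (-1, 1) instead of growing with the unbounded cell state, and the new yCellState field caches TanH(netCellState) for reuse in LearnNet. A condensed sketch of the per-cell step this method now computes (the net* sums are accumulated earlier in the method):

```csharp
cell_j.yIn        = Sigmoid(cell_j.netIn);                // input gate
cell_j.yForget    = Sigmoid(cell_j.netForget);            // forget gate
cell_j.yCellState = TanH(cell_j.netCellState);            // squashed cell input (cached)
cell_j.cellState  = cell_j.yForget * cell_j.previousCellState
                  + cell_j.yIn * cell_j.yCellState;       // c_t = f * c_{t-1} + i * g
cell_j.yOut       = Sigmoid(cell_j.netOut);               // output gate
cell_j.cellOutput = TanH(cell_j.cellState) * cell_j.yOut; // h_t = o * tanh(c_t)
```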
@@ -825,7 +820,7 @@ public override void netReset(bool updateNet = false) //cleans hidden layer ac
            for (int i = 0; i < L1; i++)
            {
                neuHidden[i].mask = false;
-               LSTMCellInit(neuHidden[i]);
+               LSTMCellInit(neuHidden[i], i == L1 - 1);
            }

            if (Dropout > 0 && updateNet == true)
0 commit comments