Skip to content

Commit 91a8a89

Browse files
committed
Improve SimpleRNN using SIMD instructions
1 parent 4b86462 commit 91a8a89

File tree

7 files changed

+172
-25
lines changed

7 files changed

+172
-25
lines changed

RNNSharp/BiRNN.cs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
using System.Threading.Tasks;
44
using AdvUtils;
55
using System.Collections.Generic;
6+
using System.Numerics;
67

78
/// <summary>
89
/// RNNSharp written by Zhongkai Fu ([email protected])
@@ -13,6 +14,7 @@ class BiRNN : RNN
1314
{
1415
private RNN forwardRNN;
1516
private RNN backwardRNN;
17+
private Vector<double> vecConst2 = new Vector<double>(2.0f);
1618

1719
public BiRNN(RNN s_forwardRNN, RNN s_backwardRNN)
1820
{
@@ -56,7 +58,7 @@ public override void CleanStatus()
5658
forwardRNN.CleanStatus();
5759
backwardRNN.CleanStatus();
5860

59-
Hidden2OutputWeightLearningRate = new Matrix<float>(L2, L1);
61+
Hidden2OutputWeightLearningRate = new Matrix<double>(L2, L1);
6062
}
6163

6264
public override void initWeights()
@@ -219,7 +221,7 @@ public override void InitMem()
219221
}
220222
}
221223

222-
Hidden2OutputWeightLearningRate = new Matrix<float>(L2, L1);
224+
Hidden2OutputWeightLearningRate = new Matrix<double>(L2, L1);
223225
}
224226

225227
public SimpleLayer[] InnerDecode(Sequence pSequence, out SimpleLayer[] outputHiddenLayer, out Matrix<double> rawOutputLayer)
@@ -266,9 +268,22 @@ public SimpleLayer[] InnerDecode(Sequence pSequence, out SimpleLayer[] outputHid
266268
SimpleLayer forwardCells = mForward[curState];
267269
SimpleLayer backwardCells = mBackward[curState];
268270

269-
for (int i = 0; i < forwardRNN.L1; i++)
271+
int i = 0;
272+
while (i < forwardRNN.L1 - Vector<double>.Count)
273+
{
274+
Vector<double> v1 = new Vector<double>(forwardCells.cellOutput, i);
275+
Vector<double> v2 = new Vector<double>(backwardCells.cellOutput, i);
276+
Vector<double> v = (v1 + v2) / vecConst2;
277+
278+
v.CopyTo(cells.cellOutput, i);
279+
280+
i += Vector<float>.Count;
281+
}
282+
283+
while (i < forwardRNN.L1)
270284
{
271285
cells.cellOutput[i] = (forwardCells.cellOutput[i] + backwardCells.cellOutput[i]) / 2.0;
286+
i++;
272287
}
273288
});
274289

RNNSharp/LSTMRNN.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ public override void CleanStatus()
496496

497497
});
498498

499-
Hidden2OutputWeightLearningRate = new Matrix<float>(L2, L1);
499+
Hidden2OutputWeightLearningRate = new Matrix<double>(L2, L1);
500500
vecLearningRate = new Vector4(LearningRate, LearningRate, LearningRate, LearningRate);
501501
vecLearningRate3 = new Vector3(LearningRate, LearningRate, LearningRate);
502502
}

RNNSharp/Matrix.cs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-

1+
using System.Numerics;
2+
23
/// <summary>
34
/// RNNSharp written by Zhongkai Fu ([email protected])
45
/// </summary>
56
namespace RNNSharp
67
{
7-
public class Matrix<T>
8+
public class Matrix<T> where T : struct
89
{
910

1011
public int Height { get; set; } // the number of rows
@@ -41,7 +42,22 @@ public Matrix<T> CopyTo()
4142

4243
for (int i = 0; i < Height; i++)
4344
{
44-
m_saData[i].CopyTo(m[i], 0);
45+
T[] m_i = m[i];
46+
T[] m_saData_i = m_saData[i];
47+
int j = 0;
48+
while (j < Width - Vector<T>.Count)
49+
{
50+
Vector<T> v1 = new Vector<T>(m_saData_i, j);
51+
v1.CopyTo(m_i, j);
52+
53+
j += Vector<T>.Count;
54+
}
55+
56+
while (j < Width)
57+
{
58+
m_i[j] = m_saData_i[j];
59+
j++;
60+
}
4561
}
4662

4763
return m;

RNNSharp/RNN.cs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
using System.Threading.Tasks;
44
using System.IO;
55
using AdvUtils;
6+
using System.Numerics;
67

78
/// <summary>
89
/// RNNSharp written by Zhongkai Fu ([email protected])
@@ -61,7 +62,7 @@ abstract public class RNN
6162
public Matrix<double> CRFTagTransWeights { get; set; }
6263
public SimpleLayer OutputLayer { get; set; }
6364
public Matrix<double> Hidden2OutputWeight;
64-
public Matrix<float> Hidden2OutputWeightLearningRate;
65+
public Matrix<double> Hidden2OutputWeightLearningRate;
6566

6667
// CRF result output
6768
protected Matrix<double> CRFSeqOutput;
@@ -96,10 +97,10 @@ protected SimpleCell[] InitSimpleCell(int size)
9697
return cells;
9798
}
9899

99-
public double UpdateLearningRate(Matrix<float> m, int i, int j, double delta)
100+
public double UpdateLearningRate(Matrix<double> m, int i, int j, double delta)
100101
{
101102
double dg = m[i][j] + delta * delta;
102-
m[i][j] = (float)dg;
103+
m[i][j] = dg;
103104

104105
return LearningRate / (1.0 + Math.Sqrt(dg));
105106
}
@@ -644,10 +645,23 @@ public void matrixXvectorADD(SimpleLayer dest, SimpleLayer srcvec, Matrix<double
644645
{
645646
double[] vector_i = srcmatrix[i];
646647
double cellOutput = 0;
647-
for (int j = 0; j < SrcSize; j++)
648+
int j = 0;
649+
650+
while (j < SrcSize - Vector<double>.Count)
651+
{
652+
Vector<double> v1 = new Vector<double>(srcvec.cellOutput, j);
653+
Vector<double> v2 = new Vector<double>(vector_i, j);
654+
cellOutput += Vector.Dot<double>(v1, v2);
655+
656+
j += Vector<double>.Count;
657+
}
658+
659+
while (j < SrcSize)
648660
{
649661
cellOutput += srcvec.cellOutput[j] * vector_i[j];
662+
j++;
650663
}
664+
651665
dest.cellOutput[i] = cellOutput;
652666
});
653667

RNNSharp/RNNSharp.csproj

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737
<Reference Include="System" />
3838
<Reference Include="System.Core" />
3939
<Reference Include="System.Numerics" />
40-
<Reference Include="System.Numerics.Vectors" />
40+
<Reference Include="System.Numerics.Vectors, Version=4.1.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
41+
<HintPath>..\packages\System.Numerics.Vectors.4.1.0\lib\net46\System.Numerics.Vectors.dll</HintPath>
42+
<Private>True</Private>
43+
</Reference>
4144
<Reference Include="System.Xml.Linq" />
4245
<Reference Include="System.Data.DataSetExtensions" />
4346
<Reference Include="Microsoft.CSharp" />
@@ -70,6 +73,9 @@
7073
<Compile Include="Vector.cs" />
7174
<Compile Include="WordEMWrapFeaturizer.cs" />
7275
</ItemGroup>
76+
<ItemGroup>
77+
<None Include="packages.config" />
78+
</ItemGroup>
7379
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
7480
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
7581
Other similar extension points exist, see Microsoft.Common.targets.

RNNSharp/SimpleRNN.cs

Lines changed: 101 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using System.Threading.Tasks;
33
using System.IO;
44
using AdvUtils;
5+
using System.Numerics;
56

67
/// <summary>
78
/// RNNSharp written by Zhongkai Fu ([email protected])
@@ -31,9 +32,13 @@ public class SimpleRNN : RNN
3132
protected Matrix<double> Feature2HiddenWeights { get; set; }
3233

3334
//The learning ratio of each weight
34-
protected Matrix<float> HiddenBpttWeightsLearningRate { get; set; }
35-
protected Matrix<float> Input2HiddenWeightsLearningRate { get; set; }
36-
protected Matrix<float> Feature2HiddenWeightsLearningRate { get; set; }
35+
protected Matrix<double> HiddenBpttWeightsLearningRate { get; set; }
36+
protected Matrix<double> Input2HiddenWeightsLearningRate { get; set; }
37+
protected Matrix<double> Feature2HiddenWeightsLearningRate { get; set; }
38+
39+
protected Vector<double> vecMaxGrad;
40+
protected Vector<double> vecMinGrad;
41+
protected Vector<double> vecNormalLearningRate;
3742

3843
public SimpleRNN()
3944
{
@@ -257,28 +262,55 @@ private void learnBptt(State state)
257262
//dense weight update fea->0
258263
double[] vector_a = null;
259264
double er = neuHidden.er[a];
265+
Vector<double> vecErr = new Vector<double>(er);
266+
267+
int i = 0;
260268
if (DenseFeatureSize > 0)
261269
{
262270
vector_a = mat_bptt_synf[a];
263-
for (int i = 0; i < DenseFeatureSize; i++)
271+
i = 0;
272+
while (i < DenseFeatureSize - Vector<double>.Count)
273+
{
274+
Vector<double> v1 = new Vector<double>(bptt_fea_step, i);
275+
Vector<double> v2 = new Vector<double>(vector_a, i);
276+
v2 += vecErr * v1;
277+
v2.CopyTo(vector_a, i);
278+
279+
i += Vector<double>.Count;
280+
}
281+
282+
while (i < DenseFeatureSize)
264283
{
265284
vector_a[i] += er * bptt_fea_step[i];
285+
i++;
266286
}
267287
}
268288

269289
//sparse weight update hidden->input
270290
vector_a = mat_bptt_syn0_w[a];
271-
for (int i = 0; i < sparse.Count; i++)
291+
for (i = 0; i < sparse.Count; i++)
272292
{
273293
var entry = sparse.GetEntry(i);
274294
vector_a[entry.Key] += er * entry.Value;
275295
}
276296

277297
//bptt weight update
278298
vector_a = mat_bptt_syn0_ph[a];
279-
for (int i = 0; i < L1; i++)
299+
i = 0;
300+
while (i < L1 - Vector<double>.Count)
301+
{
302+
Vector<double> v1 = new Vector<double>(neuLastHidden.cellOutput, i);
303+
Vector<double> v2 = new Vector<double>(vector_a, i);
304+
v2 += vecErr * v1;
305+
v2.CopyTo(vector_a, i);
306+
307+
i += Vector<double>.Count;
308+
}
309+
310+
while(i < L1)
280311
{
281312
vector_a[i] += er * neuLastHidden.cellOutput[i];
313+
i++;
282314
}
283315

284316
});
@@ -308,33 +340,85 @@ private void learnBptt(State state)
308340
{
309341
double[] vector_b = null;
310342
double[] vector_bf = null;
343+
double[] vector_lr = null;
311344

312345
//Update bptt feature weights
313346
vector_b = HiddenBpttWeights[b];
314347
vector_bf = mat_bptt_syn0_ph[b];
315-
for (int i = 0; i < L1; i++)
348+
vector_lr = HiddenBpttWeightsLearningRate[b];
349+
350+
int i = 0;
351+
while (i < L1 - Vector<double>.Count)
352+
{
353+
Vector<double> vecDelta = new Vector<double>(vector_bf, i);
354+
Vector<double> vecLearningRate = new Vector<double>(vector_lr, i);
355+
Vector<double> vecB = new Vector<double>(vector_b, i);
356+
vecDelta = Vector.Min<double>(vecDelta, vecMaxGrad);
357+
vecDelta = Vector.Max<double>(vecDelta, vecMinGrad);
358+
359+
vecLearningRate += (vecDelta * vecDelta);
360+
vecLearningRate.CopyTo(vector_lr, i);
361+
vecLearningRate = vecNormalLearningRate / (Vector<double>.One + Vector.SquareRoot<double>(vecLearningRate));
362+
363+
vecB += (vecLearningRate * vecDelta);
364+
vecB.CopyTo(vector_b, i);
365+
366+
Vector<double>.Zero.CopyTo(vector_bf, i);
367+
368+
i += Vector<double>.Count;
369+
}
370+
371+
while (i < L1)
316372
{
317373
double delta = NormalizeGradient(vector_bf[i]);
318374
double newLearningRate = UpdateLearningRate(HiddenBpttWeightsLearningRate, b, i, delta);
319375

320376
vector_b[i] += newLearningRate * delta;
321377
//Clean bptt weight error
322378
vector_bf[i] = 0;
379+
380+
i++;
323381
}
324382

325383
//Update dense feature weights
326384
if (DenseFeatureSize > 0)
327385
{
328386
vector_b = Feature2HiddenWeights[b];
329387
vector_bf = mat_bptt_synf[b];
330-
for (int i = 0; i < DenseFeatureSize; i++)
388+
vector_lr = Feature2HiddenWeightsLearningRate[b];
389+
390+
i = 0;
391+
while (i < DenseFeatureSize - Vector<double>.Count)
392+
{
393+
Vector<double> vecDelta = new Vector<double>(vector_bf, i);
394+
Vector<double> vecLearningRate = new Vector<double>(vector_lr, i);
395+
Vector<double> vecB = new Vector<double>(vector_b, i);
396+
vecDelta = Vector.Min<double>(vecDelta, vecMaxGrad);
397+
vecDelta = Vector.Max<double>(vecDelta, vecMinGrad);
398+
399+
vecLearningRate += (vecDelta * vecDelta);
400+
vecLearningRate.CopyTo(vector_lr, i);
401+
vecLearningRate = vecNormalLearningRate / (Vector<double>.One + Vector.SquareRoot<double>(vecLearningRate));
402+
403+
vecB += (vecLearningRate * vecDelta);
404+
vecB.CopyTo(vector_b, i);
405+
406+
vecDelta = Vector<double>.Zero;
407+
vecDelta.CopyTo(vector_bf, i);
408+
409+
i += Vector<double>.Count;
410+
}
411+
412+
while (i < DenseFeatureSize)
331413
{
332414
double delta = NormalizeGradient(vector_bf[i]);
333415
double newLearningRate = UpdateLearningRate(Feature2HiddenWeightsLearningRate, b, i, delta);
334416

335417
vector_b[i] += newLearningRate * delta;
336418
//Clean dense feature weights error
337419
vector_bf[i] = 0;
420+
421+
i++;
338422
}
339423
}
340424

@@ -347,7 +431,7 @@ private void learnBptt(State state)
347431
if (sparse == null)
348432
break;
349433

350-
for (int i = 0; i < sparse.Count; i++)
434+
for (i = 0; i < sparse.Count; i++)
351435
{
352436
int pos = sparse.GetEntry(i).Key;
353437

@@ -387,10 +471,14 @@ public void resetBpttMem()
387471

388472
public override void CleanStatus()
389473
{
390-
Hidden2OutputWeightLearningRate = new Matrix<float>(L2, L1);
391-
Input2HiddenWeightsLearningRate = new Matrix<float>(L1, L0);
392-
Feature2HiddenWeightsLearningRate = new Matrix<float>(L1, DenseFeatureSize);
393-
HiddenBpttWeightsLearningRate = new Matrix<float>(L1, L1);
474+
Hidden2OutputWeightLearningRate = new Matrix<double>(L2, L1);
475+
Input2HiddenWeightsLearningRate = new Matrix<double>(L1, L0);
476+
Feature2HiddenWeightsLearningRate = new Matrix<double>(L1, DenseFeatureSize);
477+
HiddenBpttWeightsLearningRate = new Matrix<double>(L1, L1);
478+
479+
vecMaxGrad = new Vector<double>(GradientCutoff);
480+
vecMinGrad = new Vector<double>(-GradientCutoff);
481+
vecNormalLearningRate = new Vector<double>(LearningRate);
394482
}
395483
public override void InitMem()
396484
{

RNNSharp/packages.config

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<packages>
3+
<package id="System.Globalization" version="4.0.10" targetFramework="net46" />
4+
<package id="System.Numerics.Vectors" version="4.1.0" targetFramework="net46" />
5+
<package id="System.Resources.ResourceManager" version="4.0.0" targetFramework="net46" />
6+
<package id="System.Runtime" version="4.0.20" targetFramework="net46" />
7+
<package id="System.Runtime.Extensions" version="4.0.10" targetFramework="net46" />
8+
</packages>

0 commit comments

Comments (0)