Neural network with static std::array is slower than neural network using dynamic C-array

There is a minimalistic (around 200 lines) neural network C library on GitHub called Tinn. Tinn uses dynamic C arrays to represent the weights, biases, and neurons. I tried to partially reimplement it in C++, but using static std::array. I thought the static std::array version would be much faster. However, after taking some measurements, the opposite turned out to be true. Can anyone tell me whether I am doing something wrong, or explain why the static arrays are beaten by the dynamic ones even with -O3 optimization?

Neural network with static arrays - MLP_1.h

#pragma once
#include <cmath>
#include <array>
#include <string>
#include <iostream>
#include <fstream>

template<class Type, size_t nIn, size_t nHid, size_t nOut>
class MLP_1
{
public:
    static constexpr size_t nInputs = nIn;
    static constexpr size_t nHiddens = nHid;
    static constexpr size_t nOutputs = nOut;
    static constexpr size_t nWeights = nHiddens * (nInputs + nOutputs);
    static constexpr size_t nBiases = 2;
    static constexpr size_t weightIndexOffset = nHiddens * nInputs;

    std::array<Type, nWeights> weights;
    std::array<Type, nBiases> biases;
    std::array<Type, nHiddens> hiddenNeurons;
    std::array<Type, nOut> outputNeurons;

    static Type activationFunction(const Type x) noexcept
    {
        //return x / (1 + std::abs(x)); // faster
        return 1.0 / (1.0 + std::exp(-x));
    }

    void forwardPropagation(const Type* const input) noexcept
    {
        // Calculate hidden layer neuron values.
        for(size_t i = 0; i < nHiddens; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nInputs; ++j)
            {
                const size_t weightIndex = (i * nInputs) + j;
                sum += input[j] * weights[weightIndex];
            }
            hiddenNeurons[i] = activationFunction(sum + biases[0]);
        }
        // Calculate output layer neuron values.
        for(size_t i = 0; i < nOutputs; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nHiddens; ++j)
            {
                const size_t weightIndex = weightIndexOffset + (i * nHiddens) + j;
                sum += hiddenNeurons[j] * weights[weightIndex];
            }
            outputNeurons[i] = activationFunction(sum + biases[1]);
        }
    }

    const Type* const predict(const Type* const input) noexcept
    {
        forwardPropagation(input);
        return outputNeurons.data();
    }

    const std::array<Type, nOutputs>& predict(const std::array<Type, nInputs>& inputArray)
    {
        forwardPropagation(inputArray.data());
        return outputNeurons;
    }

    void load(const char* const path) noexcept
    {
        std::ifstream inputFile(path);
        size_t nInputsFile, nHiddensFile, nOutputsFile;
        std::string ignoreString;
        inputFile >> nInputsFile >> nHiddensFile >> nOutputsFile;
        if ((nInputs != nInputsFile) || (nHiddens != nHiddensFile) || (nOutputs != nOutputsFile))
        {
            std::cout << "Size mismatch.\n";
            std::cout << nInputs << ", " << nHiddens << ", " << nOutputs << std::endl;
            std::cout << nInputsFile << ", " << nHiddensFile << ", " << nOutputsFile << std::endl;
        }
        for (auto& bias : biases)
        {
            Type biasFile;
            inputFile >> biasFile;
            bias = biasFile;
        }
        for (auto& weight : weights)
        {
            Type weightFile;
            inputFile >> weightFile;
            weight = weightFile;
        }
    }

    void printWeights() const
    {
        std::cout << "weights: ";
        for (const auto& w : weights) { std::cout << w << " "; }
        std::cout << "\n";
    }

    void printBiases() const
    {
        std::cout << "biases: ";
        for (const auto& b : biases) { std::cout << b << " "; }
        std::cout << "\n";
    }

    void print() const
    {
        printWeights();
        printBiases();
    }
};

Neural network with dynamic arrays - Tinn.h

#pragma once
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef struct
{
    // All the weights.
    float* w;
    // Hidden to output layer weights.
    float* x;
    // Biases.
    float* b;
    // Hidden layer.
    float* h;
    // Output layer.
    float* o;
    // Number of biases - always two - Tinn only supports a single hidden layer.
    int nb;
    // Number of weights.
    int nw;
    // Number of inputs.
    int nips;
    // Number of hidden neurons.
    int nhid;
    // Number of outputs.
    int nops;
}
Tinn;

// Returns floating point random from 0.0 - 1.0.
static float frand()
{
    return rand() / (float) RAND_MAX;
}

// Activation function.
static float act(const float a)
{
    return 1.0f / (1.0f + expf(-a));
}

// Performs forward propagation.
static void fprop(const Tinn t, const float* const in)
{
    // Calculate hidden layer neuron values.
    for(int i = 0; i < t.nhid; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nips; j++)
            sum += in[j] * t.w[i * t.nips + j];
        t.h[i] = act(sum + t.b[0]);
    }
    // Calculate output layer neuron values.
    for(int i = 0; i < t.nops; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nhid; j++)
            sum += t.h[j] * t.x[i * t.nhid + j];
        t.o[i] = act(sum + t.b[1]);
    }
}

// Randomizes tinn weights and biases.
static void wbrand(const Tinn t)
{
    for(int i = 0; i < t.nw; i++) t.w[i] = frand() - 0.5f;
    for(int i = 0; i < t.nb; i++) t.b[i] = frand() - 0.5f;
}

// Returns an output prediction given an input.
float* xtpredict(const Tinn t, const float* const in)
{
    fprop(t, in);
    return t.o;
}

// Constructs a tinn with number of inputs, number of hidden neurons, and number of outputs.
Tinn xtbuild(const int nips, const int nhid, const int nops)
{
    Tinn t;
    // Tinn only supports one hidden layer so there are two biases.
    t.nb = 2;
    t.nw = nhid * (nips + nops);
    t.w = (float*) calloc(t.nw, sizeof(*t.w));
    t.x = t.w + nhid * nips;
    t.b = (float*) calloc(t.nb, sizeof(*t.b));
    t.h = (float*) calloc(nhid, sizeof(*t.h));
    t.o = (float*) calloc(nops, sizeof(*t.o));
    t.nips = nips;
    t.nhid = nhid;
    t.nops = nops;
    wbrand(t);
    return t;
}

// Saves a tinn to disk.
void xtsave(const Tinn t, const char* const path)
{
    FILE* const file = fopen(path, "w");
    // Save header.
    fprintf(file, "%d %d %d\n", t.nips, t.nhid, t.nops);
    // Save biases and weights.
    for(int i = 0; i < t.nb; i++) fprintf(file, "%f\n", (double) t.b[i]);
    for(int i = 0; i < t.nw; i++) fprintf(file, "%f\n", (double) t.w[i]);
    fclose(file);
}

// Loads a tinn from disk.
Tinn xtload(const char* const path)
{
    FILE* const file = fopen(path, "r");
    int nips = 0;
    int nhid = 0;
    int nops = 0;
    // Load header.
    fscanf(file, "%d %d %d\n", &nips, &nhid, &nops);
    // Build a new tinn.
    const Tinn t = xtbuild(nips, nhid, nops);
    // Load biases and weights.
    for(int i = 0; i < t.nb; i++) fscanf(file, "%f\n", &t.b[i]);
    for(int i = 0; i < t.nw; i++) fscanf(file, "%f\n", &t.w[i]);
    fclose(file);
    return t;
}

// Frees object from heap.
void xtfree(const Tinn t)
{
    free(t.w);
    free(t.b);
    free(t.h);
    free(t.o);
}

// Prints an array of floats. Useful for printing predictions.
void xtprint(const float* arr, const int size)
{
    for(int i = 0; i < size; i++)
        printf("%f ", (double) arr[i]);
    printf("\n");
}

void xtprint(const Tinn& tinn)
{
    printf("weights: ");
    xtprint(tinn.w, tinn.nw);
    printf("biases: ");
    xtprint(tinn.b, tinn.nb);
}

Main with tests - main.cpp

#include <iostream>
#include "MLP_1.h"
#include "Tinn.h"
#include <array>
#include <iterator>
#include <random>
#include <algorithm>
#include <chrono>

constexpr size_t in = 748;
constexpr size_t hid = 20;
constexpr size_t out = 5;

const char* const path = "tinn01.txt";

template< class Iter >
void fill_with_random_values( Iter start, Iter end, int min, int max)
{
    static std::random_device rd;    // you only need to initialize it once
    static std::mt19937 mte(rd());   // this is a relatively big object to create
    std::uniform_real_distribution<float> dist(min, max);
    std::generate(start, end, [&] () { return dist(mte); });
}

void testMLP(MLP_1<float, in, hid, out>& mlp, const std::array<float, in>& array)
{
    std::cout << "------MLP------\n";
    float sum = 0;
    const float* data = array.data();
    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = mlp.predict(data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------MLP------\n";
}

void testTinn(Tinn& tinn, const std::array<float, in>& array)
{
    std::cout << "------TINN------\n";
    float sum = 0;
    const float* data = array.data();
    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = xtpredict(tinn, data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------TINN------\n";
}

int main()
{
    Tinn sTinn = xtbuild(in, hid, out);
    xtsave(sTinn, path);

    Tinn tinn1 = xtload(path);

    MLP_1<float, in, hid, out> mlp;
    mlp.load(path);

    std::array<float, in> inputTest;
    fill_with_random_values(inputTest.begin(), inputTest.end(), -10.0, 10.0);

    testMLP(mlp, inputTest);
    std::cout << "\n";
    testTinn(tinn1, inputTest);

    return 0;
}

With g++ -std=c++14 -O0 main.cpp I get:

------MLP------
sum:33171.4
elapsed time: 6524ms
------MLP------
------TINN------
sum:33171.4
elapsed time: 2256ms
------TINN------

With g++ -std=c++14 -O3 main.cpp I get:

------MLP------
sum:19567.4
elapsed time: 758ms
------MLP------
------TINN------
sum:19567.4
elapsed time: 739ms
------TINN------

With dynamic memory allocation, the slow part is allocating and freeing the memory. There is no memory allocation in the loop you measure, so there is no reason to expect the dynamically allocated version to be slower. And indeed, with -O3 optimization the runtimes are almost identical.
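
To see that cost in isolation, here is a minimal sketch (not part of the original answer) that pays the calloc/free price on every iteration instead of once up front, the way xtbuild does; it reuses the layer sizes and iteration count from the question:

#include <chrono>
#include <cstdlib>
#include <iostream>

int main()
{
    constexpr int nips = 748, nhid = 20, nops = 5;
    float checksum = 0.0f;
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 60000; ++i)
    {
        // Per-iteration allocation: this is the slow part of dynamic memory,
        // and it is exactly what the measured prediction loops avoid.
        float* w = (float*) calloc(nhid * (nips + nops), sizeof(float));
        float* h = (float*) calloc(nhid, sizeof(float));
        float* o = (float*) calloc(nops, sizeof(float));
        checksum += w[0] + h[0] + o[0];  // touch the buffers so they are not optimized away
        free(w);
        free(h);
        free(o);
    }
    const auto end = std::chrono::steady_clock::now();
    std::cout << checksum << " "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << "ms\n";
}

Comparing its time against the prediction loops above gives a rough idea of how much a per-prediction allocation would add, which the benchmark in the question never pays.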

One difference between the programs that could affect the runtime is the use of different random number generators. std::mt19937 is much better than rand(), but it may also be slower.
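
If you want to quantify that difference, a small sketch along these lines could time both generators for the same number of draws (the timeMs helper is only for illustration and not part of either library):

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>

// Returns the elapsed wall-clock time of f() in milliseconds.
template <class F>
long long timeMs(F&& f)
{
    const auto start = std::chrono::steady_clock::now();
    f();
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}

int main()
{
    constexpr int n = 10'000'000;
    volatile float sink = 0.0f;  // keeps the loops from being optimized away

    std::mt19937 mte(12345);
    std::uniform_real_distribution<float> dist(-10.0f, 10.0f);
    const auto mtMs = timeMs([&] { for (int i = 0; i < n; ++i) sink = dist(mte); });

    std::srand(12345);
    const auto randMs = timeMs([&] { for (int i = 0; i < n; ++i) sink = std::rand() / (float) RAND_MAX; });

    std::cout << "mt19937: " << mtMs << "ms, rand(): " << randMs << "ms\n";
}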