Optimize Code

关于代码优化的一些启示

今天看到一篇博文Optimize Your Code: Matrix Multiplication,里面提出了几个对于矩阵乘法的优化方法,于是试了一下。

优化方法

1. 原始写法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
template<typename T>
void matrixMul1(int size, T** m1, T** m2, T** result) {
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            result[i][j] = 0;
            for (int k=0; k<size; k++) {
                result[i][j] += m1[i][k] *m2[k][j];
            }
        }
    }
}

2. 采用临时变量

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
template<typename T>
void matrixMul2(int size, T** m1, T** m2, T** result) {
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            T c = 0;
            for (int k=0; k<size; k++) {
                c += m1[i][k] *m2[k][j];
            }
            result[i][j] = c;
        }
    }
}

很简单,但确实很有效。采用临时变量节省了在result中寻址并写入值的时间

3. 连续寻址

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
template<typename T>
void transpose(int size, T** m) {
    for (int i = 0; i < size; i++) {
        for (int j = i+1; j < size; j++) {
            swap(m[i][j],m[j][i]);
        }
    }
}

template<typename T>
void matrixMul3(int size, T** m1, T** m2, T** result) {
    transpose(size, m2);
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            T c = 0;
            for (int k=0; k<size; k++) {
                c += m1[i][k] *m2[j][k];
            }
            result[i][j] = c;
        }
    }
    transpose(size, m2);
}

转置矩阵,使m1[i]m2[i]都可以按顺序访问,这会极大提高性能。

开启优化对于性能的影响

采用下面的测试代码,比较上面的三种方案计算速度,并和Eigen库做对比。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <Eigen/Dense>
using namespace Eigen;
using namespace std;

clock_t start_t,end_t;

template<typename T>
T** randomMatrix(int size) {
    T** a;
    a = new T* [size];
    for (int i=0; i<size; i++) {
        a[i] = new T [size];
    }
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            a[i][j] = (T)(1.0 - rand()/(RAND_MAX/2.0));
        }
    }
    return a;
}

template<typename T>
void matrixMul1(int size, T** m1, T** m2, T** result) {
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            result[i][j] = 0;
            for (int k=0; k<size; k++) {
                result[i][j] += m1[i][k] *m2[k][j];
            }
        }
    }
}

template<typename T>
void matrixMul2(int size, T** m1, T** m2, T** result) {
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            T c = 0;
            for (int k=0; k<size; k++) {
                c += m1[i][k] *m2[k][j];
            }
            result[i][j] = c;
        }
    }
}


template<typename T>
void transpose(int size, T** m) {
    for (int i = 0; i < size; i++) {
        for (int j = i+1; j < size; j++) {
            swap(m[i][j],m[j][i]);
        }
    }
}

template<typename T>
void matrixMul3(int size, T** m1, T** m2, T** result) {
    transpose(size, m2);
    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            T c = 0;
            for (int k=0; k<size; k++) {
                c += m1[i][k] *m2[j][k];
            }
            result[i][j] = c;
        }
    }
    transpose(size, m2);
}


int main(int argc, char *argv[]) {
    int n = atoi(argv[1]);

    double** m1 = randomMatrix<double>(n);
    double** m2 = randomMatrix<double>(n);
    double** m3 = randomMatrix<double>(n);

    double endTime = 0;
    for (int i=0; i<10; i++) {
        start_t = clock();
        matrixMul1(n,m1,m2,m3);
        end_t = clock();
        endTime += (double)(end_t-start_t)/CLOCKS_PER_SEC;
    }
    cout << endTime*100 << "ms" << endl;

    endTime = 0;
    for (int i=0; i<10; i++) {
        start_t = clock();
        matrixMul2(n,m1,m2,m3);
        end_t = clock();
        endTime += (double)(end_t-start_t)/CLOCKS_PER_SEC;
    }
    cout << endTime*100 << "ms" << endl;

    endTime = 0;
    for (int i=0; i<10; i++) {
        start_t = clock();
        matrixMul3(n,m1,m2,m3);
        end_t = clock();
        endTime += (double)(end_t-start_t)/CLOCKS_PER_SEC;
    }
    cout << endTime*100 << "ms" << endl;

    MatrixXd em1 = MatrixXd::Random(n,n);
    MatrixXd em2 = MatrixXd::Random(n,n);
    MatrixXd em3 = MatrixXd::Random(n,n);

    endTime = 0;
    for (int i=0; i<10; i++) {
        start_t = clock();
        em3 = em1*em2;
        end_t = clock();
        endTime += (double)(end_t-start_t)/CLOCKS_PER_SEC;
    }
    cout << endTime*100 << "ms" << endl;

    return 0;
}

对两个nxn的随机方阵做相乘运算,随机数范围-1~1,双精度,取十次计算的平均耗时。得到如下结果

size = 500x500 无编译优化 -O1 -O2 -O3
matrixMul1 729.774 ms 366.574 ms 159.6 ms 159.71 ms
matrixMul2 532.769 ms 199.984 ms 172.28 ms 166.058 ms
matrixMul3 375.81 ms 151.089 ms 149.625 ms 149.739 ms
Eigen 1128 ms 31.415 ms 30.5796 ms 30.8327 ms

在没有编译优化的情况下,Eigen的速度甚至比最朴素的写法还要慢得多,而只要开启O1优化,就可以吊打自己写的代码了,一定是用了很多的黑科技比如指令集优化?看来还是不要轻易自己造轮子,用别人轮子的时候也要小心啊。