C,Fortran,C++,java,numpy速度比較：1000次正方行列の積

環境：Core 2 Duo
C++,Cとfortranは配列の動的確保版がります。CとC++ではスタックサイズが１Mｂらしいので、それを解除してやります。(intel コンパイラでの実行環境はi7 860)

ulimit -s unlimited

結果としては
1.c 11.38s 動的:18.36s >>gcc -O3 -msse4.1で最適化 4.02s >>same option:icc 0.21s
1.c++ 11.37s 動的:18.52s >>g++ -O3 -msse4.1で最適化 4.01s >>same option:icpc 0.22s
1.fortran 9.02s 動的:14.82s >>gfortran -O3 -msse4.1で最適化 4.06s >>same option:ifort 0.21s
2.python,numpy 6.79s
3.java 13.43s

最適化率がCとC++が高い…。これは如何に自分のコードが酷いかを表しているのか!?
javaが思ったより早いですね。ネットでは行列計算はmatlabが一番早いらしいですね。lapackを使いたかったのですが、うまくインストールできませんでした。そのうち出来たら紹介します。

CとC++のメモリ確保には
http://handasse.blogspot.com/2009/02/cc.html
を参考にしました。

以下、それぞれのソース。
mat.f90

program main
  implicit none
  call test()
  call test2()
endprogram main

subroutine test2()
  implicit none
  integer :: M_SIZE=1000,i,j,k
  double precision :: mat1(1000,1000),mat2(1000,1000),mat3(1000,1000)
  double precision :: t1,t2
  call cpu_time(t1)
  do i=1,M_SIZE
     do j=1,M_SIZE
        mat1(j,i)=1.0
        mat3(j,i)=0.0
        if (i==j) then
           mat2(j,i)=1.0
        else
           mat2(j,i)=0.0
        endif
     enddo
  enddo
  
  do i=1,M_SIZE
     do j=1,M_SIZE
        do k=1,M_SIZE
           mat3(j,i)=mat3(j,i)+mat1(j,k)*mat2(k,i)
        enddo
     enddo
  enddo

  
  call cpu_time(t2)

  write(*,*) t2-t1
end subroutine test2

subroutine test()
  implicit none
  integer :: M_SIZE=1000,i,j,k
  double precision,allocatable:: mat1(:,:),mat2(:,:),mat3(:,:)
  double precision :: t1,t2
  call cpu_time(t1)
  allocate(mat1(M_SIZE,M_SIZE),mat2(M_SIZE,M_SIZE),mat3(M_SIZE,M_SIZE))
  do i=1,M_SIZE
     do j=1,M_SIZE
        mat1(j,i)=1.0
        mat3(j,i)=0.0
        if (i==j) then
           mat2(j,i)=1.0
        else
           mat2(j,i)=0.0
        endif
     enddo
  enddo
  
  do i=1,M_SIZE
     do j=1,M_SIZE
        do k=1,M_SIZE
           mat3(j,i)=mat3(j,i)+mat1(j,k)*mat2(k,i)
        enddo
     enddo
  enddo

  
  call cpu_time(t2)

  write(*,*) t2-t1
end subroutine test

Mat.java

public class Mat
{
    public static void main(String[] args)
    {
        double start=System.currentTimeMillis();
        int size;
        size=1000;
        double[][] mat1,mat2,mat3;
        mat1=new double[size][size];
        mat2=new double[size][size];
        mat3=new double[size][size];
        for(int i=0;i<size;i++){
            for(int j=0;j<size;j++){
                mat1[i][j]=1;
                mat3[i][j]=0;
                if(i==j)
                    mat2[i][j]=1;
                else
                    mat2[i][j]=0;
            }
        }
        for(int i=0;i<size;i++){
            for(int j=0;j<size;j++){
                for(int k=0;k<size;k++){
                    mat3[i][j]+=mat1[i][k]*mat2[k][j];
                }
            }
        }
                
        double end=System.currentTimeMillis();
        System.out.println((end-start)/1000);
    }
}

mat.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define M_SIZE 500
#define MM_SIZE 1000
void func1(void);
void func2(void);

int main(void){
    func1();
    func2();
    return 0;

}    
void func1(void){
    double matrix1[M_SIZE][M_SIZE],matrix2[M_SIZE][M_SIZE],matrix3[M_SIZE][M_SIZE];
    int i,j,k;
    clock_t end,start;
    start=clock();
    for(i=0;i<M_SIZE;i++){
        for(j=0;j<M_SIZE;j++){
            matrix1[i][j]=1;
            matrix3[i][j]=0;
            if(i==j)
                matrix2[i][j]=1;
            else
                matrix2[i][j]=0;
        }
    }
    
    for(i=0;i<M_SIZE;i++){
        for(j=0;j<M_SIZE;j++){
            for(k=0;k<M_SIZE;k++){
                matrix3[i][j]+=matrix1[i][k]*matrix2[k][j];
            }
        }
    }
    end=clock();
    printf("%.2fsecond\n",(double)(end-start)/CLOCKS_PER_SEC);
}    
void func2(void){
    double **matrix1,**matrix2,**matrix3;
    int i,j,k;
    clock_t end,start;
    start=clock();
    matrix1=(double **)malloc(sizeof(double)*MM_SIZE);
    matrix2=(double **)malloc(sizeof(double)*MM_SIZE);
    matrix3=(double **)malloc(sizeof(double)*MM_SIZE);
    matrix1[0]=(double *)malloc(sizeof(double)*MM_SIZE*MM_SIZE);
    matrix2[0]=(double *)malloc(sizeof(double)*MM_SIZE*MM_SIZE);
    matrix3[0]=(double *)malloc(sizeof(double)*MM_SIZE*MM_SIZE);
    for(i=1;i<MM_SIZE;i++){
        matrix1[i]=matrix1[0]+i*MM_SIZE;
        matrix2[i]=matrix2[0]+i*MM_SIZE;
        matrix3[i]=matrix3[0]+i*MM_SIZE;
    }

    for(i=0;i<MM_SIZE;i++){
        for(j=0;j<MM_SIZE;j++){
                 matrix1[i][j]=1;
            matrix3[i][j]=0;
            if(i==j)
                matrix2[i][j]=1;
            else
                matrix2[i][j]=0;
        }
    }
    
    for(i=0;i<MM_SIZE;i++){
        for(j=0;j<MM_SIZE;j++){
            for(k=0;k<MM_SIZE;k++){
                matrix3[i][j]+=matrix1[i][k]*matrix2[k][j];
            }
        }
    }       
    end=clock();
    printf("%.2fsecond\n",(double)(end-start)/CLOCKS_PER_SEC);
}

mat.py

import numpy as np
import time
stime=time.time()
mata=[[0 for i in xrange(1000)] for j in xrange(1000)]
matb=[[1 for i in xrange(1000)] for j in xrange(1000)]
matc=[[0 for i in xrange(1000)] for j in xrange(1000)]
for i in xrange(500):
    for j in xrange(500):
        if i==j:
            mata[i][j]=1

mata=np.matrix(mata)
matb=np.matrix(matb)
matc=np.matrix(matc)

matc=mata*matb
etime=time.time()
print etime-stime

mat.cpp

#include <iostream>
#include <math.h>
using namespace std;

#define SIZE 1000
int main()
{
  double mata[SIZE][SIZE],matb[SIZE][SIZE],matc[SIZE][SIZE];
  clock_t start,end;
  start=clock();
  
  for(int i=0;i<SIZE;i++){
    for(int j=0;j<SIZE;j++){
      mata[i][j]=1;
      matc[i][j]=0;
      if(i==j)
        matb[i][j]=1;
      else
        matb[i][j]=0;
    }
  }
  for(int i=0;i<SIZE;i++){
    for(int j=0;j<SIZE;j++){
      for(int k=0;k<SIZE;k++){
        matc[i][j]+=mata[i][k]*matb[k][j];

      }
    }
  }
  end=clock();  
  cout<<(double)(end-start)/CLOCKS_PER_SEC<<endl;
  for(int i=0;i<SIZE;i++){
    for(int j=0;j<SIZE;j++){
      cout<<matc[i][j]<<" ";
    }
    cout<<"\n";
  }
}

void func1(void){
  double **mata=new double*[SIZE];
  double **matb=new double*[SIZE];
  double **matc=new double*[SIZE];
  mata[0]=new double[SIZE * SIZE];
  matb[0]=new double[SIZE * SIZE];
  matc[0]=new double[SIZE * SIZE];
  for (int i = 1; i < SIZE; i++) mata[i] = mata[0] + i * SIZE;
  for (int i = 1; i < SIZE; i++) matb[i] = matb[0] + i * SIZE;
  for (int i = 1; i < SIZE; i++) matc[i] = matc[0] + i * SIZE;
  
  clock_t start,end;
  start=clock();
  
  for(int i=0;i<SIZE;i++){
    for(int j=0;j<SIZE;j++){
      mata[i][j]=1;
      matc[i][j]=0;
      if(i==j)
        matb[i][j]=1;
      else
        matb[i][j]=0;
    }
  }
  for(int i=0;i<SIZE;i++){
    for(int j=0;j<SIZE;j++){
      for(int k=0;k<SIZE;k++){
        matc[i][j]+=mata[i][k]*matb[k][j];

      }
    }
  }

  end=clock();  

  cout<<(double)(end-start)/CLOCKS_PER_SEC<<endl;
}

おいも貴婦人ブログ

生物系博士課程満期退学をしたAIエンジニアのブログ。

C,Fortran,C++,java,numpy速度比較：1000次正方行列の積