对于相同的计算,c++和fortran的结果有相当大的差异
Considerable difference in results from c++ and fortran for same calculation
我正在将FORTRAN程序翻译成CUDA
当我翻译一个子程序时,我发现结果变化很大,从分数的第三位开始!我读到它们确实不同(我读到差异相对较小)。我不知道节目中是否有什么问题。所以我也用C++编译了代码,但得到了与CUDA相同的结果。我发布这两个代码(C++和FORTRAN)。请检查。
FORTRAN代码(原始)
PROGRAM MAIN
IMPLICIT REAL(A-H,O-Z)
parameter (nda=3,nda3=nda*3,ND05=3)
c INCLUDE 'SIZES'
C
C CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C
COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
COMMON/COORS/R(NDA*(NDA+1)/2)
COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
*NREP(ND05),MREP(ND05),LREP(ND05)
DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
common/ind/natoms,i3n
831 FORMAT(' NJ=',I4,' NK=',I4,' ALJ=',1PE12.5,' BLJ=',E12.5,
*' CLJ=',E12.5,' NREP=',I4,' MREP=',I4,' LREP=',I4)
815 FORMAT(/)
read*,natoms
read*,nlj
i3n=natoms*3
c terms for calcluations, N5J & N5K are indices (N5K > N5J)
c ALJ, BLJ & CLJ are powers for generalized LENNARD-JONES potential.
DO 10 I=1,NLJ
READ(5,*)N5J(I),N5K(I),NREP(I),MREP(I),LREP(I),
* ALJ(I),BLJ(I),CLJ(I)
WRITE(6,831)N5J(I),N5K(I),ALJ(I),BLJ(I),CLJ(I),
* NREP(I),MREP(I),LREP(I)
WRITE(6,815)
10 continue
C
read(5,*)(q(i),i=1,i3n)
do 30 i=1,i3n
print*,'q (',i,') = ',q(i)
30 continue
DO 40 MN=1,i3n
PDOT(MN)=0.0
40 continue
c setting zero values for pdot, which is force
IF (NLJ.NE.0) THEN
CALL LENJ(1,NLJ)
ENDIF
end
SUBROUTINE LENJ(INL,LNL)
IMPLICIT REAL (A-H,O-Z)
parameter (nda=3,nda3=nda*3,ND05=3)
C CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C
COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
COMMON/COORS/R(NDA*(NDA+1)/2)
COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
*NREP(ND05),MREP(ND05),LREP(ND05)
DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
LOGICAL FIRST
DATA FIRST/.TRUE./
save FIRST,JKA,RNA,RMB,RLC
common/ind/natoms,i3n
IF (FIRST) THEN
DO NL=INL,LNL
JKA(NL)=ISHFT((N5J(NL)-1)*(2*NATOMS-N5J(NL)),-1)+N5K(NL)
* -N5J(NL)
c to avoid recounting
RNA(NL)=-NREP(NL)*ALJ(NL)
RMB(NL)=-MREP(NL)*BLJ(NL)
RLC(NL)=-LREP(NL)*CLJ(NL)
ENDDO
C
FIRST=.FALSE.
ENDIF
C
C CODE FOR GENERAL LENNARD-JONES
C
DO NL=INL,LNL
J3=3*N5J(NL)
J2=J3-1
J1=J2-1
K3=3*N5K(NL)
K2=K3-1
K1=K2-1
c since each atom is represented by 3 coordinates in space, indices above are used to point for a step of three.
JK=JKA(NL)
T1=Q(K1)-Q(J1)
T2=Q(K2)-Q(J2)
T3=Q(K3)-Q(J3)
R(JK)=SQRT(T1*T1+T2*T2+T3*T3)
c calculating the distance
RRJK=1.0/R(JK)
DUM1=RNA(NL)*RRJK**(2+NREP(NL))
DUM1=DUM1+RMB(NL)*RRJK**(MREP(NL)+2)
DUM1=DUM1+RLC(NL)*RRJK**(LREP(NL)+2)
c adding each terms
TDUM1=DUM1*T1
TDUM2=DUM1*T2
TDUM3=DUM1*T3
PDOT(K1)=PDOT(K1)+TDUM1
PDOT(K2)=PDOT(K2)+TDUM2
PDOT(K3)=PDOT(K3)+TDUM3
PDOT(J1)=PDOT(J1)-TDUM1
PDOT(J2)=PDOT(J2)-TDUM2
PDOT(J3)=PDOT(J3)-TDUM3
c final forces.
ENDDO
print*,'...............'
do i=1,i3n
print*,'i = ',i ,', dvdq = ',pdot(i)
enddo
C N5J(I), N5K(I), NREP(I), MREP(I), LREP(I), ALJ(I), BLJ(I) and CLJ(I) for I = 1, NLJ.
C This is information for the generalized Lennard-Jones potentials.
C N5J and N5K are the atoms with N5K greater than N5J.
C NREP, MREP, and LREP are the powers in the potential energy expression.
C If a power equals zero the appropriate term is skipped in the calculations.
return
END
C++代码
#include<iostream>
#include<fstream>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
using namespace std;
void lenjones(int NATOMS, int* N5J, int* N5K, int* NREP, int* MREP, int* LREP, float* ALJ, float* BLJ, float* CLJ, int NLJ,float* Q, float* PDOT){
size_t NLJF = NLJ*sizeof(float);
size_t NLJI = NLJ*sizeof(float);
float* RMB = (float*)malloc(NLJF);
float* RLC = (float*)malloc(NLJF);
float* RNA = (float*)malloc(NLJF);
int *JKA=(int*)malloc(NLJI);
for (int NL=0; NL < NLJ; NL++){
JKA[NL]= (((N5J[NL]-1)*((2*NATOMS)-N5J[NL]))>> 1) +N5K[NL]-N5J[NL];
RNA[NL]=-NREP[NL]*ALJ[NL];
RMB[NL]=-MREP[NL]*BLJ[NL];
RLC[NL]=-LREP[NL]*CLJ[NL];
}
int J3,K3;
float T1,T2,T3;
float TDUM1,TDUM2,TDUM3,DUM;
size_t RRS = (NATOMS*(NATOMS+1)/2)*sizeof(float);
float* RR = (float*)malloc(RRS);
int JK;
for (int NL=0; NL < NLJ; NL++){
J3=(3*N5J[NL])-1;
K3=(3*N5K[NL])-1;
T1=Q[K3-2]-Q[J3-2];
T2=Q[K3-1]-Q[J3-1];
T3=Q[K3]-Q[J3];
JK=JKA[NL]-1;
RR[JK]=sqrtf((T1*T1)+(T2*T2)+(T3*T3));
RR[JK]=1/RR[JK];
DUM=(RNA[NL]*powf(RR[JK],2+NREP[NL]));
DUM+=(RMB[NL]*powf(RR[JK],MREP[NL]+2));
DUM+=(RLC[NL]*powf(RR[JK],LREP[NL]+2));
TDUM1=T1*DUM;
TDUM2=T2*DUM;
TDUM3=T3*DUM;
PDOT[K3-2]=PDOT[K3-2]+TDUM1;
PDOT[K3-1]=PDOT[K3-1]+TDUM2;
PDOT[K3]=PDOT[K3]+TDUM3;
PDOT[J3-2]=PDOT[J3-2]-TDUM1;
PDOT[J3-1]=PDOT[J3-1]-TDUM2;
PDOT[J3]=PDOT[J3]-TDUM3;
}
}
//=========================================
int main(){
int NATOMS,NDA3,ND05;
scanf("%d %d", &NATOMS, &ND05);
printf("n");
NDA3=3*NATOMS;
size_t QPDOT = NDA3*sizeof(float);
float* h_Q = (float*)malloc(QPDOT);
float* h_PDOT = (float*)malloc(QPDOT);
size_t NLJ = ND05*sizeof(float);
float* h_ALJ = (float*)malloc(NLJ);
float* h_BLJ = (float*)malloc(NLJ);
float* h_CLJ = (float*)malloc(NLJ);
size_t NLJ_i = ND05*sizeof(int);
int* h_LREP = (int*)malloc(NLJ_i);
int* h_MREP = (int*)malloc(NLJ_i);
int* h_NREP = (int*)malloc(NLJ_i);
int* h_N5J = (int*)malloc(NLJ_i);
int* h_N5K = (int*)malloc(NLJ_i);
int* h_JKA = (int*)malloc(NLJ_i);
for (int i=0; i< ND05; i++){
cin >> h_N5J[i] >> h_N5K[i] >> h_NREP[i] >> h_MREP[i] >> h_LREP[i] >> h_ALJ[i] >> h_BLJ[i] >> h_CLJ[i];
}
for (int i=0; i<NDA3; i++){
scanf("%f", &h_Q[i]);
}
for (int i=0; i<NDA3; i++){
h_PDOT[i]=0;
}
lenjones(NATOMS, &h_N5J[0], &h_N5K[0], &h_NREP[0], &h_MREP[0], &h_LREP[0], &h_ALJ[0], &h_BLJ[0], &h_CLJ[0], ND05, &h_Q[0], &h_PDOT[0]);
cout << "i " << "Q[i] " << "PDOT[i]" << endl;
for (int i=0; i<NDA3; i++){
printf("%d %e %le n" , i, h_Q[i], h_PDOT[i]);
}
}
如果你想要更好的数学精度,你应该在C++中使用double
而不是float
。不知道Fortran给你的精度是多少(我猜很好,因为它被广泛用于计算),但float
对于C++数学来说是不精确的。
检查您的C++/CUDA编译器选项以获得最佳数学精度,例如Visual C++
对于相同的输入,输出中间结果并跟踪它们在哪个点开始发散。
我在您的FORTRAN和C++代码之间注意到了一些很可能会引入数值差异的东西:
- FORTRAN在C语言中的能力(
**
)与powf
++ - FORTRAN的
sqrt
与C语言中的sqrt
++ - 在FORTRAN和C++之间选择SSE编译器(您应该禁用这两个编译器的优化,然后检查数值差)
- 浮点模式的编译器选择(快速与精确)
- FORTRAN选项,用于模拟一些较旧的浮点模式(在非正规情况下,VAX浮点会被截断为0)
相关文章:
- 是否可以将无符号 int 的最大值转换为 int 并将结果转换为 -1?
- 将较大的整数写为二进制并读回它们会产生不好的结果
- 带有大结构变量的 CUDA 内核函数给出了错误的结果
- 在向量数组中获得前五个最大的结果
- 调整OpenCV Mat向量向量的大小时出乎意料的结果
- 二项式系数程序输出错误的结果,如果输入是大的(C++)
- 为什么这个计算二进制搜索树中节点的递归函数总是返回比预期更大的结果
- 编译一个相当简单的c++11程序时,gcc和clang之间的结果不同
- 将整数转换为相当大的向量或字符串
- 使用浮点值和铸造的算术运算的错误结果 - 差异很大,我想这不是准确值的情况(429497)?
- 当试图为项目euler 11找到数组中最大的产品时,得到了尴尬的结果
- 当“虚拟”是一个相当大的开销时,有什么经验法则吗
- 返回类型大到足以容纳结果的求和函数
- 使用比特移位缩短2个字符会导致具有大值的奇怪结果
- 在c++中计算64位整数的位数时,大输入(64位)会产生意想不到的结果
- 寻找最大使用段树给出错误的结果
- 用于计算 C++ 中最大数量的宏会产生不正确的结果
- 当m和n在10^10范围内相当大时,使用埃拉托色尼筛法打印m和n之间的素数
- 在相当大的整数的大集合上快速实现运算
- 对于相同的计算,c++和fortran的结果有相当大的差异