Considerable difference in results from c++ and fortran for same calculation [closed]_问答_开发者

It's difficult to tell what is being asked here. This question is ambiguous, vague, incomplete, overly broad, or rhetorical and cannot be reasonably answered in its current form. For help clarifying this question so that it can be reopened, visit the help center. Closed 11 years ago.

I am translating a FORTRAN program to CUDA.

When I translated one subroutine I found that results vary considerably, starting from the 3rd digit in the fraction! I've read that they do differ (I've read that the difference would be very small relatively). I don't know if there's anything wrong in the program. So I compiled the code in C++ too, but to get the same result as in CUDA. I am posting both the codes (C++ and FORTRAN). pls check.

FORTRAN CODE (ORIGINAL)

      PROGRAM MAIN
      IMPLICIT REAL(A-H,O-Z)
      parameter (nda=3,nda3=nda*3,ND05=3)
c      INCLUDE 'SIZES'
C
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C
      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      common/ind/natoms,i3n

  831 FORMAT('   NJ=',I4,'  NK=',I4,'  ALJ=',1PE12.5,'  BLJ=',E12.5,
     *'  CLJ=',E12.5,'  NREP=',I4,'  MREP=',I4,'  LREP=',I4)
  815 FORMAT(/)

      read*,natoms
      read*,nlj
      i3n=natoms*3
c terms for calcluations, N5J & N5K are indices (N5K > N5J)
c ALJ, BLJ & CLJ are powers for generalized LENNARD-JONES potential.
         DO 10 I=1,NLJ
            READ(5,*)N5J(I),N5K(I),NREP(I),MREP(I),LREP(I),
     *               ALJ(I),BLJ(I),CLJ(I)
            WRITE(6,831)N5J(I),N5K(I),ALJ(I),BLJ(I),CLJ(I),
     *                  NREP(I),MREP(I),LREP(I)
         WRITE(6,815)
10    continue
C
          read(5,*)(q(i),i=1,i3n)

      do 30 i=1,i3n
        print*,'q (',i,') = ',q(i)
30    continue

      DO 40 MN=1,i3n
         PDOT(MN)=0.0
40    continue
c setting zero values for pdot, which is force
      IF (NLJ.NE.0) THEN
         CALL LENJ(1,NLJ)
      ENDIF


      end


      SUBROUTINE LENJ(INL,LNL)
      IMPLICIT REAL (A-H,O-Z)
      pa开发者_StackOverflow社区rameter (nda=3,nda3=nda*3,ND05=3)
C         CALCULATE LENNARD-JONES POTENTIAL ENERGY DERIVATIVES
C

      COMMON/QPDOT/Q(NDA3),PDOT(NDA3)
      COMMON/COORS/R(NDA*(NDA+1)/2)
      COMMON/LENJB/ALJ(ND05),BLJ(ND05),CLJ(ND05),N5J(ND05),N5K(ND05),
     *NREP(ND05),MREP(ND05),LREP(ND05)
      DIMENSION JKA(ND05),RNA(ND05),RMB(ND05),RLC(ND05)
      LOGICAL FIRST
      DATA FIRST/.TRUE./
      save FIRST,JKA,RNA,RMB,RLC
      common/ind/natoms,i3n


      IF (FIRST) THEN
         DO NL=INL,LNL
            JKA(NL)=ISHFT((N5J(NL)-1)*(2*NATOMS-N5J(NL)),-1)+N5K(NL)
     *             -N5J(NL)
      c to avoid recounting
            RNA(NL)=-NREP(NL)*ALJ(NL)
            RMB(NL)=-MREP(NL)*BLJ(NL)
            RLC(NL)=-LREP(NL)*CLJ(NL)
         ENDDO
C
         FIRST=.FALSE.
      ENDIF
C
C         CODE FOR GENERAL LENNARD-JONES
C
      DO NL=INL,LNL
         J3=3*N5J(NL)
         J2=J3-1
         J1=J2-1
         K3=3*N5K(NL)
         K2=K3-1
         K1=K2-1
      c since each atom is represented by 3 coordinates in space, indices above are used to point for a step of three.
         JK=JKA(NL)
         T1=Q(K1)-Q(J1)
         T2=Q(K2)-Q(J2)
         T3=Q(K3)-Q(J3)
         R(JK)=SQRT(T1*T1+T2*T2+T3*T3)
       c calculating the distance
         RRJK=1.0/R(JK)
         DUM1=RNA(NL)*RRJK**(2+NREP(NL))
         DUM1=DUM1+RMB(NL)*RRJK**(MREP(NL)+2)
         DUM1=DUM1+RLC(NL)*RRJK**(LREP(NL)+2)
       c adding each terms
         TDUM1=DUM1*T1
         TDUM2=DUM1*T2
         TDUM3=DUM1*T3
         PDOT(K1)=PDOT(K1)+TDUM1
         PDOT(K2)=PDOT(K2)+TDUM2
         PDOT(K3)=PDOT(K3)+TDUM3
         PDOT(J1)=PDOT(J1)-TDUM1
         PDOT(J2)=PDOT(J2)-TDUM2
         PDOT(J3)=PDOT(J3)-TDUM3
       c final forces.
      ENDDO
      print*,'...............'
      do i=1,i3n
        print*,'i = ',i ,', dvdq = ',pdot(i)
      enddo
C   N5J(I), N5K(I), NREP(I), MREP(I), LREP(I), ALJ(I), BLJ(I) and CLJ(I) for I = 1, NLJ.
C   This is information for the generalized Lennard-Jones potentials.
C   N5J and N5K are the atoms with N5K greater than N5J.
C   NREP, MREP, and LREP are the powers in the potential energy expression.
C   If a power equals zero the appropriate term is skipped in the calculations.
      return
      END

C++ CODE

#include<iostream>
#include<fstream>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
using namespace std;

void lenjones(int NATOMS, int* N5J, int* N5K, int* NREP, int* MREP, int* LREP, float* ALJ, float* BLJ, float* CLJ, int NLJ,float* Q, float* PDOT){

size_t NLJF = NLJ*sizeof(float);
size_t NLJI = NLJ*sizeof(float);

float* RMB = (float*)malloc(NLJF);
float* RLC = (float*)malloc(NLJF);
float* RNA = (float*)malloc(NLJF);
int *JKA=(int*)malloc(NLJI);
for (int NL=0; NL < NLJ; NL++){
    JKA[NL]= (((N5J[NL]-1)*((2*NATOMS)-N5J[NL]))>> 1) +N5K[NL]-N5J[NL];
    RNA[NL]=-NREP[NL]*ALJ[NL];
    RMB[NL]=-MREP[NL]*BLJ[NL];
    RLC[NL]=-LREP[NL]*CLJ[NL];
}

int J3,K3;
float T1,T2,T3;
float TDUM1,TDUM2,TDUM3,DUM;
size_t RRS = (NATOMS*(NATOMS+1)/2)*sizeof(float);
float* RR = (float*)malloc(RRS);
int JK;
for (int NL=0; NL < NLJ; NL++){
    J3=(3*N5J[NL])-1;
    K3=(3*N5K[NL])-1;
    T1=Q[K3-2]-Q[J3-2];
    T2=Q[K3-1]-Q[J3-1];
    T3=Q[K3]-Q[J3];
    JK=JKA[NL]-1;
    RR[JK]=sqrtf((T1*T1)+(T2*T2)+(T3*T3));
    RR[JK]=1/RR[JK];
    DUM=(RNA[NL]*powf(RR[JK],2+NREP[NL]));
    DUM+=(RMB[NL]*powf(RR[JK],MREP[NL]+2));
    DUM+=(RLC[NL]*powf(RR[JK],LREP[NL]+2));
    TDUM1=T1*DUM;
    TDUM2=T2*DUM;
    TDUM3=T3*DUM;
    PDOT[K3-2]=PDOT[K3-2]+TDUM1;
    PDOT[K3-1]=PDOT[K3-1]+TDUM2;
    PDOT[K3]=PDOT[K3]+TDUM3;
    PDOT[J3-2]=PDOT[J3-2]-TDUM1;
    PDOT[J3-1]=PDOT[J3-1]-TDUM2;
    PDOT[J3]=PDOT[J3]-TDUM3;
}

}


//=========================================
int main(){

int NATOMS,NDA3,ND05;
scanf("%d %d", &NATOMS, &ND05);
printf("\n");
NDA3=3*NATOMS;
size_t QPDOT = NDA3*sizeof(float);
float* h_Q = (float*)malloc(QPDOT);
float* h_PDOT = (float*)malloc(QPDOT);

size_t NLJ = ND05*sizeof(float);

float* h_ALJ = (float*)malloc(NLJ);
float* h_BLJ = (float*)malloc(NLJ);
float* h_CLJ = (float*)malloc(NLJ);

size_t NLJ_i = ND05*sizeof(int);
int* h_LREP = (int*)malloc(NLJ_i);
int* h_MREP = (int*)malloc(NLJ_i);
int* h_NREP = (int*)malloc(NLJ_i);
int* h_N5J = (int*)malloc(NLJ_i);
int* h_N5K = (int*)malloc(NLJ_i);
int* h_JKA = (int*)malloc(NLJ_i);

for (int i=0; i< ND05; i++){
    cin >> h_N5J[i] >> h_N5K[i] >> h_NREP[i] >> h_MREP[i] >> h_LREP[i] >> h_ALJ[i] >> h_BLJ[i] >> h_CLJ[i];
}
for (int i=0; i<NDA3; i++){
    scanf("%f", &h_Q[i]);
}
for (int i=0; i<NDA3; i++){
    h_PDOT[i]=0;
}

lenjones(NATOMS, &h_N5J[0], &h_N5K[0], &h_NREP[0], &h_MREP[0], &h_LREP[0], &h_ALJ[0], &h_BLJ[0], &h_CLJ[0], ND05, &h_Q[0], &h_PDOT[0]);

cout << "i  " << "Q[i]  " << "PDOT[i]" << endl;
for (int i=0; i<NDA3; i++){
    printf("%d  %e  %le \n" , i, h_Q[i], h_PDOT[i]);
}

}

If you want better math precision you should be using double not float in C++. Don't know what precision Fortran gives you (good, I am guessing since it's widely-used for calculation afaik), but float is imprecise for C++ math.

Check your C++/CUDA compiler options for best math precision also, eg. for Visual C++

For the same inputs, output intermediate results and track down at which point do they start diverging.

Some things I noticed between your FORTRAN and C++ code that are very likely to introduce numerical differences:

FORTRAN's power (**) versus powf in your C++
FORTRAN's sqrt versus sqrt in your C++
Compiler selection of SSE between the FORTRAN and C++ (you should disable optimization on both of them and check the numerical differencs then)
Compiler selection of floating point mode (fast vs accurate)
FORTRAN option to emulate some of the older floating point modes (VAX floats come to mind with truncate towards 0 in denormal situations)