[문제]
아래에 주어진 편미분 방정식 풀이 방법으로 순차 프로그램이 주어져 있다. 주어진 풀이 방식은 유한차분 (finite-difference) 방법을 활용하는 것이다. 주어진 2차원 공간을 x, y 방향으로 각각 m, n개의 격자를 고려한다. 즉, x 방향으로 i=1,2,3,...,m (또는 0,1,2,...m-1)의 격자가 정의되고, y 방향으로도 j=1,2,3,...,n (또는 0,1,2,...,n-1)처럼 격자가 정의 된다. 2차원 공간은 [0,1] ╳ [0,1]로 주어진다. 여기서 x 방향 경계들, x=0, x=1은 각각 유한 격자 지수 i=1, i=m (또는 i=0, i=m-1)에 해당한다. 마찬가지로 y 방향으로도 동일한 조건을 활용한다. y=0, y=1은 각각 j=1, j=n (또는 j=0, j=n-1)에 해당한다.
주어진 순차 프로그램에서처럼 특정한 경계 조건을 활용하여 문제풀이를 진행한다. 일반으로 유한차분과 관련된 병렬화 과정에서는 계산 노드들 사이의 데이터 통신이 필요하다. 현재 주어진 문제에서도 유한차분 공식에 따른 미분값들을 계산하기 위해서 데이터 통신을 필요로 하는 병렬화를 가정한다. 데이터 통신에 소모되는 시간을 최소화하는 것이 병렬화의 기본 원칙이다. 왜냐하면, 통신에 소모되는 시간의 비중이 높기 때문이다.
경계조건 적용과 방정식 풀이 방법(이완 방법)을 주어진 순차 프로그램에서와 동일하게 적용하여 병렬화하시오. 순차 프로그램과 동일한 계산 결과를 주는지 확인하시오. 또한, (m,n)=(2000,2000) 조건일 때, CPU 16개를 활용한 계산에서 얻어진 병렬 효율성 (시간은 wall-clock time 기준, 단일-CPU 계산 대비 16-CPU 계산 시간 비율)을 제시하시오.
sequential.c
#include<stdio.h> #include<stdlib.h> #include<stddef.h> #include<math.h> #include<float.h> #define max(a,b) ((a)>(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b)) void axmb(int m, int n, double *uu, double *ww, double *xg, double *yg){ int i,j; double pi, hh, hh2; hh = xg[1] - xg[0]; hh2 = hh*hh; for(i=0;i<m*n;i++) ww[i] = 0.L; for(i=1;i<m-1;i++){ for(j=0;j<n;j++){ ww[j+n*i] += (-2.L*uu[j+n*i]+uu[j+n*(i-1)] + uu[j+n*(i+1)])/hh2; } } hh = yg[1] - yg[0]; hh2 = hh*hh; for(i=1;i<m-1;i++){ for(j=1;j<n-1;j++){ ww[j+n*i] += (-2.L*uu[j+n*i]+uu[j-1+n*i] + uu[j+1+n*i])/hh2; } } pi = M_PI; for(i=0;i<m;i++){ for(j=0;j<n;j++){ ww[j+n*i] -= (-2.l*pi*pi*cos(pi*xg[i])*cos(pi*yg[j])); } } } void vndb(int m, int n, double *uu){ int i,j; for(j=0;j<n;j++) uu[j] = uu[j+1*n]; for(j=0;j<n;j++) uu[j+n*(m-1)] = uu[j+n*(m-2)]; for(i=0;i<m;i++) uu[n*i] = uu[1+n*i]; for(i=0;i<m;i++) uu[n-1+n*i] = uu[n-2+n*i]; } int main(int argc, char **argv){ double *uu, *vv, *ww; double *xg, *yg; int i,j,k,miter,iter; double pi, test0, hh,hh2; int m,n; n = m = 1000; xg = (double*)malloc(sizeof(double)*m); yg = (double*)malloc(sizeof(double)*n); uu = (double*)malloc(sizeof(double)*m*n); vv = (double*)malloc(sizeof(double)*m*n); ww = (double*)malloc(sizeof(double)*m*n); for(i=0;i<m;i++) xg[i] = 0.L + (1.L-0.L)/(double)(m-1)*(double)(i); for(j=0;j<n;j++) yg[j] = 0.L + (1.L-0.L)/(double)(n-1)*(double)(j); pi = M_PI; hh = xg[1] - xg[0]; hh2 = hh * hh; for(i=0;i<m*n;i++) uu[i] = 0.l; vndb(m,n,uu); for(i=0;i<m*n;i++) vv[i] = uu[i]; miter = 1000000000; miter = 10000; for(iter=1;iter<=miter;iter++){ vndb(m,n,uu); axmb(m,n,uu,ww,xg,yg); vndb(m,n,ww); for(i=1;i<m-1;i++){ for(j=1;j<n-1;j++){ uu[j+n*i] = vv[j+n*i] + ww[j+n*i]*(hh2/4.L)*0.9L; } } test0 = -DBL_MAX; double amax = -DBL_MAX; for(i=0;i<m*n;i++){ test0 = max(fabs(vv[i]-uu[i]),test0); amax = max(fabs(uu[i])+1.E-8L,amax); } test0 = test0/amax; for(i=0;i<m*n;i++){ vv[i] = uu[i]; } if((iter%1000) == 1 || iter < 1000) printf("%d %24.15e\n",iter, test0); if(test0<1.E-6L ) break; } printf("%24.15g %24.15g\n",uu[0],uu[m-1]); printf("%24.15g %24.15g\n",uu[n-1],uu[n-1+n*(m-1)]); FILE *wp = fopen("fort.11", "w"); for(i=0;i<m;i++){ for(j=0;j<n;j++){ fprintf(wp,"%g %g %g\n",xg[i],yg[i],uu[j+n*i]); } fprintf(wp,"\n"); } fprintf(wp,"\n"); free(xg); free(yg); free(uu); free(ww); free(vv); }
#include<stdio.h> #include<stdlib.h> #include<stddef.h> #include<math.h> #include<float.h> #include<mpi.h> #define max(a,b) ((a)>(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b)) void axmb(int m, int n, int ns1, int ns2, double *uu, double *ww, double *xg, double *yg, int jsta2, int jend1){ int i,j; double pi, hh, hh2; hh = xg[1] - xg[0]; hh2 = hh*hh; for(i=0;i<m*(ns2-ns1+1);i++) ww[i] = 0.L; for(j=1;j<ns2-ns1;j++){ for(i=1;i<m-1;i++){ ww[i+m*j] += (-2.L*uu[i+m*j]+uu[i-1+m*j] + uu[i+1+m*j])/hh2; } } hh = yg[1] - yg[0]; hh2 = hh*hh; for(j=1;j<ns2-ns1;j++){ for(i=1;i<m-1;i++){ ww[i+m*j] += (-2.L*uu[i+m*j]+uu[i+m*(j-1)] + uu[i+m*(j+1)])/hh2; } } pi = M_PI; for(j=0;j<ns2-ns1+1;j++){ for(i=0;i<m;i++){ ww[i+m*j] -= (-2.l*pi*pi*cos(pi*xg[i])*cos(pi*yg[j+ns1-1])); } } } void vnbd(int m, int n, int ns1, int ns2,double *uu){ int i,j; for(j=0;j<ns2-ns1+1;j++) uu[j*m] = uu[1+j*m]; for(j=0;j<ns2-ns1+1;j++) uu[m-1+m*j] = uu[m-2+m*j]; if(ns1== 1) for(i=0;i<m;i++) uu[i+m*(1-ns1)] = uu[i+m*(2-ns1)]; if(ns2 ==n) for(i=0;i<m;i++) uu[i+m*(n-ns1)] = uu[i+m*(n-ns1-1)]; } void para_range(int n1, int n2, int nid, int myid, int *ista, int *iend){ int iwork1, iwork2; iwork1 = (n2-n1+1)/nid; iwork2 = (n2-n1+1)%nid; *ista = myid*iwork1 + n1 + min(myid, iwork2); *iend = *ista + iwork1 - 1; if(iwork2 > myid) *iend += 1; } void para_range1(int n1, int n2, int nid, int myid, int *ista, int *iend){ int iwork; iwork = (n2-n1)/nid + 1; *ista = min(myid * iwork + n1, n2+1); *iend = min( *ista + iwork-1, n2); } int main(int argc, char **argv){ double *uu, *vv, *ww; double *xg, *yg; int i,j,k,miter,iter; double pi, test0, hh,hh2; int ns1,ns2; double *uurow; int jsta,jend,jsta2,jend1,iprev,inext; MPI_Request isend1,isend2,irecv1,irecv2; int iplot,myid,nid; MPI_Status istatus; double test1,wtime1,wtime2; char lexit; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD,&nid); MPI_Comm_rank(MPI_COMM_WORLD,&myid); wtime1 = MPI_Wtime(); int m,n; n = (m = 2000); xg = (double*)malloc(sizeof(double)*m); yg = (double*)malloc(sizeof(double)*m); for(i=0;i<m;i++) xg[i] = 0.L + (1.L-0.L)/(double)(m-1)*(double)(i); for(j=0;j<n;j++) yg[j] = 0.L + (1.L-0.L)/(double)(n-1)*(double)(j); pi = M_PI; hh = xg[1] - xg[0]; hh2 = hh * hh; para_range(1,n,nid,myid,&jsta,&jend); jsta2 = jsta; jend1 = jend; if(myid == 0) jsta2 = 2; if(myid == nid-1) jend1 = n-1; ns1 = max(1, jsta-1); ns2 = min(n, jend+1); uu = (double*)malloc(sizeof(double)*m*(ns2-ns1+1)); vv = (double*)malloc(sizeof(double)*m*(ns2-ns1+1)); ww = (double*)malloc(sizeof(double)*m*(ns2-ns1+1)); inext = myid + 1; iprev = myid - 1; if(myid == nid-1) inext = MPI_PROC_NULL; if(myid == 0) iprev = MPI_PROC_NULL; for(j=jsta; j<=jend;j++) for(i=0;i<m;i++) uu[i+m*(j-ns1)] = 0; vnbd(m,n,ns1,ns2,uu); for(i=0;i<m*(ns2-ns1+1);i++) vv[i] = uu[i]; miter = 1000000000; miter = 10000; for(iter=1;iter<=miter;iter++){ vnbd(m,n,ns1,ns2,uu); MPI_Isend(uu+m*(jend-ns1), m , MPI_DOUBLE, inext, 1, MPI_COMM_WORLD, &isend1); MPI_Isend(uu+m*(jsta-ns1), m , MPI_DOUBLE, iprev, 1, MPI_COMM_WORLD, &isend2); MPI_Irecv(uu+m*(jsta-1-ns1), m , MPI_DOUBLE, iprev, 1, MPI_COMM_WORLD, &irecv1); MPI_Irecv(uu+m*(jend+1-ns1), m , MPI_DOUBLE, inext, 1, MPI_COMM_WORLD, &irecv2); MPI_Wait(&isend1,&istatus); MPI_Wait(&isend2,&istatus); MPI_Wait(&irecv1,&istatus); MPI_Wait(&irecv2,&istatus); axmb(m,n,ns1,ns2,uu,ww,xg,yg, jsta2, jend1); vnbd(m,n,ns1,ns2,ww); for(j=jsta2;j<=jend1;j++){ for(i=0;i<m;i++){ uu[i+m*(j-ns1)] = vv[i+m*(j-ns1)] + ww[i+m*(j-ns1)]*(hh2/4.L)*0.9L; } } test0 = -1.e30L; double amax = -1.e20L,amax1; for(i=0;i<m*(ns2-ns1+1);i++){ test0 = max(fabs(vv[i]-uu[i]),test0); amax = max(fabs(uu[i]),amax); } amax = amax + 1.e-8L; MPI_Reduce(&test0, &test1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&amax, &amax1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); if(myid==0){ amax = amax1; test0 = test1; } MPI_Bcast(&amax,1,MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&test0,1,MPI_DOUBLE, 0, MPI_COMM_WORLD); test0 = test0/amax; for(i=0;i<m*(ns2-ns1+1);i++){ vv[i] = uu[i]; } if(myid ==0){ if((iter%1000) == 1 || iter < 1000) printf("%d %24.15e\n",iter, test0); lexit = 'N'; if(test0<1.e-6L) lexit = 'Y'; } MPI_Bcast(&lexit, 1, MPI_CHAR, 0, MPI_COMM_WORLD); if(lexit == 'Y') break; } if(myid==0) printf("%24.15g %24.15g\n",uu[0 + m *(1-ns1)],uu[m-1 + m *(1-ns1)]); if(myid==nid-1) printf("%24.15g %24.15g\n",uu[0 + m * (n-ns1)],uu[m-1 + m *(n-ns1)]); /* FILE *wp = fopen("sol","w"); long ioffset = (jsta-1) * sizeof(double)*(m); fseek(wp, ioffset, SEEK_SET); for(j=jsta;j<=jend;j++){ fwrite(uu+(j-ns1)*m, sizeof(double), m, wp); } fclose(wp); */ MPI_Barrier(MPI_COMM_WORLD); wtime2 = MPI_Wtime(); if(myid ==0) printf("[C] np=%d \t %g (sec)\n",nid, wtime2-wtime1); iplot = 0; iplot = 1; /* if(iplot ==1 && myid == 0) { FILE *fp = fopen("sol", "r"); wp = fopen("fort.11","w"); uurow = (double*)malloc(sizeof(double)*m); for(j=0;j<n;j++){ fread(uurow,sizeof(double), m,fp); for(i=0;i<m;i++){ fprintf(wp,"%24.15g %24.15g %24.15g\n",xg[i],yg[j],uurow[i]); } fprintf(wp,"\n"); } fclose(fp); fclose(wp); free(uurow); } */ free(xg); free(yg); free(uu); free(ww); free(vv); MPI_Finalize(); }
#include<stdio.h> #include<stdlib.h> #include<stddef.h> #include<math.h> #include<float.h> #include<mpi.h> #define max(a,b) ((a)>(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b)) int tm, tn; void axmb(int m, int n, double *uu, double *ww, double *xg, double *yg, int nprocs, int myrank, int xspos){ int i,j; double pi, hh, hh2; hh = xg[1] - xg[0]; hh2 = hh*hh; for(i=0;i<m*n;i++) ww[i+n] = 0.L; int ispos = 0, iepos = m; if(myrank == 0) ispos++; if(myrank == nprocs-1) iepos--; for(i=ispos;i<iepos;i++){ for(j=0;j<n;j++){ ww[j+n*i+n] += (-2.L*uu[j+n*i+n]+uu[j+n*(i-1)+n] + uu[j+n*(i+1)+n])/hh2; } } hh = yg[1] - yg[0]; hh2 = hh*hh; for(i=ispos;i<iepos;i++){ for(j=1;j<n-1;j++){ ww[j+n*i+n] += (-2.L*uu[j+n*i+n]+uu[j-1+n*i+n] + uu[j+1+n*i+n])/hh2; } } pi = M_PI; for(i=0;i<m;i++){ for(j=0;j<n;j++){ ww[j+n*i+n] -= (-2.l*pi*pi*cos(pi*xg[i+xspos])*cos(pi*yg[j])); } } } void vndb(int m, int n, double *uu, int nprocs, int myrank){ int i,j; if(myrank == 0) for(j=0;j<n;j++) uu[j+n] = uu[j+1*n+n]; if(myrank == nprocs-1) for(j=0;j<n;j++) uu[j+n*(m-1)+n] = uu[j+n*(m-2)+n]; for(i=0;i<m;i++) uu[n*i+n] = uu[1+n*i+n]; for(i=0;i<m;i++) uu[n-1+n*i+n] = uu[n-2+n*i+n]; } void exchange_boundary(int m, int n, double *uu, int nprocs, int myrank){ int i,j; MPI_Request req[4]; MPI_Status stat[4]; if(myrank < nprocs-1){ MPI_Isend(&uu[m*n], n, MPI_DOUBLE, myrank+1, 123, MPI_COMM_WORLD, &req[0]); MPI_Irecv(&uu[(m+1)*n], n, MPI_DOUBLE, myrank+1, 321, MPI_COMM_WORLD, &req[1]); } if(myrank > 0){ MPI_Isend(&uu[n], n, MPI_DOUBLE, myrank-1, 321, MPI_COMM_WORLD, &req[2]); MPI_Irecv(&uu[0], n, MPI_DOUBLE, myrank-1, 123, MPI_COMM_WORLD, &req[3]); } if(myrank < nprocs-1){ MPI_Waitall(2, req, stat); } if(myrank > 0){ MPI_Waitall(2, req+2, stat+2); } } int main(int argc, char **argv){ double *uu, *vv, *ww; double *xg, *yg; int i,j,k,miter,iter; double pi, test0, hh,hh2; tm = tn = 1000; int nprocs, myrank, q, r, asize, xspos; int m,n; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); q = tm/nprocs; r = tm%nprocs; asize = myrank<r?(q+1):q; m = asize; n = tn; xspos = myrank<r?(q+1)*myrank:(q*myrank+r); xg = (double*)malloc(sizeof(double)*tm); yg = (double*)malloc(sizeof(double)*tn); uu = (double*)malloc(sizeof(double)*(m+2)*n); vv = (double*)malloc(sizeof(double)*(m+2)*n); ww = (double*)malloc(sizeof(double)*(m+2)*n); for(i=0;i<tm;i++) xg[i] = 0.L + (1.L-0.L)/(double)(tm-1)*(double)(i); for(j=0;j<tn;j++) yg[j] = 0.L + (1.L-0.L)/(double)(tn-1)*(double)(j); pi = M_PI; hh = xg[1] - xg[0]; hh2 = hh * hh; for(i=0;i<m*n;i++) uu[i+n] = 0.l; vndb(m,n,uu,nprocs,myrank); for(i=0;i<m*n;i++) vv[i+n] = uu[i+n]; miter = 1000000000; miter = 1000; int ispos = 0, iepos = m; if(myrank == 0) ispos++; if(myrank == nprocs-1) iepos--; for(iter=1;iter<=miter;iter++){ vndb(m,n,uu,nprocs,myrank); exchange_boundary(m,n,uu,nprocs,myrank); axmb(m,n,uu,ww,xg,yg,nprocs,myrank,xspos); vndb(m,n,ww,nprocs,myrank); for(i=ispos;i<iepos;i++){ for(j=1;j<n-1;j++){ uu[j+n*i+n] = vv[j+n*i+n] + ww[j+n*i+n]*(hh2/4.L)*0.9L; } } test0 = -DBL_MAX; double amax = -DBL_MAX, test1, amax1; for(i=0;i<m*n;i++){ test0 = max(fabs(vv[i+n]-uu[i+n]),test0); amax = max(fabs(uu[i+n])+1.E-8L,amax); } MPI_Reduce(&test0, &test1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&amax, &amax1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); test0 = test1/amax1; MPI_Bcast(&test0, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); for(i=0;i<m*n;i++){ vv[i+n] = uu[i+n]; } if(myrank == 0 && ((iter%1000) == 1 || iter < 1000)) printf("%d %24.15e\n",iter, test0); if(test0<1.E-6L ) break; } //printf("%24.15g %24.15g\n",uu[0],uu[m-1]); //printf("%24.15g %24.15g\n",uu[n-1],uu[n-1+n*(m-1)]); int *ircnt, *idisp; double *uutot; if(myrank == 0) { ircnt = (int*)malloc(sizeof(int)*nprocs); idisp = (int*)malloc(sizeof(int)*nprocs); uutot = (double*)malloc(sizeof(double)*tm*tn); idisp[0] = 0; for(i = 0; i < nprocs; i++){ ircnt[i] = (i<r?(q+1):q)*n; if(i>0) idisp[i] = idisp[i-1] + ircnt[i-1]; } } MPI_Gatherv(uu+n, m*n, MPI_DOUBLE, uutot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD); if(myrank == 0){ FILE *wp = fopen("fort.22", "w"); for(i=0;i<tm;i++){ for(j=0;j<tn;j++){ fprintf(wp,"%g %g %g\n",xg[i],yg[j],uutot[j+tn*i]); } fprintf(wp,"\n"); } fprintf(wp,"\n"); free(ircnt); free(idisp); free(uutot); } free(xg); free(yg); free(uu); free(ww); free(vv); MPI_Finalize(); }
'Parallel Programming' 카테고리의 다른 글
KSC 2014 3번 문제 및 답안 (0) | 2016.10.01 |
---|---|
KSC 2014 2번 문제 및 답안 (0) | 2016.10.01 |
KSC 2013 3번 문제 및 답안 (0) | 2016.10.01 |
KSC 2013 2번 문제 및 답안 (0) | 2016.10.01 |
KSC 2013 1번 문제 및 답안 (0) | 2016.10.01 |
댓글