본문 바로가기
Parallel Programming

KSC 2014 1번 문제 및 답안

by suminhan 2016. 10. 1.

[문제]


아래에 주어진 편미분 방정식 풀이 방법으로 순차 프로그램이 주어져 있다. 주어진 풀이 방식은 유한차분 (finite-difference) 방법을 활용하는 것이다. 주어진 2차원 공간을 x, y 방향으로 각각 m, n개의 격자를 고려한다. 즉, x 방향으로 i=1,2,3,...,m (또는 0,1,2,...m-1)의 격자가 정의되고, y 방향으로도 j=1,2,3,...,n (또는 0,1,2,...,n-1)처럼 격자가 정의 된다. 2차원 공간은 [0,1] ╳ [0,1]로 주어진다. 여기서 x 방향 경계들, x=0, x=1은 각각 유한 격자 지수 i=1, i=m (또는 i=0, i=m-1)에 해당한다. 마찬가지로 y 방향으로도 동일한 조건을 활용한다. y=0, y=1은 각각 j=1, j=n (또는 j=0, j=n-1)에 해당한다. 


주어진 순차 프로그램에서처럼 특정한 경계 조건을 활용하여 문제풀이를 진행한다. 일반으로 유한차분과 관련된 병렬화 과정에서는 계산 노드들 사이의 데이터 통신이 필요하다. 현재 주어진 문제에서도 유한차분 공식에 따른 미분값들을 계산하기 위해서 데이터 통신을 필요로 하는 병렬화를 가정한다. 데이터 통신에 소모되는 시간을 최소화하는 것이 병렬화의 기본 원칙이다. 왜냐하면, 통신에 소모되는 시간의 비중이 높기 때문이다. 


경계조건 적용과 방정식 풀이 방법(이완 방법)을 주어진 순차 프로그램에서와 동일하게 적용하여 병렬화하시오. 순차 프로그램과 동일한 계산 결과를 주는지 확인하시오. 또한, (m,n)=(2000,2000) 조건일 때, CPU 16개를 활용한 계산에서 얻어진 병렬 효율성 (시간은 wall-clock time 기준, 단일-CPU 계산 대비 16-CPU 계산 시간 비율)을 제시하시오.



sequential.c

#include<stdio.h>
#include<stdlib.h>
#include<stddef.h>
#include<math.h>
#include<float.h>

#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))

void axmb(int m, int n, double *uu, double *ww, double *xg, double *yg){
	int i,j;
	double pi, hh, hh2;
	hh = xg[1] - xg[0]; hh2 = hh*hh;
	for(i=0;i<m*n;i++) ww[i] = 0.L;

	for(i=1;i<m-1;i++){
		for(j=0;j<n;j++){
			ww[j+n*i] += (-2.L*uu[j+n*i]+uu[j+n*(i-1)] + uu[j+n*(i+1)])/hh2;
		}
	}
	hh = yg[1] - yg[0]; hh2 = hh*hh;
	for(i=1;i<m-1;i++){
		for(j=1;j<n-1;j++){
			ww[j+n*i] += (-2.L*uu[j+n*i]+uu[j-1+n*i] + uu[j+1+n*i])/hh2;
		}
	}
	pi = M_PI;
	for(i=0;i<m;i++){
		for(j=0;j<n;j++){
			ww[j+n*i] -= (-2.l*pi*pi*cos(pi*xg[i])*cos(pi*yg[j]));
		}
	}
}
void vndb(int m, int n, double *uu){
	int i,j;
	for(j=0;j<n;j++) uu[j] = uu[j+1*n];
	for(j=0;j<n;j++) uu[j+n*(m-1)] = uu[j+n*(m-2)];
	for(i=0;i<m;i++) uu[n*i] = uu[1+n*i];
	for(i=0;i<m;i++) uu[n-1+n*i] = uu[n-2+n*i];
}


int main(int argc, char **argv){
	double *uu, *vv, *ww;
	double *xg, *yg;
	int i,j,k,miter,iter;
	double pi, test0, hh,hh2;

	int m,n;

	n = m = 1000;
	xg = (double*)malloc(sizeof(double)*m);
	yg = (double*)malloc(sizeof(double)*n);
	uu = (double*)malloc(sizeof(double)*m*n);
	vv = (double*)malloc(sizeof(double)*m*n);
	ww = (double*)malloc(sizeof(double)*m*n);


	for(i=0;i<m;i++) xg[i] = 0.L + (1.L-0.L)/(double)(m-1)*(double)(i);
	for(j=0;j<n;j++) yg[j] = 0.L + (1.L-0.L)/(double)(n-1)*(double)(j);


	pi = M_PI;
	hh = xg[1] - xg[0];
	hh2 = hh * hh;
	for(i=0;i<m*n;i++) uu[i] = 0.l;
	vndb(m,n,uu);
	for(i=0;i<m*n;i++) vv[i] = uu[i];
	miter = 1000000000;
	miter = 10000;

	for(iter=1;iter<=miter;iter++){
		vndb(m,n,uu);
		axmb(m,n,uu,ww,xg,yg);
		vndb(m,n,ww);
		for(i=1;i<m-1;i++){
			for(j=1;j<n-1;j++){
				uu[j+n*i] = vv[j+n*i] + ww[j+n*i]*(hh2/4.L)*0.9L;
			}
		}
		test0 = -DBL_MAX;
		double amax = -DBL_MAX;
		for(i=0;i<m*n;i++){
			test0 = max(fabs(vv[i]-uu[i]),test0);
			amax = max(fabs(uu[i])+1.E-8L,amax);
		}
		test0 = test0/amax;
		for(i=0;i<m*n;i++){
			vv[i] = uu[i];
		}
		if((iter%1000) == 1 || iter < 1000) printf("%d %24.15e\n",iter, test0);
		if(test0<1.E-6L ) break;
	}

	printf("%24.15g %24.15g\n",uu[0],uu[m-1]);
	printf("%24.15g %24.15g\n",uu[n-1],uu[n-1+n*(m-1)]);
	FILE *wp = fopen("fort.11", "w");
	for(i=0;i<m;i++){
		for(j=0;j<n;j++){
			fprintf(wp,"%g %g %g\n",xg[i],yg[i],uu[j+n*i]);
		}
		fprintf(wp,"\n");
	}
	fprintf(wp,"\n");
	free(xg);
	free(yg);
	free(uu);
	free(ww);
	free(vv);
}
answer.c
#include<stdio.h>
#include<stdlib.h>
#include<stddef.h>
#include<math.h>
#include<float.h>
#include<mpi.h>

#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))



void axmb(int m, int n, int ns1, int ns2, double *uu, double *ww, double *xg, double *yg,
	  int jsta2, int jend1){
  int i,j;
  double pi, hh, hh2;
  hh = xg[1] - xg[0]; hh2 = hh*hh;
  for(i=0;i<m*(ns2-ns1+1);i++) ww[i] = 0.L;
  
  for(j=1;j<ns2-ns1;j++){
    for(i=1;i<m-1;i++){
      ww[i+m*j] += (-2.L*uu[i+m*j]+uu[i-1+m*j] + uu[i+1+m*j])/hh2;
    }
  }
  hh = yg[1] - yg[0]; hh2 = hh*hh;
  for(j=1;j<ns2-ns1;j++){
    for(i=1;i<m-1;i++){
      ww[i+m*j] += (-2.L*uu[i+m*j]+uu[i+m*(j-1)] + uu[i+m*(j+1)])/hh2;
    }
  }
  pi = M_PI;
  for(j=0;j<ns2-ns1+1;j++){
    for(i=0;i<m;i++){
      ww[i+m*j] -= (-2.l*pi*pi*cos(pi*xg[i])*cos(pi*yg[j+ns1-1]));
    }
  }
}
void vnbd(int m, int n, int ns1, int ns2,double *uu){
  int i,j;
  for(j=0;j<ns2-ns1+1;j++) uu[j*m] = uu[1+j*m];
  for(j=0;j<ns2-ns1+1;j++) uu[m-1+m*j] = uu[m-2+m*j];
  if(ns1== 1) for(i=0;i<m;i++) uu[i+m*(1-ns1)] = uu[i+m*(2-ns1)];
  if(ns2 ==n) for(i=0;i<m;i++) uu[i+m*(n-ns1)] = uu[i+m*(n-ns1-1)];
}


void para_range(int n1, int n2, int nid, int myid, int *ista, int *iend){
  int iwork1, iwork2;
  
  iwork1 = (n2-n1+1)/nid;
  iwork2 = (n2-n1+1)%nid;
  *ista = myid*iwork1 + n1 + min(myid, iwork2);
  *iend = *ista + iwork1 - 1;
  if(iwork2 > myid) *iend += 1;
}
void para_range1(int n1, int n2, int nid, int myid, int *ista, int *iend){
  int iwork;
  iwork = (n2-n1)/nid + 1;
  *ista = min(myid * iwork + n1, n2+1);
  *iend = min( *ista + iwork-1, n2);
}


int main(int argc, char **argv){
  double *uu, *vv, *ww;
  double *xg, *yg;
  int i,j,k,miter,iter;
  double pi, test0, hh,hh2;
  int ns1,ns2;
  double *uurow;
  int jsta,jend,jsta2,jend1,iprev,inext;
  MPI_Request isend1,isend2,irecv1,irecv2;
  int iplot,myid,nid;
  MPI_Status istatus;
  double test1,wtime1,wtime2;
  char lexit;
  
  
  
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD,&nid);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);
  
  wtime1 = MPI_Wtime();
  
  int m,n;
  
  
  
  n = (m = 2000);
  xg = (double*)malloc(sizeof(double)*m);
  yg = (double*)malloc(sizeof(double)*m);
  
  
  for(i=0;i<m;i++) xg[i] = 0.L + (1.L-0.L)/(double)(m-1)*(double)(i);
  for(j=0;j<n;j++) yg[j] = 0.L + (1.L-0.L)/(double)(n-1)*(double)(j);
  

  pi = M_PI;
  hh = xg[1] - xg[0];
  hh2 = hh * hh;
  
  
  
  para_range(1,n,nid,myid,&jsta,&jend);
  jsta2 = jsta;
  jend1 = jend;
  
  if(myid == 0) jsta2 = 2;
  if(myid == nid-1) jend1 = n-1;
  ns1 = max(1, jsta-1); 
  ns2 = min(n, jend+1);
  
  uu = (double*)malloc(sizeof(double)*m*(ns2-ns1+1));
  vv = (double*)malloc(sizeof(double)*m*(ns2-ns1+1));
  ww = (double*)malloc(sizeof(double)*m*(ns2-ns1+1));
  inext = myid + 1;
  iprev = myid - 1;
  if(myid == nid-1) inext = MPI_PROC_NULL;
  if(myid == 0) iprev = MPI_PROC_NULL;
  
  for(j=jsta; j<=jend;j++) for(i=0;i<m;i++) uu[i+m*(j-ns1)] = 0;
  
  vnbd(m,n,ns1,ns2,uu);
  for(i=0;i<m*(ns2-ns1+1);i++) vv[i] = uu[i];
  miter = 1000000000;
  miter = 10000;
  
  for(iter=1;iter<=miter;iter++){
    vnbd(m,n,ns1,ns2,uu);
    MPI_Isend(uu+m*(jend-ns1), m , MPI_DOUBLE, inext, 1, MPI_COMM_WORLD, &isend1);
    MPI_Isend(uu+m*(jsta-ns1), m , MPI_DOUBLE, iprev, 1, MPI_COMM_WORLD, &isend2);
    MPI_Irecv(uu+m*(jsta-1-ns1), m , MPI_DOUBLE, iprev, 1, MPI_COMM_WORLD, &irecv1);
    MPI_Irecv(uu+m*(jend+1-ns1), m , MPI_DOUBLE, inext, 1, MPI_COMM_WORLD, &irecv2);
    MPI_Wait(&isend1,&istatus);
    MPI_Wait(&isend2,&istatus);
    MPI_Wait(&irecv1,&istatus);
    MPI_Wait(&irecv2,&istatus);
    
    axmb(m,n,ns1,ns2,uu,ww,xg,yg, jsta2, jend1);
    vnbd(m,n,ns1,ns2,ww);
    for(j=jsta2;j<=jend1;j++){
      for(i=0;i<m;i++){
	uu[i+m*(j-ns1)] = vv[i+m*(j-ns1)] + ww[i+m*(j-ns1)]*(hh2/4.L)*0.9L;
      }
    }
    test0 = -1.e30L;
    double amax = -1.e20L,amax1;
    for(i=0;i<m*(ns2-ns1+1);i++){
      test0 = max(fabs(vv[i]-uu[i]),test0);
      amax = max(fabs(uu[i]),amax);
    }
    amax = amax + 1.e-8L;
    MPI_Reduce(&test0, &test1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&amax, &amax1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if(myid==0){
      amax = amax1; test0 = test1;
    }
    MPI_Bcast(&amax,1,MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(&test0,1,MPI_DOUBLE, 0, MPI_COMM_WORLD);
    test0 = test0/amax;
    for(i=0;i<m*(ns2-ns1+1);i++){
      vv[i] = uu[i];
    }
    if(myid ==0){
      if((iter%1000) == 1 || iter < 1000) printf("%d %24.15e\n",iter, test0);
      lexit = 'N';
      if(test0<1.e-6L) lexit = 'Y';
    }
    MPI_Bcast(&lexit, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
    if(lexit == 'Y') break;
  }
  
  if(myid==0) printf("%24.15g %24.15g\n",uu[0 + m *(1-ns1)],uu[m-1 + m *(1-ns1)]);
  if(myid==nid-1) printf("%24.15g %24.15g\n",uu[0 + m * (n-ns1)],uu[m-1 + m *(n-ns1)]);
  /*
    FILE *wp = fopen("sol","w");
    long ioffset = (jsta-1) * sizeof(double)*(m);
    fseek(wp, ioffset, SEEK_SET);
    for(j=jsta;j<=jend;j++){
    fwrite(uu+(j-ns1)*m, sizeof(double), m, wp);
    }
    fclose(wp);
  */
  
  MPI_Barrier(MPI_COMM_WORLD);
  wtime2 = MPI_Wtime();
  if(myid ==0) printf("[C] np=%d \t %g (sec)\n",nid, wtime2-wtime1);
  iplot = 0;
  iplot = 1;
  /*
    if(iplot ==1 && myid == 0) {
    FILE *fp = fopen("sol", "r");
    wp = fopen("fort.11","w");
    
    
    uurow = (double*)malloc(sizeof(double)*m);
    
    for(j=0;j<n;j++){
    fread(uurow,sizeof(double), m,fp);
    for(i=0;i<m;i++){
    fprintf(wp,"%24.15g %24.15g %24.15g\n",xg[i],yg[j],uurow[i]);
    }
    fprintf(wp,"\n");
    }
    fclose(fp);
    fclose(wp);
    free(uurow);
    }
  */
  free(xg);
  free(yg);
  free(uu);
  free(ww);
  free(vv);
  MPI_Finalize();
}
myparallel.c
#include<stdio.h>
#include<stdlib.h>
#include<stddef.h>
#include<math.h>
#include<float.h>
#include<mpi.h>

#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))

int tm, tn;

void axmb(int m, int n, double *uu, double *ww, double *xg, double *yg, int nprocs, int myrank, int xspos){
	int i,j;
	double pi, hh, hh2;
	hh = xg[1] - xg[0]; hh2 = hh*hh;
	for(i=0;i<m*n;i++) ww[i+n] = 0.L;
	int ispos = 0, iepos = m;
	if(myrank == 0) ispos++;
	if(myrank == nprocs-1) iepos--;

	for(i=ispos;i<iepos;i++){
		for(j=0;j<n;j++){
			ww[j+n*i+n] += (-2.L*uu[j+n*i+n]+uu[j+n*(i-1)+n] + uu[j+n*(i+1)+n])/hh2;
		}
	}
	hh = yg[1] - yg[0]; hh2 = hh*hh;
	for(i=ispos;i<iepos;i++){
		for(j=1;j<n-1;j++){
			ww[j+n*i+n] += (-2.L*uu[j+n*i+n]+uu[j-1+n*i+n] + uu[j+1+n*i+n])/hh2;
		}
	}
	pi = M_PI;
	for(i=0;i<m;i++){
		for(j=0;j<n;j++){
			ww[j+n*i+n] -= (-2.l*pi*pi*cos(pi*xg[i+xspos])*cos(pi*yg[j]));
		}
	}
}
void vndb(int m, int n, double *uu, int nprocs, int myrank){
	int i,j;
	if(myrank == 0)
		for(j=0;j<n;j++) uu[j+n] = uu[j+1*n+n];
	if(myrank == nprocs-1)
		for(j=0;j<n;j++) uu[j+n*(m-1)+n] = uu[j+n*(m-2)+n];
	for(i=0;i<m;i++) uu[n*i+n] = uu[1+n*i+n];
	for(i=0;i<m;i++) uu[n-1+n*i+n] = uu[n-2+n*i+n];
}

void exchange_boundary(int m, int n, double *uu, int nprocs, int myrank){
	int i,j;
	MPI_Request req[4];
	MPI_Status stat[4];
	
	if(myrank < nprocs-1){
		MPI_Isend(&uu[m*n], n, MPI_DOUBLE, myrank+1, 123, MPI_COMM_WORLD, &req[0]);
		MPI_Irecv(&uu[(m+1)*n], n, MPI_DOUBLE, myrank+1, 321, MPI_COMM_WORLD, &req[1]);
	}
	if(myrank > 0){
		MPI_Isend(&uu[n], n, MPI_DOUBLE, myrank-1, 321, MPI_COMM_WORLD, &req[2]);
		MPI_Irecv(&uu[0], n, MPI_DOUBLE, myrank-1, 123, MPI_COMM_WORLD, &req[3]);
	}
	if(myrank < nprocs-1){
		MPI_Waitall(2, req, stat);
	}
	if(myrank > 0){
		MPI_Waitall(2, req+2, stat+2);
	}
}

int main(int argc, char **argv){
	double *uu, *vv, *ww;
	double *xg, *yg;
	int i,j,k,miter,iter;
	double pi, test0, hh,hh2;

	tm = tn = 1000;

	int nprocs, myrank, q, r, asize, xspos;
	int m,n;
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
	q = tm/nprocs;
	r = tm%nprocs;
	asize = myrank<r?(q+1):q;
	m = asize;
	n = tn;
	xspos = myrank<r?(q+1)*myrank:(q*myrank+r);

	xg = (double*)malloc(sizeof(double)*tm);
	yg = (double*)malloc(sizeof(double)*tn);
	uu = (double*)malloc(sizeof(double)*(m+2)*n);
	vv = (double*)malloc(sizeof(double)*(m+2)*n);
	ww = (double*)malloc(sizeof(double)*(m+2)*n);

	for(i=0;i<tm;i++) xg[i] = 0.L + (1.L-0.L)/(double)(tm-1)*(double)(i);
	for(j=0;j<tn;j++) yg[j] = 0.L + (1.L-0.L)/(double)(tn-1)*(double)(j);

	pi = M_PI;
	hh = xg[1] - xg[0];
	hh2 = hh * hh;
	for(i=0;i<m*n;i++) uu[i+n] = 0.l;
	vndb(m,n,uu,nprocs,myrank);
	for(i=0;i<m*n;i++) vv[i+n] = uu[i+n];
	miter = 1000000000;
	miter = 1000;

	int ispos = 0, iepos = m;
	if(myrank == 0) ispos++;
	if(myrank == nprocs-1) iepos--;
	for(iter=1;iter<=miter;iter++){
		vndb(m,n,uu,nprocs,myrank);
		exchange_boundary(m,n,uu,nprocs,myrank);
		
		axmb(m,n,uu,ww,xg,yg,nprocs,myrank,xspos);
		vndb(m,n,ww,nprocs,myrank);
		for(i=ispos;i<iepos;i++){
			for(j=1;j<n-1;j++){
				uu[j+n*i+n] = vv[j+n*i+n] + ww[j+n*i+n]*(hh2/4.L)*0.9L;
			}
		}
		test0 = -DBL_MAX;
		double amax = -DBL_MAX, test1, amax1;
		for(i=0;i<m*n;i++){
			test0 = max(fabs(vv[i+n]-uu[i+n]),test0);
			amax = max(fabs(uu[i+n])+1.E-8L,amax);
		}
		MPI_Reduce(&test0, &test1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(&amax, &amax1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		test0 = test1/amax1;
		MPI_Bcast(&test0, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

		for(i=0;i<m*n;i++){
			vv[i+n] = uu[i+n];
		}
		if(myrank == 0 && ((iter%1000) == 1 || iter < 1000))
			printf("%d %24.15e\n",iter, test0);
		if(test0<1.E-6L ) break;
	}

	//printf("%24.15g %24.15g\n",uu[0],uu[m-1]);
	//printf("%24.15g %24.15g\n",uu[n-1],uu[n-1+n*(m-1)]);
	int *ircnt, *idisp;
	double *uutot;
	if(myrank == 0) {
		ircnt = (int*)malloc(sizeof(int)*nprocs);	
		idisp = (int*)malloc(sizeof(int)*nprocs);
		uutot = (double*)malloc(sizeof(double)*tm*tn);
		idisp[0] = 0;
		for(i = 0; i < nprocs; i++){
			ircnt[i] = (i<r?(q+1):q)*n;
			if(i>0) idisp[i] = idisp[i-1] + ircnt[i-1];
		}
	}

	MPI_Gatherv(uu+n, m*n, MPI_DOUBLE, uutot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	if(myrank == 0){
		FILE *wp = fopen("fort.22", "w");
		for(i=0;i<tm;i++){
			for(j=0;j<tn;j++){
				fprintf(wp,"%g %g %g\n",xg[i],yg[j],uutot[j+tn*i]);
			}
			fprintf(wp,"\n");
		}
		fprintf(wp,"\n");
		free(ircnt);
		free(idisp);
		free(uutot);
	}
	free(xg);
	free(yg);
	free(uu);
	free(ww);
	free(vv);
	MPI_Finalize();
}


'Parallel Programming' 카테고리의 다른 글

KSC 2014 3번 문제 및 답안  (0) 2016.10.01
KSC 2014 2번 문제 및 답안  (0) 2016.10.01
KSC 2013 3번 문제 및 답안  (0) 2016.10.01
KSC 2013 2번 문제 및 답안  (0) 2016.10.01
KSC 2013 1번 문제 및 답안  (0) 2016.10.01

댓글