sequential.c
#include <stdlib.h> #include <stdio.h> #include <math.h> #include <string.h> #ifndef M_PI #define M_PI 3.14159265358979323846 #endif /* int np = 30, ne = 512; */ int np = 5, ne = 10; // np: number of nodes in each element // ne: number of elements // caution: np 20, ne 256 will take approx 10 minutes on a serial job void interface_flux(double *qq, double *fstar, double *ib, double speed); void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed); void lagrange(double xx, double *pts, double *ll); void lagrange_deriv(double xx, double *dl, double *pts); void legendre(int n, double *x, int xsize, double *output); double legendre_scalar(int n, double x); double dlegendre_scalar(int n, double x); double ddlegendre_scalar(int n, double x); double dddlegendre_scalar(int n, double x); void dlegendre_roots(int polyorder, double *output); void gausslobatto_quadrature(int polyorder, double *roots, double *weights); double dot_product(double *v, double *u, int n); void save_field(double *xx, double *qq, int elem_num, double *roots, int eres); void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type); int main(int argc, char **argv){ double tend = 1E2, speed = 1.; // double tend = 1E-1, speed = 1.; char *init_type="mixed2"; double *roots, *weights, *ll, *dl, xmin, xmax, deltax, jac, xr, xl, cfl, dt, rtime, min_dx; int ii, jj, kk, ee, idx, eres; long nstep; double *dx, *mesh; double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv, *mf, *ib, *df, *fstar; // initialize // fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne) // c 1d index structure array = [ee*np*np + jj*np + ii] roots = (double*)malloc(np* sizeof(double)); weights = (double*)malloc(np* sizeof(double)); ll = (double*)malloc(np* sizeof(double)); dl = (double*)malloc(np* sizeof(double)); dx = (double*)malloc(ne* sizeof(double)); mesh = (double*)malloc((ne+1)*sizeof(double)); smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np] xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne] df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np] for (ii=0; ii<np; ++ii){ roots[ii] = 0; weights[ii] = 0; ll[ii] = 0; dl[ii] = 0; } for (ii=0; ii<ne; ++ii){ dx[ii] = 0; mesh[ii] = 0; } mesh[ne] = 0; for (ii=0; ii<np*np; ++ii){ smat[ii] = 0; } for (ii=0; ii<ne*np; ++ii){ xx[ii] = 0; qq[ii] = 0; k1[ii] = 0; k2[ii] = 0; k3[ii] = 0; k4[ii] = 0; minv_vec[ii] = 0; } for (ii=0; ii<ne*np*np; ++ii){ mmat[ii] = 0; dv[ii] = 0; } for (ii=0; ii<np*2; ++ii){ mf[ii] = 0; ib[ii] = 0; } for (ii=0; ii<ne*2; ++ii){ fstar[ii] = 0; } for (ii=0; ii<ne*2*np; ++ii){ df[ii] = 0; } // mesh setup xmin = 0.; xmax = 10.; deltax = (xmax-xmin)/(double)ne; mesh[ne] = xmax; for(ee=0;ee<ne;++ee){ mesh[ee] = xmin+ee*deltax; } // gauss lobatto quadrature point, weight setup gausslobatto_quadrature(np, roots, weights); // coordinates and element size min_dx = xmax - xmin; // initial guess for(ee=0;ee<ne;ee++){ xl = mesh[ee]; xr = mesh[ee+1]; dx[ee] = xr-xl; // size of each element if(dx[ee] < min_dx){ min_dx = dx[ee]; // finding minimum dx } for(ii=0;ii<np;ii++){ idx = ee*np+ii; xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee]; } } // mass matrix for(ii=0;ii<ne*np*np;ii++){ mmat[ii] = 0; } for(ee=0;ee<ne;ee++){ jac = fabs(dx[ee])/2; for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = ee*np*np+jj*np+ii; // mass matrix mmat[ne][np][np] in 1d index representation mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj]; } } } } // stiffness matrix for(ii=0;ii<np*np;ii++){ smat[ii] = 0; } for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); lagrange_deriv(roots[kk], dl, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = jj*np+ii; // stiffness matrix smat[np][np] in 1d index representation smat[idx] += weights[kk]*ll[jj]*dl[ii]; } } } // face integration for(ii=0;ii<np*2;ii++){ mf[ii] = 0; } lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration // boundary interpolation for(ii=0;ii<np*2;ii++){ ib[ii] = 0; } lagrange(-1,ib, roots); // element left edge interpolation lagrange( 1,ib+np,roots); // element right edge interpolation // divergence operators for(ii=0;ii<ne*np*np;ii++){ dv[ii] = 0; } for(ii=0;ii<ne*np*2;ii++){ dv[ii] = 0; } for(ee=0;ee<ne;ee++){ for(jj=0;jj<np;jj++){ // it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0 // the inverse of mmat is just the inverse of the diagonal components // here, we are extracting the inverse diagonal components only minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj]; } for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii]; } } for(jj=0;jj<2;jj++){ for(ii=0;ii<np;ii++){ df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii]; } } } // initialize qq field initialize(qq, xx, xmax, xmin, init_type); cfl = 1./(np*np); dt = cfl * min_dx / fabs(speed); rtime = 0.; nstep = 0; printf("Start Time Integration\n"); // Runge-Kutta 4th order Time integration loop while(rtime < tend){ dt = fmin(dt, tend-rtime); rhs(qq, k1, dv, df, ib, speed); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k1[ii]; rhs(qtemp, k2, dv, df, ib, speed); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k2[ii]; rhs(qtemp, k3, dv, df, ib, speed); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+dt*k3[ii]; rhs(qtemp, k4, dv, df, ib, speed); for(ii=0;ii<ne*np;ii++) qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]); rtime += dt; nstep += 1; if(nstep%10000 == 0) printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100); } // timeloop ends here; printf("Integration complete\n"); if(ne > 200){ eres = 2; } else if (ne > 60){ eres = 3; } else if (ne > 30){ eres = 6; } else { eres = 10; } // final report printf("-----------------------------------------------\n"); printf("code type : c serial\n"); printf("Final time : %13.5e\n", rtime); printf("CFL : %13.5e\n", cfl); printf("DOF : %13d\n", ne*np); printf("No. of Elem : %13d\n", ne); printf("Order : %13d\n", np); printf("eres : %13d\n", eres); printf("time steps : %13ld\n", nstep); printf("-----------------------------------------------\n"); save_field(xx, qq, ne, roots, eres); free(roots); free(weights); free(ll); free(dl); free(dx); free(mesh); free(smat); free(xx); free(qq); free(qtemp); free(k1); free(k2); free(k3); free(k4); free(minv_vec); free(mmat); free(dv); free(mf); free(ib); free(fstar); free(df); return 0; } //////////////////////////////////////////////////////////////////////////// // parallelization needed functions: interface_flux & rhs void interface_flux(double *qq, double *fstar, double *ib, double speed){ int ii, ee; double qb[2*ne]; for(ii=0;ii<2*ne;ii++){ fstar[ii] = 0; } // fstar[0:ne-1] stores numerical flux of the left edge on each elements // fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements for(ii=0;ii<ne;ii++){ qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii } ii = 0; // calculating numerical flux (fstar) with periodic boundary condition fstar[ii] = -((qb[ne+ne-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ne-1]-qb[ii])/2); fstar[ne+ne-1] = -fstar[0]; for(ii=1;ii<ne;ii++){ fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2); fstar[ne+ii-1] = -fstar[ii]; } } void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed){ int ii, jj, ee, idx; double fstar[2*ne], term; for(ii=0;ii<np*ne;ii++){ rr[ii] = 0; } for(ee=0;ee<ne;ee++){ for(ii=0;ii<np;ii++){ for(jj=0; jj<np; jj++){ rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj]; } } } interface_flux(qq, fstar, ib, speed); for(ee=0;ee<ne;ee++){ for(ii=0; ii<np; ii++){ for(jj=0;jj<2;jj++){ rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne]; } } } }
answer.c
#include <stdlib.h> #include <stdio.h> #include <math.h> #include <string.h> #include <mpi.h> #define IMIN(a, b) (((a) < (b)) ? (a) : (b)) #ifndef M_PI #define M_PI 3.14159265358979323846 #endif int np = 30, tne = 512, ne; /* int np = 4, tne = 5, ne; */ // np: number of nodes in each element // tne: total number of elements // ne: number of elements in each rank void equal_load(int n1, int n2, int nprocs, int myid, int *istart, int *ifinish); void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank); void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank); void lagrange(double xx, double *pts, double *ll); void lagrange_deriv(double xx, double *dl, double *pts); void legendre(int n, double *x, int xsize, double *output); double legendre_scalar(int n, double x); double dlegendre_scalar(int n, double x); double ddlegendre_scalar(int n, double x); double dddlegendre_scalar(int n, double x); void dlegendre_roots(int polyorder, double *output); void gausslobatto_quadrature(int polyorder, double *roots, double *weights); double dot_product(double *v, double *u, int n); void save_field(double *xx, double *qq, int elem_num, double *roots, int eres); void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type); double lxmin, lxmax; int main(int argc, char **argv){ double tend = 0.5, speed = 1.; // double tend = 1E-1, speed = 1.; char *init_type="mixed2"; double *roots, *weights, *ll, *dl, xmin, xmax, deltax, jac, xr, xl, cfl, dt, rtime, min_dx; int ii, jj, kk, ee, idx, eres; long nstep; double *dx, *mesh; double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv, *mf, *ib, *df, *fstar, *gxx, *gqq, *buff1, *buff2; int istart, ifinish, myrank, nprocs; MPI_Request ireq1, ireq2, ireq3, ireq4; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD,&myrank); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); equal_load(0, tne-1, nprocs, myrank, &istart, &ifinish); ne = ifinish - istart + 1; // initialize // fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne) // c 1d index structure array = [ee*np*np + jj*np + ii] roots = (double*)malloc(np* sizeof(double)); weights = (double*)malloc(np* sizeof(double)); ll = (double*)malloc(np* sizeof(double)); dl = (double*)malloc(np* sizeof(double)); dx = (double*)malloc(ne* sizeof(double)); mesh = (double*)malloc((ne+1)*sizeof(double)); smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np] xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne] df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np] for (ii=0; ii<np; ++ii){ roots[ii] = 0; weights[ii] = 0; ll[ii] = 0; dl[ii] = 0; } for (ii=0; ii<ne; ++ii){ dx[ii] = 0; mesh[ii] = 0; } mesh[ne] = 0; for (ii=0; ii<np*np; ++ii){ smat[ii] = 0; } for (ii=0; ii<ne*np; ++ii){ xx[ii] = 0; qq[ii] = 0; qtemp[ii] = 0; k1[ii] = 0; k2[ii] = 0; k3[ii] = 0; k4[ii] = 0; minv_vec[ii] = 0; } for (ii=0; ii<ne*np*np; ++ii){ mmat[ii] = 0; dv[ii] = 0; } for (ii=0; ii<np*2; ++ii){ mf[ii] = 0; ib[ii] = 0; } for (ii=0; ii<ne*2; ++ii){ fstar[ii] = 0; } for (ii=0; ii<ne*2*np; ++ii){ df[ii] = 0; } // mesh setup xmin = 0.; xmax = 10.; deltax = (xmax-xmin)/(double)tne; lxmin = xmin + (istart)*deltax; lxmax = xmin + (ifinish+1)*deltax; mesh[ne] = lxmax; for(ee=0;ee<ne;++ee){ mesh[ee] = lxmin+ee*deltax; } // gauss lobatto quadrature point, weight setup gausslobatto_quadrature(np, roots, weights); // coordinates and element size min_dx = xmax - xmin; // initial guess for(ee=0;ee<ne;ee++){ xl = mesh[ee]; xr = mesh[ee+1]; dx[ee] = xr-xl; // size of each element if(dx[ee] < min_dx){ min_dx = dx[ee]; // finding minimum dx } for(ii=0;ii<np;ii++){ idx = ee*np+ii; xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee]; } } // mass matrix for(ii=0;ii<ne*np*np;ii++){ mmat[ii] = 0; } for(ee=0;ee<ne;ee++){ jac = fabs(dx[ee])/2; for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = ee*np*np+jj*np+ii; // mass matrix mmat[ne][np][np] in 1d index representation mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj]; } } } } // stiffness matrix for(ii=0;ii<np*np;ii++){ smat[ii] = 0; } for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); lagrange_deriv(roots[kk], dl, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = jj*np+ii; // stiffness matrix smat[np][np] in 1d index representation smat[idx] += weights[kk]*ll[jj]*dl[ii]; } } } // face integration for(ii=0;ii<np*2;ii++){ mf[ii] = 0; } lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration // boundary interpolation for(ii=0;ii<np*2;ii++){ ib[ii] = 0; } lagrange(-1,ib, roots); // element left edge interpolation lagrange( 1,ib+np,roots); // element right edge interpolation // divergence operators for(ii=0;ii<ne*np*np;ii++){ dv[ii] = 0; } for(ii=0;ii<ne*np*2;ii++){ dv[ii] = 0; } for(ee=0;ee<ne;ee++){ for(jj=0;jj<np;jj++){ // it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0 // the inverse of mmat is just the inverse of the diagonal components // here, we are extracting the inverse diagonal components only minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj]; } for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii]; } } for(jj=0;jj<2;jj++){ for(ii=0;ii<np;ii++){ df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii]; } } } // initialize qq field initialize(qq, xx, xmax, xmin, init_type); cfl = 1./(np*np); dt = cfl * min_dx / fabs(speed); rtime = 0.; nstep = 0; if(myrank == 0){ printf("Start Time Integration\n"); } // Runge-Kutta 4th order Time integration loop while(rtime < tend){ dt = fmin(dt, tend-rtime); rhs(qq, k1, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k1[ii]; rhs(qtemp, k2, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k2[ii]; rhs(qtemp, k3, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+dt*k3[ii]; rhs(qtemp, k4, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]); rtime += dt; nstep += 1; if(myrank == 0 && (nstep%10000 == 0)) printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100); } // timeloop ends here; if(myrank != 0){ MPI_Isend(xx, np*ne, MPI_DOUBLE, 0, 10, MPI_COMM_WORLD, &ireq1); MPI_Isend(qq, np*ne, MPI_DOUBLE, 0, 20, MPI_COMM_WORLD, &ireq2); MPI_Isend(&istart, 1, MPI_INT, 0, 30, MPI_COMM_WORLD, &ireq3); MPI_Isend(&ifinish, 1, MPI_INT, 0, 40, MPI_COMM_WORLD, &ireq4); MPI_Wait(&ireq3, &status); MPI_Wait(&ireq4, &status); MPI_Wait(&ireq1, &status); MPI_Wait(&ireq2, &status); } if(myrank == 0){ printf("Integration complete\n"); if(tne > 200){ eres = 2; } else if (tne > 60){ eres = 3; } else if (tne > 30){ eres = 6; } else { eres = 10; } // final report printf("-----------------------------------------------\n"); printf("code type : c parallel\n"); printf("Final time : %13.5e\n", rtime); printf("CFL : %13.5e\n", cfl); printf("DOF : %13d\n", tne*np); printf("No. of Elem : %13d\n", tne); printf("Order : %13d\n", np); printf("eres : %13d\n", eres); printf("time steps : %13ld\n", nstep); printf("-----------------------------------------------\n"); gxx = (double*)malloc(tne*np*sizeof(double)); gqq = (double*)malloc(tne*np*sizeof(double)); buff1 = (double*)malloc((tne/nprocs+1)*np*sizeof(double)); buff2 = (double*)malloc((tne/nprocs+1)*np*sizeof(double)); for(ii=0;ii<ne*np;ii++){ gxx[ii] = xx[ii]; gqq[ii] = qq[ii]; } kk = ne*np; for(ii=1;ii<nprocs;ii++){ MPI_Irecv(&istart, 1, MPI_INT, ii, 30, MPI_COMM_WORLD, &ireq3); MPI_Irecv(&ifinish, 1, MPI_INT, ii, 40, MPI_COMM_WORLD, &ireq4); MPI_Wait(&ireq3, &status); MPI_Wait(&ireq4, &status); ne = ifinish-istart+1; MPI_Irecv(buff1, np*ne, MPI_DOUBLE, ii, 10, MPI_COMM_WORLD, &ireq1); MPI_Irecv(buff2, np*ne, MPI_DOUBLE, ii, 20, MPI_COMM_WORLD, &ireq2); MPI_Wait(&ireq1, &status); MPI_Wait(&ireq2, &status); for(jj=0; jj<ne*np;jj++){ gxx[kk+jj] = buff1[jj]; gqq[kk+jj] = buff2[jj]; } kk += ne*np; } save_field(gxx, gqq, tne, roots, eres); free(gxx); free(gqq); free(buff1); free(buff2); } free(roots); free(weights); free(ll); free(dl); free(dx); free(mesh); free(smat); free(xx); free(qq); free(qtemp); free(k1); free(k2); free(k3); free(k4); free(minv_vec); free(mmat); free(dv); free(mf); free(ib); free(fstar); free(df); MPI_Finalize(); return 0; } //////////////////////////////////////////////////////////////////////////// // parallelization needed functions: interface_flux & rhs void equal_load(int n1, int n2, int nprocs, int myid, int *istart, int *ifinish){ int iw1, iw2; iw1 = (n2-n1+1)/nprocs; iw2 = (n2-n1+1)%nprocs; *istart = myid*iw1+n1+IMIN(myid,iw2); *ifinish= *istart+iw1-1; if(iw2>myid) *ifinish+=1; if(n2 < *istart) *ifinish = *istart-1; } void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank){ int ii, ee; double qb[2*ne], qb_start; MPI_Status status; MPI_Request ireq1, ireq2, ireq3, ireq4; for(ii=0;ii<2*ne;ii++){ fstar[ii] = 0; } for(ii=0;ii<ne;ii++){ qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii } // fstar[0:ne-1] stores numerical flux of the left edge on each elements // fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements if(myrank == 0){ MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, myrank+1, 80, MPI_COMM_WORLD, &ireq1); MPI_Irecv(&qb_start, 1, MPI_DOUBLE, nprocs-1, 80, MPI_COMM_WORLD, &ireq2); } else if (myrank == nprocs-1){ MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, 0, 80, MPI_COMM_WORLD, &ireq1); MPI_Irecv(&qb_start, 1, MPI_DOUBLE, myrank-1, 80, MPI_COMM_WORLD, &ireq2); } else{ MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, myrank+1, 80, MPI_COMM_WORLD, &ireq1); MPI_Irecv(&qb_start, 1, MPI_DOUBLE, myrank-1, 80, MPI_COMM_WORLD, &ireq2); } MPI_Wait(&ireq1, &status); MPI_Wait(&ireq2, &status); ee = 0; // calculating numerical flux (fstar) with periodic boundary condition fstar[ee] = -((qb_start+qb[ee])/2.*speed + fabs(speed)*(qb_start-qb[ee])/2.); if(myrank == 0){ MPI_Isend(fstar, 1, MPI_DOUBLE, nprocs-1, 90, MPI_COMM_WORLD, &ireq3); } else{ MPI_Isend(fstar, 1, MPI_DOUBLE, myrank-1, 90, MPI_COMM_WORLD, &ireq3); } if(myrank == nprocs-1){ MPI_Irecv(fstar+2*ne-1, 1, MPI_DOUBLE, 0, 90, MPI_COMM_WORLD, &ireq4); } else{ MPI_Irecv(fstar+2*ne-1, 1, MPI_DOUBLE, myrank+1, 90, MPI_COMM_WORLD, &ireq4); } for(ee=1;ee<ne;ee++){ fstar[ee] = -((qb[ne+ee-1]+qb[ee])/2.*speed + fabs(speed)*(qb[ne+ee-1]-qb[ee])/2.); fstar[ne+ee-1] = -fstar[ee]; } MPI_Wait(&ireq3, &status); MPI_Wait(&ireq4, &status); fstar[2*ne-1] *= -1.; } void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank){ int ii, jj, ee; double fstar[2*ne]; for(ii=0;ii<np*ne;ii++){ rr[ii] = 0; } for(ee=0;ee<ne;ee++){ for(ii=0;ii<np;ii++){ for(jj=0; jj<np; jj++){ rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj]; } } } interface_flux(qq, fstar, ib, speed, nprocs, myrank); for(ee=0;ee<ne;ee++){ for(ii=0; ii<np; ii++){ for(jj=0;jj<2;jj++){ rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne]; } } } }
myparallel.c
#include <stdlib.h> #include <stdio.h> #include <math.h> #include <string.h> #include <mpi.h> #ifndef M_PI #define M_PI 3.14159265358979323846 #endif /* int np = 30, ne = 512; */ int np = 5, ne, tne = 10; // np: number of nodes in each element // ne: number of elements // caution: np 20, ne 256 will take approx 10 minutes on a serial job void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank); void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank); void lagrange(double xx, double *pts, double *ll); void lagrange_deriv(double xx, double *dl, double *pts); void legendre(int n, double *x, int xsize, double *output); double legendre_scalar(int n, double x); double dlegendre_scalar(int n, double x); double ddlegendre_scalar(int n, double x); double dddlegendre_scalar(int n, double x); void dlegendre_roots(int polyorder, double *output); void gausslobatto_quadrature(int polyorder, double *roots, double *weights); double dot_product(double *v, double *u, int n); void save_field(double *xx, double *qq, int elem_num, double *roots, int eres); void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type); int main(int argc, char **argv){ double tend = 1E2, speed = 1.; // double tend = 1E-1, speed = 1.; char *init_type="mixed2"; double *roots, *weights, *ll, *dl, xmin, xmax, deltax, jac, xr, xl, cfl, dt, rtime, min_dx; int ii, jj, kk, ee, idx, eres; long nstep; double *dx, *mesh; double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv, *mf, *ib, *df, *fstar; // newly added variables for parallelization int nprocs, myrank, q, r, esta; int *ircnt, *idisp; double *xxtot, *qqtot; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); q = tne/nprocs; r = tne%nprocs; ne = myrank<r?(q+1):q; esta = myrank<r?(q+1)*myrank:(q*myrank+r); // initialize // fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne) // c 1d index structure array = [ee*np*np + jj*np + ii] roots = (double*)malloc(np* sizeof(double)); weights = (double*)malloc(np* sizeof(double)); ll = (double*)malloc(np* sizeof(double)); dl = (double*)malloc(np* sizeof(double)); dx = (double*)malloc(ne* sizeof(double)); mesh = (double*)malloc((ne+1)*sizeof(double)); smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np] xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np] mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np] mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np] fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne] df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np] if(!myrank) { xxtot = (double*)malloc(tne*np*sizeof(double)); // [ee ne, ii np] qqtot = (double*)malloc(tne*np*sizeof(double)); // [ee ne, ii np] ircnt = (int*)malloc(nprocs*sizeof(int)); idisp = (int*)malloc(nprocs*sizeof(int)); idisp[0] = 0; for(ii=0; ii<nprocs; ii++){ ircnt[ii] = (myrank<r?(q+1):q)*np; if(ii>0) idisp[ii] = idisp[ii-1] + ircnt[ii-1]; printf("ircnt[%d] = %d, idisp[%d] = %d\n", ii, ircnt[ii], ii, idisp[ii]); } } for (ii=0; ii<np; ++ii){ roots[ii] = 0; weights[ii] = 0; ll[ii] = 0; dl[ii] = 0; } for (ii=0; ii<ne; ++ii){ dx[ii] = 0; mesh[ii] = 0; } mesh[ne] = 0; for (ii=0; ii<np*np; ++ii){ smat[ii] = 0; } for (ii=0; ii<ne*np; ++ii){ xx[ii] = 0; qq[ii] = 0; k1[ii] = 0; k2[ii] = 0; k3[ii] = 0; k4[ii] = 0; minv_vec[ii] = 0; } for (ii=0; ii<ne*np*np; ++ii){ mmat[ii] = 0; dv[ii] = 0; } for (ii=0; ii<np*2; ++ii){ mf[ii] = 0; ib[ii] = 0; } for (ii=0; ii<ne*2; ++ii){ fstar[ii] = 0; } for (ii=0; ii<ne*2*np; ++ii){ df[ii] = 0; } // mesh setup xmin = 0.; xmax = 10.; deltax = (xmax-xmin)/(double)tne; //mesh[ne] = xmax; for(ee=0;ee<ne+1;++ee){ mesh[ee] = xmin+(ee+esta)*deltax; } if(myrank == nprocs-1) mesh[ne] = xmax; // gauss lobatto quadrature point, weight setup gausslobatto_quadrature(np, roots, weights); // coordinates and element size min_dx = xmax - xmin; // initial guess for(ee=0;ee<ne;ee++){ xl = mesh[ee]; xr = mesh[ee+1]; dx[ee] = xr-xl; // size of each element if(dx[ee] < min_dx){ min_dx = dx[ee]; // finding minimum dx } for(ii=0;ii<np;ii++){ idx = ee*np+ii; xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee]; } } // mass matrix for(ii=0;ii<ne*np*np;ii++){ mmat[ii] = 0; } for(ee=0;ee<ne;ee++){ jac = fabs(dx[ee])/2; for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = ee*np*np+jj*np+ii; // mass matrix mmat[ne][np][np] in 1d index representation mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj]; } } } } // stiffness matrix for(ii=0;ii<np*np;ii++){ smat[ii] = 0; } for(kk=0;kk<np;kk++){ lagrange(roots[kk], ll, roots); lagrange_deriv(roots[kk], dl, roots); for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ idx = jj*np+ii; // stiffness matrix smat[np][np] in 1d index representation smat[idx] += weights[kk]*ll[jj]*dl[ii]; } } } // face integration for(ii=0;ii<np*2;ii++){ mf[ii] = 0; } lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration // boundary interpolation for(ii=0;ii<np*2;ii++){ ib[ii] = 0; } lagrange(-1,ib, roots); // element left edge interpolation lagrange( 1,ib+np,roots); // element right edge interpolation // divergence operators for(ii=0;ii<ne*np*np;ii++){ dv[ii] = 0; } for(ii=0;ii<ne*np*2;ii++){ dv[ii] = 0; } for(ee=0;ee<ne;ee++){ for(jj=0;jj<np;jj++){ // it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0 // the inverse of mmat is just the inverse of the diagonal components // here, we are extracting the inverse diagonal components only minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj]; } for(jj=0;jj<np;jj++){ for(ii=0;ii<np;ii++){ dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii]; } } for(jj=0;jj<2;jj++){ for(ii=0;ii<np;ii++){ df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii]; } } } // initialize qq field initialize(qq, xx, xmax, xmin, init_type); cfl = 1./(np*np); dt = cfl * min_dx / fabs(speed); rtime = 0.; nstep = 0; if(!myrank) printf("Start Time Integration\n"); // Runge-Kutta 4th order Time integration loop while(rtime < tend){ dt = fmin(dt, tend-rtime); rhs(qq, k1, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k1[ii]; rhs(qtemp, k2, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+0.5*dt*k2[ii]; rhs(qtemp, k3, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qtemp[ii] = qq[ii]+dt*k3[ii]; rhs(qtemp, k4, dv, df, ib, speed, nprocs, myrank); for(ii=0;ii<ne*np;ii++) qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]); rtime += dt; nstep += 1; if((myrank==0) && nstep%10000 == 0) printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100); } printf("end..?\n"); // timeloop ends here; //printf("ne*np = %d, ircnt[0] = %d, idisp[0] = %d\n", ne*np, ircnt[0], idisp[0]); MPI_Gatherv(xx, ne*np, MPI_DOUBLE, xxtot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gatherv(qq, ne*np, MPI_DOUBLE, qqtot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD); printf("end..?\n"); if(!myrank){ printf("Integration complete\n"); if(ne > 200){ eres = 2; } else if (ne > 60){ eres = 3; } else if (ne > 30){ eres = 6; } else { eres = 10; } // final report printf("-----------------------------------------------\n"); printf("code type : c parallel\n"); printf("Final time : %13.5e\n", rtime); printf("CFL : %13.5e\n", cfl); printf("DOF : %13d\n", tne*np); printf("No. of Elem : %13d\n", tne); printf("Order : %13d\n", np); printf("eres : %13d\n", eres); printf("time steps : %13ld\n", nstep); printf("-----------------------------------------------\n"); save_field(xxtot, qqtot, tne, roots, eres); free(idisp); free(ircnt); free(xxtot); free(qqtot); } free(roots); free(weights); free(ll); free(dl); free(dx); free(mesh); free(smat); free(xx); free(qq); free(qtemp); free(k1); free(k2); free(k3); free(k4); free(minv_vec); free(mmat); free(dv); free(mf); free(ib); free(fstar); free(df); MPI_Finalize(); return 0; } //////////////////////////////////////////////////////////////////////////// // parallelization needed functions: interface_flux & rhs void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank){ int ii, ee; double qb[2*ne]; for(ii=0;ii<2*ne;ii++){ fstar[ii] = 0; } // fstar[0:ne-1] stores numerical flux of the left edge on each elements // fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements for(ii=0;ii<ne;ii++){ qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii } ii = 0; // calculating numerical flux (fstar) with periodic boundary condition double qbf, fsta; double aspeed = fabs(speed); int prev = (myrank - 1 + nprocs)%nprocs; int next = (myrank + 1)%nprocs; MPI_Request req[4]; MPI_Status stat[4]; MPI_Isend(&qb[ne+ne-1], 1, MPI_DOUBLE, next, 123, MPI_COMM_WORLD, &req[0]); MPI_Irecv(&qbf, 1, MPI_DOUBLE, prev, 123, MPI_COMM_WORLD, &req[1]); MPI_Waitall(2, req, stat); fstar[0] = -((qbf+qb[0])/2*speed + fabs(speed)*(qbf-qb[0])/2); MPI_Isend(&fstar[0], 1, MPI_DOUBLE, prev, 321, MPI_COMM_WORLD, &req[2]); MPI_Irecv(&fsta, 1, MPI_DOUBLE, next, 321, MPI_COMM_WORLD, &req[3]); MPI_Waitall(2, req+2, stat+2); fstar[ne+ne-1] = -fsta; for(ii=1;ii<ne;ii++){ fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2); fstar[ne+ii-1] = -fstar[ii]; } /* fstar[0] = -((qb[ne+ne-1]+qb[0])/2*speed + fabs(speed)*(qb[ne+ne-1]-qb[0])/2); fstar[ne+ne-1] = -fstar[0]; for(ii=1;ii<ne;ii++){ fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2); fstar[ne+ii-1] = -fstar[ii]; } */ } void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank){ int ii, jj, ee, idx; double fstar[2*ne], term; for(ii=0;ii<np*ne;ii++){ rr[ii] = 0; } for(ee=0;ee<ne;ee++){ for(ii=0;ii<np;ii++){ for(jj=0; jj<np; jj++){ rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj]; } } } interface_flux(qq, fstar, ib, speed, nprocs, myrank); for(ee=0;ee<ne;ee++){ for(ii=0; ii<np; ii++){ for(jj=0;jj<2;jj++){ rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne]; } } } }
'Parallel Programming' 카테고리의 다른 글
MPI Function Dictionary (1) | 2016.10.04 |
---|---|
KSC 2015 3번 문제 및 풀이 (0) | 2016.10.01 |
KSC 2015 1번 문제 및 답안 (0) | 2016.10.01 |
KSC 2014 3번 문제 및 답안 (0) | 2016.10.01 |
KSC 2014 2번 문제 및 답안 (0) | 2016.10.01 |
댓글