sequential.c
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
/* int np = 30, ne = 512; */
int np = 5, ne = 10;
// np: number of nodes in each element
// ne: number of elements
// caution: np 20, ne 256 will take approx 10 minutes on a serial job
void interface_flux(double *qq, double *fstar, double *ib, double speed);
void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed);
void lagrange(double xx, double *pts, double *ll);
void lagrange_deriv(double xx, double *dl, double *pts);
void legendre(int n, double *x, int xsize, double *output);
double legendre_scalar(int n, double x);
double dlegendre_scalar(int n, double x);
double ddlegendre_scalar(int n, double x);
double dddlegendre_scalar(int n, double x);
void dlegendre_roots(int polyorder, double *output);
void gausslobatto_quadrature(int polyorder, double *roots, double *weights);
double dot_product(double *v, double *u, int n);
void save_field(double *xx, double *qq, int elem_num, double *roots, int eres);
void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type);
int main(int argc, char **argv){
double tend = 1E2, speed = 1.;
// double tend = 1E-1, speed = 1.;
char *init_type="mixed2";
double *roots, *weights, *ll, *dl, xmin, xmax,
deltax, jac, xr, xl, cfl, dt, rtime, min_dx;
int ii, jj, kk, ee, idx, eres;
long nstep;
double *dx, *mesh;
double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv,
*mf, *ib, *df, *fstar;
// initialize
// fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne)
// c 1d index structure array = [ee*np*np + jj*np + ii]
roots = (double*)malloc(np* sizeof(double));
weights = (double*)malloc(np* sizeof(double));
ll = (double*)malloc(np* sizeof(double));
dl = (double*)malloc(np* sizeof(double));
dx = (double*)malloc(ne* sizeof(double));
mesh = (double*)malloc((ne+1)*sizeof(double));
smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np]
xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne]
df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np]
for (ii=0; ii<np; ++ii){
roots[ii] = 0;
weights[ii] = 0;
ll[ii] = 0;
dl[ii] = 0;
}
for (ii=0; ii<ne; ++ii){
dx[ii] = 0;
mesh[ii] = 0;
}
mesh[ne] = 0;
for (ii=0; ii<np*np; ++ii){
smat[ii] = 0;
}
for (ii=0; ii<ne*np; ++ii){
xx[ii] = 0;
qq[ii] = 0;
k1[ii] = 0;
k2[ii] = 0;
k3[ii] = 0;
k4[ii] = 0;
minv_vec[ii] = 0;
}
for (ii=0; ii<ne*np*np; ++ii){
mmat[ii] = 0;
dv[ii] = 0;
}
for (ii=0; ii<np*2; ++ii){
mf[ii] = 0;
ib[ii] = 0;
}
for (ii=0; ii<ne*2; ++ii){
fstar[ii] = 0;
}
for (ii=0; ii<ne*2*np; ++ii){
df[ii] = 0;
}
// mesh setup
xmin = 0.;
xmax = 10.;
deltax = (xmax-xmin)/(double)ne;
mesh[ne] = xmax;
for(ee=0;ee<ne;++ee){
mesh[ee] = xmin+ee*deltax;
}
// gauss lobatto quadrature point, weight setup
gausslobatto_quadrature(np, roots, weights);
// coordinates and element size
min_dx = xmax - xmin; // initial guess
for(ee=0;ee<ne;ee++){
xl = mesh[ee];
xr = mesh[ee+1];
dx[ee] = xr-xl; // size of each element
if(dx[ee] < min_dx){
min_dx = dx[ee]; // finding minimum dx
}
for(ii=0;ii<np;ii++){
idx = ee*np+ii;
xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee];
}
}
// mass matrix
for(ii=0;ii<ne*np*np;ii++){
mmat[ii] = 0;
}
for(ee=0;ee<ne;ee++){
jac = fabs(dx[ee])/2;
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = ee*np*np+jj*np+ii;
// mass matrix mmat[ne][np][np] in 1d index representation
mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj];
}
}
}
}
// stiffness matrix
for(ii=0;ii<np*np;ii++){
smat[ii] = 0;
}
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
lagrange_deriv(roots[kk], dl, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = jj*np+ii;
// stiffness matrix smat[np][np] in 1d index representation
smat[idx] += weights[kk]*ll[jj]*dl[ii];
}
}
}
// face integration
for(ii=0;ii<np*2;ii++){
mf[ii] = 0;
}
lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration
lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration
// boundary interpolation
for(ii=0;ii<np*2;ii++){
ib[ii] = 0;
}
lagrange(-1,ib, roots); // element left edge interpolation
lagrange( 1,ib+np,roots); // element right edge interpolation
// divergence operators
for(ii=0;ii<ne*np*np;ii++){
dv[ii] = 0;
}
for(ii=0;ii<ne*np*2;ii++){
dv[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(jj=0;jj<np;jj++){
// it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0
// the inverse of mmat is just the inverse of the diagonal components
// here, we are extracting the inverse diagonal components only
minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj];
}
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii];
}
}
for(jj=0;jj<2;jj++){
for(ii=0;ii<np;ii++){
df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii];
}
}
}
// initialize qq field
initialize(qq, xx, xmax, xmin, init_type);
cfl = 1./(np*np);
dt = cfl * min_dx / fabs(speed);
rtime = 0.;
nstep = 0;
printf("Start Time Integration\n");
// Runge-Kutta 4th order Time integration loop
while(rtime < tend){
dt = fmin(dt, tend-rtime);
rhs(qq, k1, dv, df, ib, speed);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k1[ii];
rhs(qtemp, k2, dv, df, ib, speed);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k2[ii];
rhs(qtemp, k3, dv, df, ib, speed);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+dt*k3[ii];
rhs(qtemp, k4, dv, df, ib, speed);
for(ii=0;ii<ne*np;ii++)
qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]);
rtime += dt;
nstep += 1;
if(nstep%10000 == 0)
printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100);
}
// timeloop ends here;
printf("Integration complete\n");
if(ne > 200){
eres = 2;
}
else if (ne > 60){
eres = 3;
}
else if (ne > 30){
eres = 6;
}
else {
eres = 10;
}
// final report
printf("-----------------------------------------------\n");
printf("code type : c serial\n");
printf("Final time : %13.5e\n", rtime);
printf("CFL : %13.5e\n", cfl);
printf("DOF : %13d\n", ne*np);
printf("No. of Elem : %13d\n", ne);
printf("Order : %13d\n", np);
printf("eres : %13d\n", eres);
printf("time steps : %13ld\n", nstep);
printf("-----------------------------------------------\n");
save_field(xx, qq, ne, roots, eres);
free(roots);
free(weights);
free(ll);
free(dl);
free(dx);
free(mesh);
free(smat);
free(xx);
free(qq);
free(qtemp);
free(k1);
free(k2);
free(k3);
free(k4);
free(minv_vec);
free(mmat);
free(dv);
free(mf);
free(ib);
free(fstar);
free(df);
return 0;
}
////////////////////////////////////////////////////////////////////////////
// parallelization needed functions: interface_flux & rhs
void interface_flux(double *qq, double *fstar, double *ib, double speed){
int ii, ee;
double qb[2*ne];
for(ii=0;ii<2*ne;ii++){
fstar[ii] = 0;
}
// fstar[0:ne-1] stores numerical flux of the left edge on each elements
// fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements
for(ii=0;ii<ne;ii++){
qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii
qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii
}
ii = 0; // calculating numerical flux (fstar) with periodic boundary condition
fstar[ii] = -((qb[ne+ne-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ne-1]-qb[ii])/2);
fstar[ne+ne-1] = -fstar[0];
for(ii=1;ii<ne;ii++){
fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2);
fstar[ne+ii-1] = -fstar[ii];
}
}
void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed){
int ii, jj, ee, idx;
double fstar[2*ne], term;
for(ii=0;ii<np*ne;ii++){
rr[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(ii=0;ii<np;ii++){
for(jj=0; jj<np; jj++){
rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj];
}
}
}
interface_flux(qq, fstar, ib, speed);
for(ee=0;ee<ne;ee++){
for(ii=0; ii<np; ii++){
for(jj=0;jj<2;jj++){
rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne];
}
}
}
}
answer.c
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#define IMIN(a, b) (((a) < (b)) ? (a) : (b))
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
int np = 30, tne = 512, ne;
/* int np = 4, tne = 5, ne; */
// np: number of nodes in each element
// tne: total number of elements
// ne: number of elements in each rank
void equal_load(int n1, int n2, int nprocs, int myid, int *istart, int *ifinish);
void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank);
void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank);
void lagrange(double xx, double *pts, double *ll);
void lagrange_deriv(double xx, double *dl, double *pts);
void legendre(int n, double *x, int xsize, double *output);
double legendre_scalar(int n, double x);
double dlegendre_scalar(int n, double x);
double ddlegendre_scalar(int n, double x);
double dddlegendre_scalar(int n, double x);
void dlegendre_roots(int polyorder, double *output);
void gausslobatto_quadrature(int polyorder, double *roots, double *weights);
double dot_product(double *v, double *u, int n);
void save_field(double *xx, double *qq, int elem_num, double *roots, int eres);
void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type);
double lxmin, lxmax;
int main(int argc, char **argv){
double tend = 0.5, speed = 1.;
// double tend = 1E-1, speed = 1.;
char *init_type="mixed2";
double *roots, *weights, *ll, *dl, xmin, xmax,
deltax, jac, xr, xl, cfl, dt, rtime, min_dx;
int ii, jj, kk, ee, idx, eres;
long nstep;
double *dx, *mesh;
double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv,
*mf, *ib, *df, *fstar, *gxx, *gqq, *buff1, *buff2;
int istart, ifinish, myrank, nprocs;
MPI_Request ireq1, ireq2, ireq3, ireq4;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
equal_load(0, tne-1, nprocs, myrank, &istart, &ifinish);
ne = ifinish - istart + 1;
// initialize
// fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne)
// c 1d index structure array = [ee*np*np + jj*np + ii]
roots = (double*)malloc(np* sizeof(double));
weights = (double*)malloc(np* sizeof(double));
ll = (double*)malloc(np* sizeof(double));
dl = (double*)malloc(np* sizeof(double));
dx = (double*)malloc(ne* sizeof(double));
mesh = (double*)malloc((ne+1)*sizeof(double));
smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np]
xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne]
df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np]
for (ii=0; ii<np; ++ii){
roots[ii] = 0;
weights[ii] = 0;
ll[ii] = 0;
dl[ii] = 0;
}
for (ii=0; ii<ne; ++ii){
dx[ii] = 0;
mesh[ii] = 0;
}
mesh[ne] = 0;
for (ii=0; ii<np*np; ++ii){
smat[ii] = 0;
}
for (ii=0; ii<ne*np; ++ii){
xx[ii] = 0;
qq[ii] = 0;
qtemp[ii] = 0;
k1[ii] = 0;
k2[ii] = 0;
k3[ii] = 0;
k4[ii] = 0;
minv_vec[ii] = 0;
}
for (ii=0; ii<ne*np*np; ++ii){
mmat[ii] = 0;
dv[ii] = 0;
}
for (ii=0; ii<np*2; ++ii){
mf[ii] = 0;
ib[ii] = 0;
}
for (ii=0; ii<ne*2; ++ii){
fstar[ii] = 0;
}
for (ii=0; ii<ne*2*np; ++ii){
df[ii] = 0;
}
// mesh setup
xmin = 0.;
xmax = 10.;
deltax = (xmax-xmin)/(double)tne;
lxmin = xmin + (istart)*deltax;
lxmax = xmin + (ifinish+1)*deltax;
mesh[ne] = lxmax;
for(ee=0;ee<ne;++ee){
mesh[ee] = lxmin+ee*deltax;
}
// gauss lobatto quadrature point, weight setup
gausslobatto_quadrature(np, roots, weights);
// coordinates and element size
min_dx = xmax - xmin; // initial guess
for(ee=0;ee<ne;ee++){
xl = mesh[ee];
xr = mesh[ee+1];
dx[ee] = xr-xl; // size of each element
if(dx[ee] < min_dx){
min_dx = dx[ee]; // finding minimum dx
}
for(ii=0;ii<np;ii++){
idx = ee*np+ii;
xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee];
}
}
// mass matrix
for(ii=0;ii<ne*np*np;ii++){
mmat[ii] = 0;
}
for(ee=0;ee<ne;ee++){
jac = fabs(dx[ee])/2;
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = ee*np*np+jj*np+ii;
// mass matrix mmat[ne][np][np] in 1d index representation
mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj];
}
}
}
}
// stiffness matrix
for(ii=0;ii<np*np;ii++){
smat[ii] = 0;
}
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
lagrange_deriv(roots[kk], dl, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = jj*np+ii;
// stiffness matrix smat[np][np] in 1d index representation
smat[idx] += weights[kk]*ll[jj]*dl[ii];
}
}
}
// face integration
for(ii=0;ii<np*2;ii++){
mf[ii] = 0;
}
lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration
lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration
// boundary interpolation
for(ii=0;ii<np*2;ii++){
ib[ii] = 0;
}
lagrange(-1,ib, roots); // element left edge interpolation
lagrange( 1,ib+np,roots); // element right edge interpolation
// divergence operators
for(ii=0;ii<ne*np*np;ii++){
dv[ii] = 0;
}
for(ii=0;ii<ne*np*2;ii++){
dv[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(jj=0;jj<np;jj++){
// it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0
// the inverse of mmat is just the inverse of the diagonal components
// here, we are extracting the inverse diagonal components only
minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj];
}
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii];
}
}
for(jj=0;jj<2;jj++){
for(ii=0;ii<np;ii++){
df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii];
}
}
}
// initialize qq field
initialize(qq, xx, xmax, xmin, init_type);
cfl = 1./(np*np);
dt = cfl * min_dx / fabs(speed);
rtime = 0.;
nstep = 0;
if(myrank == 0){
printf("Start Time Integration\n");
}
// Runge-Kutta 4th order Time integration loop
while(rtime < tend){
dt = fmin(dt, tend-rtime);
rhs(qq, k1, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k1[ii];
rhs(qtemp, k2, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k2[ii];
rhs(qtemp, k3, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+dt*k3[ii];
rhs(qtemp, k4, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]);
rtime += dt;
nstep += 1;
if(myrank == 0 && (nstep%10000 == 0))
printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100);
}
// timeloop ends here;
if(myrank != 0){
MPI_Isend(xx, np*ne, MPI_DOUBLE, 0, 10, MPI_COMM_WORLD, &ireq1);
MPI_Isend(qq, np*ne, MPI_DOUBLE, 0, 20, MPI_COMM_WORLD, &ireq2);
MPI_Isend(&istart, 1, MPI_INT, 0, 30, MPI_COMM_WORLD, &ireq3);
MPI_Isend(&ifinish, 1, MPI_INT, 0, 40, MPI_COMM_WORLD, &ireq4);
MPI_Wait(&ireq3, &status);
MPI_Wait(&ireq4, &status);
MPI_Wait(&ireq1, &status);
MPI_Wait(&ireq2, &status);
}
if(myrank == 0){
printf("Integration complete\n");
if(tne > 200){
eres = 2;
}
else if (tne > 60){
eres = 3;
}
else if (tne > 30){
eres = 6;
}
else {
eres = 10;
}
// final report
printf("-----------------------------------------------\n");
printf("code type : c parallel\n");
printf("Final time : %13.5e\n", rtime);
printf("CFL : %13.5e\n", cfl);
printf("DOF : %13d\n", tne*np);
printf("No. of Elem : %13d\n", tne);
printf("Order : %13d\n", np);
printf("eres : %13d\n", eres);
printf("time steps : %13ld\n", nstep);
printf("-----------------------------------------------\n");
gxx = (double*)malloc(tne*np*sizeof(double));
gqq = (double*)malloc(tne*np*sizeof(double));
buff1 = (double*)malloc((tne/nprocs+1)*np*sizeof(double));
buff2 = (double*)malloc((tne/nprocs+1)*np*sizeof(double));
for(ii=0;ii<ne*np;ii++){
gxx[ii] = xx[ii];
gqq[ii] = qq[ii];
}
kk = ne*np;
for(ii=1;ii<nprocs;ii++){
MPI_Irecv(&istart, 1, MPI_INT, ii, 30, MPI_COMM_WORLD, &ireq3);
MPI_Irecv(&ifinish, 1, MPI_INT, ii, 40, MPI_COMM_WORLD, &ireq4);
MPI_Wait(&ireq3, &status);
MPI_Wait(&ireq4, &status);
ne = ifinish-istart+1;
MPI_Irecv(buff1, np*ne, MPI_DOUBLE, ii, 10, MPI_COMM_WORLD, &ireq1);
MPI_Irecv(buff2, np*ne, MPI_DOUBLE, ii, 20, MPI_COMM_WORLD, &ireq2);
MPI_Wait(&ireq1, &status);
MPI_Wait(&ireq2, &status);
for(jj=0; jj<ne*np;jj++){
gxx[kk+jj] = buff1[jj];
gqq[kk+jj] = buff2[jj];
}
kk += ne*np;
}
save_field(gxx, gqq, tne, roots, eres);
free(gxx);
free(gqq);
free(buff1);
free(buff2);
}
free(roots);
free(weights);
free(ll);
free(dl);
free(dx);
free(mesh);
free(smat);
free(xx);
free(qq);
free(qtemp);
free(k1);
free(k2);
free(k3);
free(k4);
free(minv_vec);
free(mmat);
free(dv);
free(mf);
free(ib);
free(fstar);
free(df);
MPI_Finalize();
return 0;
}
////////////////////////////////////////////////////////////////////////////
// parallelization needed functions: interface_flux & rhs
void equal_load(int n1, int n2, int nprocs, int myid, int *istart, int *ifinish){
int iw1, iw2;
iw1 = (n2-n1+1)/nprocs;
iw2 = (n2-n1+1)%nprocs;
*istart = myid*iw1+n1+IMIN(myid,iw2);
*ifinish= *istart+iw1-1;
if(iw2>myid) *ifinish+=1;
if(n2 < *istart) *ifinish = *istart-1;
}
void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank){
int ii, ee;
double qb[2*ne], qb_start;
MPI_Status status;
MPI_Request ireq1, ireq2, ireq3, ireq4;
for(ii=0;ii<2*ne;ii++){
fstar[ii] = 0;
}
for(ii=0;ii<ne;ii++){
qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii
qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii
}
// fstar[0:ne-1] stores numerical flux of the left edge on each elements
// fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements
if(myrank == 0){
MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, myrank+1, 80, MPI_COMM_WORLD, &ireq1);
MPI_Irecv(&qb_start, 1, MPI_DOUBLE, nprocs-1, 80, MPI_COMM_WORLD, &ireq2);
}
else if (myrank == nprocs-1){
MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, 0, 80, MPI_COMM_WORLD, &ireq1);
MPI_Irecv(&qb_start, 1, MPI_DOUBLE, myrank-1, 80, MPI_COMM_WORLD, &ireq2);
}
else{
MPI_Isend(qb+2*ne-1, 1, MPI_DOUBLE, myrank+1, 80, MPI_COMM_WORLD, &ireq1);
MPI_Irecv(&qb_start, 1, MPI_DOUBLE, myrank-1, 80, MPI_COMM_WORLD, &ireq2);
}
MPI_Wait(&ireq1, &status);
MPI_Wait(&ireq2, &status);
ee = 0; // calculating numerical flux (fstar) with periodic boundary condition
fstar[ee] = -((qb_start+qb[ee])/2.*speed + fabs(speed)*(qb_start-qb[ee])/2.);
if(myrank == 0){
MPI_Isend(fstar, 1, MPI_DOUBLE, nprocs-1, 90, MPI_COMM_WORLD, &ireq3);
}
else{
MPI_Isend(fstar, 1, MPI_DOUBLE, myrank-1, 90, MPI_COMM_WORLD, &ireq3);
}
if(myrank == nprocs-1){
MPI_Irecv(fstar+2*ne-1, 1, MPI_DOUBLE, 0, 90, MPI_COMM_WORLD, &ireq4);
}
else{
MPI_Irecv(fstar+2*ne-1, 1, MPI_DOUBLE, myrank+1, 90, MPI_COMM_WORLD, &ireq4);
}
for(ee=1;ee<ne;ee++){
fstar[ee] = -((qb[ne+ee-1]+qb[ee])/2.*speed + fabs(speed)*(qb[ne+ee-1]-qb[ee])/2.);
fstar[ne+ee-1] = -fstar[ee];
}
MPI_Wait(&ireq3, &status);
MPI_Wait(&ireq4, &status);
fstar[2*ne-1] *= -1.;
}
void rhs(double *qq, double *rr, double *dv, double *df, double *ib,
double speed, int nprocs, int myrank){
int ii, jj, ee;
double fstar[2*ne];
for(ii=0;ii<np*ne;ii++){
rr[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(ii=0;ii<np;ii++){
for(jj=0; jj<np; jj++){
rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj];
}
}
}
interface_flux(qq, fstar, ib, speed, nprocs, myrank);
for(ee=0;ee<ne;ee++){
for(ii=0; ii<np; ii++){
for(jj=0;jj<2;jj++){
rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne];
}
}
}
}
myparallel.c
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
/* int np = 30, ne = 512; */
int np = 5, ne, tne = 10;
// np: number of nodes in each element
// ne: number of elements
// caution: np 20, ne 256 will take approx 10 minutes on a serial job
void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank);
void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank);
void lagrange(double xx, double *pts, double *ll);
void lagrange_deriv(double xx, double *dl, double *pts);
void legendre(int n, double *x, int xsize, double *output);
double legendre_scalar(int n, double x);
double dlegendre_scalar(int n, double x);
double ddlegendre_scalar(int n, double x);
double dddlegendre_scalar(int n, double x);
void dlegendre_roots(int polyorder, double *output);
void gausslobatto_quadrature(int polyorder, double *roots, double *weights);
double dot_product(double *v, double *u, int n);
void save_field(double *xx, double *qq, int elem_num, double *roots, int eres);
void initialize(double *qq, double *xx, double xmax, double xmin, char *init_type);
int main(int argc, char **argv){
double tend = 1E2, speed = 1.;
// double tend = 1E-1, speed = 1.;
char *init_type="mixed2";
double *roots, *weights, *ll, *dl, xmin, xmax,
deltax, jac, xr, xl, cfl, dt, rtime, min_dx;
int ii, jj, kk, ee, idx, eres;
long nstep;
double *dx, *mesh;
double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv,
*mf, *ib, *df, *fstar;
// newly added variables for parallelization
int nprocs, myrank, q, r, esta;
int *ircnt, *idisp;
double *xxtot, *qqtot;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
q = tne/nprocs;
r = tne%nprocs;
ne = myrank<r?(q+1):q;
esta = myrank<r?(q+1)*myrank:(q*myrank+r);
// initialize
// fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne)
// c 1d index structure array = [ee*np*np + jj*np + ii]
roots = (double*)malloc(np* sizeof(double));
weights = (double*)malloc(np* sizeof(double));
ll = (double*)malloc(np* sizeof(double));
dl = (double*)malloc(np* sizeof(double));
dx = (double*)malloc(ne* sizeof(double));
mesh = (double*)malloc((ne+1)*sizeof(double));
smat = (double*)malloc(np*np*sizeof(double)); // [jj np, ii np]
xx = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qq = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
qtemp = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k1 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k2 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k3 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
k4 = (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
minv_vec= (double*)malloc(ne*np*sizeof(double)); // [ee ne, ii np]
mmat = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
dv = (double*)malloc(ne*np*np*sizeof(double)); // [ee ne, jj np, ii np]
mf = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
ib = (double*)malloc(2*np*sizeof(double)); // [jj 2, ii np]
fstar = (double*)malloc(2*ne*sizeof(double)); // [jj 2, ii ne]
df = (double*)malloc(ne*2*np*sizeof(double)); // [ee ne, jj 2, ii np]
if(!myrank) {
xxtot = (double*)malloc(tne*np*sizeof(double)); // [ee ne, ii np]
qqtot = (double*)malloc(tne*np*sizeof(double)); // [ee ne, ii np]
ircnt = (int*)malloc(nprocs*sizeof(int));
idisp = (int*)malloc(nprocs*sizeof(int));
idisp[0] = 0;
for(ii=0; ii<nprocs; ii++){
ircnt[ii] = (myrank<r?(q+1):q)*np;
if(ii>0) idisp[ii] = idisp[ii-1] + ircnt[ii-1];
printf("ircnt[%d] = %d, idisp[%d] = %d\n", ii, ircnt[ii], ii, idisp[ii]);
}
}
for (ii=0; ii<np; ++ii){
roots[ii] = 0;
weights[ii] = 0;
ll[ii] = 0;
dl[ii] = 0;
}
for (ii=0; ii<ne; ++ii){
dx[ii] = 0;
mesh[ii] = 0;
}
mesh[ne] = 0;
for (ii=0; ii<np*np; ++ii){
smat[ii] = 0;
}
for (ii=0; ii<ne*np; ++ii){
xx[ii] = 0;
qq[ii] = 0;
k1[ii] = 0;
k2[ii] = 0;
k3[ii] = 0;
k4[ii] = 0;
minv_vec[ii] = 0;
}
for (ii=0; ii<ne*np*np; ++ii){
mmat[ii] = 0;
dv[ii] = 0;
}
for (ii=0; ii<np*2; ++ii){
mf[ii] = 0;
ib[ii] = 0;
}
for (ii=0; ii<ne*2; ++ii){
fstar[ii] = 0;
}
for (ii=0; ii<ne*2*np; ++ii){
df[ii] = 0;
}
// mesh setup
xmin = 0.;
xmax = 10.;
deltax = (xmax-xmin)/(double)tne;
//mesh[ne] = xmax;
for(ee=0;ee<ne+1;++ee){
mesh[ee] = xmin+(ee+esta)*deltax;
}
if(myrank == nprocs-1) mesh[ne] = xmax;
// gauss lobatto quadrature point, weight setup
gausslobatto_quadrature(np, roots, weights);
// coordinates and element size
min_dx = xmax - xmin; // initial guess
for(ee=0;ee<ne;ee++){
xl = mesh[ee];
xr = mesh[ee+1];
dx[ee] = xr-xl; // size of each element
if(dx[ee] < min_dx){
min_dx = dx[ee]; // finding minimum dx
}
for(ii=0;ii<np;ii++){
idx = ee*np+ii;
xx[idx] = xl + 0.5*(1+roots[ii])*dx[ee];
}
}
// mass matrix
for(ii=0;ii<ne*np*np;ii++){
mmat[ii] = 0;
}
for(ee=0;ee<ne;ee++){
jac = fabs(dx[ee])/2;
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = ee*np*np+jj*np+ii;
// mass matrix mmat[ne][np][np] in 1d index representation
mmat[idx] += jac*weights[kk]*ll[ii]*ll[jj];
}
}
}
}
// stiffness matrix
for(ii=0;ii<np*np;ii++){
smat[ii] = 0;
}
for(kk=0;kk<np;kk++){
lagrange(roots[kk], ll, roots);
lagrange_deriv(roots[kk], dl, roots);
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
idx = jj*np+ii;
// stiffness matrix smat[np][np] in 1d index representation
smat[idx] += weights[kk]*ll[jj]*dl[ii];
}
}
}
// face integration
for(ii=0;ii<np*2;ii++){
mf[ii] = 0;
}
lagrange(-1,mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration
lagrange( 1,mf+np,roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration
// boundary interpolation
for(ii=0;ii<np*2;ii++){
ib[ii] = 0;
}
lagrange(-1,ib, roots); // element left edge interpolation
lagrange( 1,ib+np,roots); // element right edge interpolation
// divergence operators
for(ii=0;ii<ne*np*np;ii++){
dv[ii] = 0;
}
for(ii=0;ii<ne*np*2;ii++){
dv[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(jj=0;jj<np;jj++){
// it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0
// the inverse of mmat is just the inverse of the diagonal components
// here, we are extracting the inverse diagonal components only
minv_vec[ee*np+jj] = 1./mmat[ee*np*np+jj*np+jj];
}
for(jj=0;jj<np;jj++){
for(ii=0;ii<np;ii++){
dv[ee*np*np+jj*np+ii] = minv_vec[ee*np+ii]*smat[jj*np+ii];
}
}
for(jj=0;jj<2;jj++){
for(ii=0;ii<np;ii++){
df[ee*np*2+jj*np+ii] = minv_vec[ee*np+ii]*mf[jj*np+ii];
}
}
}
// initialize qq field
initialize(qq, xx, xmax, xmin, init_type);
cfl = 1./(np*np);
dt = cfl * min_dx / fabs(speed);
rtime = 0.;
nstep = 0;
if(!myrank)
printf("Start Time Integration\n");
// Runge-Kutta 4th order Time integration loop
while(rtime < tend){
dt = fmin(dt, tend-rtime);
rhs(qq, k1, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k1[ii];
rhs(qtemp, k2, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+0.5*dt*k2[ii];
rhs(qtemp, k3, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qtemp[ii] = qq[ii]+dt*k3[ii];
rhs(qtemp, k4, dv, df, ib, speed, nprocs, myrank);
for(ii=0;ii<ne*np;ii++)
qq[ii] += 1./6.*dt*(k1[ii]+2*k2[ii]+2*k3[ii]+k4[ii]);
rtime += dt;
nstep += 1;
if((myrank==0) && nstep%10000 == 0)
printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime/tend*100);
}
printf("end..?\n");
// timeloop ends here;
//printf("ne*np = %d, ircnt[0] = %d, idisp[0] = %d\n", ne*np, ircnt[0], idisp[0]);
MPI_Gatherv(xx, ne*np, MPI_DOUBLE, xxtot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gatherv(qq, ne*np, MPI_DOUBLE, qqtot, ircnt, idisp, MPI_DOUBLE, 0, MPI_COMM_WORLD);
printf("end..?\n");
if(!myrank){
printf("Integration complete\n");
if(ne > 200){
eres = 2;
}
else if (ne > 60){
eres = 3;
}
else if (ne > 30){
eres = 6;
}
else {
eres = 10;
}
// final report
printf("-----------------------------------------------\n");
printf("code type : c parallel\n");
printf("Final time : %13.5e\n", rtime);
printf("CFL : %13.5e\n", cfl);
printf("DOF : %13d\n", tne*np);
printf("No. of Elem : %13d\n", tne);
printf("Order : %13d\n", np);
printf("eres : %13d\n", eres);
printf("time steps : %13ld\n", nstep);
printf("-----------------------------------------------\n");
save_field(xxtot, qqtot, tne, roots, eres);
free(idisp);
free(ircnt);
free(xxtot);
free(qqtot);
}
free(roots);
free(weights);
free(ll);
free(dl);
free(dx);
free(mesh);
free(smat);
free(xx);
free(qq);
free(qtemp);
free(k1);
free(k2);
free(k3);
free(k4);
free(minv_vec);
free(mmat);
free(dv);
free(mf);
free(ib);
free(fstar);
free(df);
MPI_Finalize();
return 0;
}
////////////////////////////////////////////////////////////////////////////
// parallelization needed functions: interface_flux & rhs
void interface_flux(double *qq, double *fstar, double *ib, double speed, int nprocs, int myrank){
int ii, ee;
double qb[2*ne];
for(ii=0;ii<2*ne;ii++){
fstar[ii] = 0;
}
// fstar[0:ne-1] stores numerical flux of the left edge on each elements
// fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements
for(ii=0;ii<ne;ii++){
qb[ii] = dot_product(ib, qq+ii*np, np); // left edge interpolated value of qq at element ii
qb[ne+ii] = dot_product(ib+np, qq+ii*np, np); // right edge interpolated value of qq at element ii
}
ii = 0; // calculating numerical flux (fstar) with periodic boundary condition
double qbf, fsta;
double aspeed = fabs(speed);
int prev = (myrank - 1 + nprocs)%nprocs;
int next = (myrank + 1)%nprocs;
MPI_Request req[4];
MPI_Status stat[4];
MPI_Isend(&qb[ne+ne-1], 1, MPI_DOUBLE, next, 123, MPI_COMM_WORLD, &req[0]);
MPI_Irecv(&qbf, 1, MPI_DOUBLE, prev, 123, MPI_COMM_WORLD, &req[1]);
MPI_Waitall(2, req, stat);
fstar[0] = -((qbf+qb[0])/2*speed + fabs(speed)*(qbf-qb[0])/2);
MPI_Isend(&fstar[0], 1, MPI_DOUBLE, prev, 321, MPI_COMM_WORLD, &req[2]);
MPI_Irecv(&fsta, 1, MPI_DOUBLE, next, 321, MPI_COMM_WORLD, &req[3]);
MPI_Waitall(2, req+2, stat+2);
fstar[ne+ne-1] = -fsta;
for(ii=1;ii<ne;ii++){
fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2);
fstar[ne+ii-1] = -fstar[ii];
}
/*
fstar[0] = -((qb[ne+ne-1]+qb[0])/2*speed + fabs(speed)*(qb[ne+ne-1]-qb[0])/2);
fstar[ne+ne-1] = -fstar[0];
for(ii=1;ii<ne;ii++){
fstar[ii] = -((qb[ne+ii-1]+qb[ii])/2*speed + fabs(speed)*(qb[ne+ii-1]-qb[ii])/2);
fstar[ne+ii-1] = -fstar[ii];
}
*/
}
void rhs(double *qq, double *rr, double *dv, double *df, double *ib, double speed, int nprocs, int myrank){
int ii, jj, ee, idx;
double fstar[2*ne], term;
for(ii=0;ii<np*ne;ii++){
rr[ii] = 0;
}
for(ee=0;ee<ne;ee++){
for(ii=0;ii<np;ii++){
for(jj=0; jj<np; jj++){
rr[ee*np+ii] += dv[ee*np*np+jj*np+ii]*speed*qq[ee*np+jj];
}
}
}
interface_flux(qq, fstar, ib, speed, nprocs, myrank);
for(ee=0;ee<ne;ee++){
for(ii=0; ii<np; ii++){
for(jj=0;jj<2;jj++){
rr[ee*np+ii] -= df[ee*(np*2)+jj*np+ii]*fstar[ee+jj*ne];
}
}
}
}'Parallel Programming' 카테고리의 다른 글
| MPI Function Dictionary (1) | 2016.10.04 |
|---|---|
| KSC 2015 3번 문제 및 풀이 (0) | 2016.10.01 |
| KSC 2015 1번 문제 및 답안 (0) | 2016.10.01 |
| KSC 2014 3번 문제 및 답안 (0) | 2016.10.01 |
| KSC 2014 2번 문제 및 답안 (0) | 2016.10.01 |
댓글