diff --git a/ChangeLog b/ChangeLog
index f092b693..40aedd89 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,15 @@ -changes
+
+--------------
+Mar 30, 2021
+Name: Qimen Xu, Abhraj Sharma
+Changes: (parallelization.c, tools.c, stress.c)
+1. Add a new routine for choosing default parallelization parameters.
+2. Fix a bug in the kinetic stress for the spin-polarized case without k-points on a single processor.
+
+
 --------------
 Mar 22, 2021
 Name: Qimen Xu
diff --git a/src/include/parallelization.h b/src/include/parallelization.h
index 6c6204df..74a616ee 100644
--- a/src/include/parallelization.h
+++ b/src/include/parallelization.h
@@ -109,6 +109,132 @@ void SPARC_Dims_create(int nproc, int ndims, int *gridsizes, int minsize, int *d
 void ScaLAPACK_Dims_2D_BLCYC(int nproc, int *gridsizes, int *dims);
 
+/**
+ * @brief Calculate a division of processors in a 2D Cartesian grid.
+ *
+ * For a given number of processors, choose np1 and np2 so that the maximum
+ * work assigned to each process is the smallest, i.e., choose x and y with
+ * x * y <= np, s.t., ceil(N1/x) * ceil(N2/y) is minimized.
+ *
+ * Here we assume parallelization over the two properties is equally
+ * efficient, so there is no preference to use more or fewer processes
+ * in either dimension.
+ *
+ * The objective function and the constraint are both quadratic, which
+ * makes this a very hard problem in general.
+ *
+ * In this function, we find a reasonably good solution using the following
+ * strategy: note that if x0 | N1 and y0 | N2 ("|" means divides), then
+ * (x0,y0) is a solution to the problem with a weaker constraint x*y <= x0*y0.
+ * We can then keep reducing the constraint from np down until we find a
+ * solution for the subproblem, and choose the best of all searched combinations.
+ *
+ * @param N1 Number of properties to parallelize on the grid in the 1st dimension.
+ * @param N2 Number of properties to parallelize on the grid in the 2nd dimension.
+ * @param np Number of processors available.
+ *
+ **/
+void dims_divide_2d(const int N1, const int N2, const int np, int *np1, int *np2);
+
+
+/**
+ * @brief For the given parallelization params, find the maximum workload
+ *        assigned to any process. We take the ceiling since at least one
+ *        process will be assigned that many columns and will therefore
+ *        dominate the cost.
+ *
+ **/
+int workload_assigned(const int Nk, const int Ns,
+                      const int np1, const int np2);
+
+
+/**
+ * @brief Calculate the efficiency for a given set of parameters.
+ *        This is derived based on the assumption that there are
+ *        infinite grid points (Nd) so that np cannot be larger
+ *        than Nk*Ns*Nd in any case. In practice, Nd ~ O(1e6), so
+ *        this assumption is quite reasonable.
+ *        1. Ideal load: Nk*Ns*Nd/np
+ *        2. Actual load: ceil(Nk/np1)*ceil(Ns/np2)*ceil(Nd/np3)
+ *           ~= ceil(Nk/np1)*ceil(Ns/np2)* (Nd/np3) #since Nd is large
+ *        3. efficiency = Ideal load / Actual load
+ *           ~= Nk*Ns*np3/ (ceil(Nk/np1)*ceil(Ns/np2)*np)
+ * @param Nk  Number of kpoints (after symmetry reduction).
+ * @param Ns  Number of states/bands.
+ * @param np  Number of processors available.
+ * @param np1 Number of kpoint groups for kpoint parallelization.
+ * @param np2 Number of band groups for band parallelization.
+ * @param np3 Number of domain groups for domain parallelization.
+ *
+ **/
+double work_load_efficiency(
+    const int Nk, const int Ns, const int np,
+    const int np1, const int np2, const int np3
+);
+
+
+
+/**
+ * @brief Calculate a division of processors for kpoint, band, and domain (KBD)
+ *        parallelization.
+ *
+ * For a given number of processors, choose npkpt, npband, and npdomain so
+ * that the maximum work assigned to each process is the smallest, i.e.,
+ * choose x, y, and z with x * y * z <= np, s.t.,
+ * ceil(Nk/x) * ceil(Ns/y) * ceil(Nd/z) is minimized.
+ *
+ * Here we assume parallelization over these properties is equally
+ * efficient, so there is no preference to use more or fewer processes
+ * in any dimension.
+ *
+ * The objective function and the constraint are both nonlinear, which
+ * makes this a very hard problem in general.
+ *
+ * In this function, we find a reasonably good solution using the following
+ * strategy: since parallelization over kpoint and band is very efficient,
+ * we parallelize over kpoint and band first, then consider
+ * parallelization over domain. There are two cases: np <= or > Nk * Ns.
+ * Case 1: np <= Nk * Ns. We fix npdomain = 1:10 and find the best
+ * parameters for each given npdomain; we always prefer fewer npdomain
+ * if possible.
+ * Case 2: np > Nk * Ns. We try Nk*Ns ./ [1:10] processes for kpoint and
+ * band parallelization, and pick the best combination. Again we prefer
+ * fewer npdomain (more for K & B) if the workloads are similar.
+ *
+ * @param Nk Number of kpoints (after symmetry reduction).
+ * @param Ns Number of states.
+ * @param gridsizes Number of grid points in all three directions.
+ * @param np Number of processors available.
+ * @param np1 (OUT) Number of kpoint groups.
+ * @param np2 (OUT) Number of band groups.
+ * @param np3 (OUT) Number of domain groups.
+ **/
+void dims_divide_kbd(
+    const int Nk, const int Ns, const int *gridsizes,
+    const int np, int *np1, int *np2, int *np3);
+
+
+/**
+ * @brief Calculate a division of processors for spin, kpoint, band, and
+ *        domain (SKBD)
+ *        parallelization.
+ *
+ * @param Nspin Number of spin channels, 1 or 2.
+ * @param Nk Number of kpoints (after symmetry reduction).
+ * @param Ns Number of states.
+ * @param gridsizes Number of grid points in all three directions.
+ * @param np Number of processors available.
+ * @param nps (OUT) Number of spin groups.
+ * @param npk (OUT) Number of kpoint groups.
+ * @param npb (OUT) Number of band groups.
+ * @param npd (OUT) Number of domain groups.
+ **/
+void dims_divide_skbd(
+    const int Nspin, const int Nk, const int Ns,
+    const int *gridsizes, const int np,
+    int *nps, int *npk, int *npb, int *npd);
+
+
 /**
  * @brief Transfer data from one 3-d Domain Decomposition to another.
  *
diff --git a/src/include/tools.h b/src/include/tools.h
index c1c2073e..6ae0c38d 100644
--- a/src/include/tools.h
+++ b/src/include/tools.h
@@ -70,7 +70,10 @@ typedef struct {
 void xferFactors( Factors *fctrs, int *flist, int flix );
 Factors *factor( int num, Factors *fctrs);
+void sorted_factor( int num, Factors *fctrs);
 
+// Equivalent to ceil(x / (double)y), where x, y are positive integers
+int ceil_div(const unsigned int x, const unsigned int y);
 
 /**
  * @brief Calculates derivatives of a tabulated function required for spline interpolation.
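For illustration, a minimal self-contained sketch (not part of the patch) of the workload-efficiency estimate documented in the headers above. The ceil_div helper mirrors the declaration added to tools.h; the main() driver and the sample sizes are hypothetical.

    #include <stdio.h>

    /* equivalent to ceil(x / (double)y) for positive integers, as in tools.h */
    static int ceil_div(unsigned int x, unsigned int y) { return (x + y - 1) / y; }

    /* efficiency ~= Nk*Ns*np3 / (ceil(Nk/np1)*ceil(Ns/np2)*np), valid when Nd is large */
    static double work_load_efficiency(int Nk, int Ns, int np, int np1, int np2, int np3)
    {
        int ncols = ceil_div(Nk, np1) * ceil_div(Ns, np2); /* max columns on any process */
        return Nk * Ns * np3 / (double)(ncols * np);
    }

    int main(void)
    {
        /* hypothetical case: 4 k-points, 10 bands, 24 processes split as 4 x 2 x 3 */
        printf("efficiency = %.3f\n", work_load_efficiency(4, 10, 24, 4, 2, 3));
        return 0;
    }

An exact split (every divisor exact, as here) gives efficiency 1.000; any remainder in the ceilings pushes the value below 1, which is what the dims_divide_* routines search against.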
diff --git a/src/initialization.c b/src/initialization.c
index f0077bcf..a311532c 100644
--- a/src/initialization.c
+++ b/src/initialization.c
@@ -2264,7 +2264,7 @@ void write_output_init(SPARC_OBJ *pSPARC) {
     }
 
     fprintf(output_fp,"***************************************************************************\n");
-    fprintf(output_fp,"* SPARC (version Mar 22, 2021) *\n");
+    fprintf(output_fp,"* SPARC (version Mar 30, 2021) *\n");
     fprintf(output_fp,"* Copyright (c) 2020 Material Physics & Mechanics Group, Georgia Tech *\n");
     fprintf(output_fp,"* Distributed under GNU General Public License 3 (GPL) *\n");
     fprintf(output_fp,"* Start time: %s *\n",c_time_str);
diff --git a/src/parallelization.c b/src/parallelization.c
index 06741956..97b118b9 100644
--- a/src/parallelization.c
+++ b/src/parallelization.c
@@ -67,6 +67,23 @@ void Setup_Comms(SPARC_OBJ *pSPARC) {
     if (rank == 0) printf("Set up communicators.\n");
 #endif
 
+    // in case the user doesn't set any parallelization parameters, search
+    // for a good combination in advance
+    gridsizes[0] = pSPARC->Nx; gridsizes[1] = pSPARC->Ny; gridsizes[2] = pSPARC->Nz;
+    npNd = pSPARC->npNdx * pSPARC->npNdy * pSPARC->npNdz;
+    minsize = pSPARC->order/2;
+    if (pSPARC->npspin == 0 && pSPARC->npkpt == 0 &&
+        pSPARC->npband == 0 && npNd == 0)
+    {
+        dims_divide_skbd(pSPARC->Nspin, pSPARC->Nkpts_sym, pSPARC->Nstates,
+            gridsizes, nproc, &pSPARC->npspin, &pSPARC->npkpt, &pSPARC->npband, &npNd);
+        SPARC_Dims_create(npNd, 3, gridsizes, minsize, dims, &ierr);
+        pSPARC->npNdx = dims[0];
+        pSPARC->npNdy = dims[1];
+        pSPARC->npNdz = dims[2];
+    }
+
+
     //------------------------------------------------//
     //                set up spincomm                 //
     //------------------------------------------------//
@@ -1476,6 +1493,339 @@ void ScaLAPACK_Dims_2D_BLCYC(int nproc, int *gridsizes, int *dims)
 
 
+
+/**
+ * @brief For the given parallelization params, find the maximum workload
+ *        assigned to any process. We take the ceiling since at least one
+ *        process will be assigned that many columns and will therefore
+ *        dominate the cost.
+ *
+ **/
+int workload_assigned(const int Nk, const int Ns,
+                      const int np1, const int np2)
+{
+    return ceil_div(Nk,np1) * ceil_div(Ns,np2);
+}
+
+
+/**
+ * @brief Calculate the efficiency for a given set of parameters.
+ *        This is derived based on the assumption that there are
+ *        infinite grid points (Nd) so that np cannot be larger
+ *        than Nk*Ns*Nd in any case. In practice, Nd ~ O(1e6), so
+ *        this assumption is quite reasonable.
+ *        1. Ideal load: Nk*Ns*Nd/np
+ *        2. Actual load: ceil(Nk/np1)*ceil(Ns/np2)*ceil(Nd/np3)
+ *           ~= ceil(Nk/np1)*ceil(Ns/np2)* (Nd/np3) #since Nd is large
+ *        3. efficiency = Ideal load / Actual load
+ *           ~= Nk*Ns*np3/ (ceil(Nk/np1)*ceil(Ns/np2)*np)
+ * @param Nk  Number of kpoints (after symmetry reduction).
+ * @param Ns  Number of states/bands.
+ * @param np  Number of processors available.
+ * @param np1 Number of kpoint groups for kpoint parallelization.
+ * @param np2 Number of band groups for band parallelization.
+ * @param np3 Number of domain groups for domain parallelization.
+ *
+ **/
+double work_load_efficiency(
+    const int Nk, const int Ns, const int np,
+    const int np1, const int np2, const int np3
+)
+{
+    int ncols = workload_assigned(Nk, Ns, np1, np2);
+    return Nk * Ns * np3 / (double)(ncols * np);
+}
+
+
+/**
+ * @brief Calculate a division of processors in a 2D Cartesian grid.
+ *
+ * For a given number of processors, choose np1 and np2 so that the maximum
+ * work assigned to each process is the smallest, i.e., choose x and y with
+ * x * y <= np, s.t., ceil(N1/x) * ceil(N2/y) is minimized.
+ *
+ * Here we assume parallelization over the two properties is equally
+ * efficient, so there is no preference to use more or fewer processes
+ * in either dimension.
+ *
+ * The objective function and the constraint are both quadratic, which
+ * makes this a very hard problem in general.
+ *
+ * In this function, we find a reasonably good solution using the following
+ * strategy: note that if x0 | N1 and y0 | N2 ("|" means divides), then
+ * (x0,y0) is a solution to the problem with a weaker constraint x*y <= x0*y0.
+ * We can then keep reducing the constraint from np down until we find a
+ * solution for the subproblem, and choose the best of all searched combinations.
+ *
+ * @param N1 Number of properties to parallelize on the grid in the 1st dimension.
+ * @param N2 Number of properties to parallelize on the grid in the 2nd dimension.
+ * @param np Number of processors available.
+ *
+ **/
+void dims_divide_2d(const int N1, const int N2, const int np, int *np1, int *np2)
+{
+    int search_count = 0; // guards against an excessively long search
+
+    // initialize with the naive way, parallelize as much as possible for N1 first
+    int cur1 = min(np, N1);
+    int cur2 = min(np/cur1, N2);
+    int best = workload_assigned(N1,N2,cur1,cur2);
+
+    int np_up = np > N1*N2 ? N1*N2 : np; // upper bound of np that can be used
+    for (int p = np_up; p > 0; p--) {
+        // find factors of p
+        Factors facs = {NULL, 0};
+        sorted_factor(p, &facs);
+        int nfacs = facs.count; // total number of factors
+        for (int i = 0; i < nfacs; i++) {
+            int fac1 = facs.list[i]; // small to large
+            int fac2 = p / fac1;     // large to small
+            // use the larger factor for the first dim from the beginning
+            int load = workload_assigned(N1,N2,fac2,fac1);
+            if (load < best) {
+                best = load;
+                cur1 = fac2;
+                cur2 = fac1;
+            }
+            if ((N1 % fac2 == 0 && N2 % fac1 == 0) || ++search_count > 1e5) {
+                *np1 = cur1;
+                *np2 = cur2;
+                free(facs.list);
+                return;
+            }
+        }
+        free(facs.list);
+    }
+}
+
+
+
+/**
+ * @brief Calculate a division of processors for kpoint, band, and domain (KBD)
+ *        parallelization.
+ *
+ * For a given number of processors, choose npkpt, npband, and npdomain so
+ * that the maximum work assigned to each process is the smallest, i.e.,
+ * choose x, y, and z with x * y * z <= np, s.t.,
+ * ceil(Nk/x) * ceil(Ns/y) * ceil(Nd/z) is minimized.
+ *
+ * Here we assume parallelization over these properties is equally
+ * efficient, so there is no preference to use more or fewer processes
+ * in any dimension.
+ *
+ * The objective function and the constraint are both nonlinear, which
+ * makes this a very hard problem in general.
+ *
+ * In this function, we find a reasonably good solution using the following
+ * strategy: since parallelization over kpoint and band is very efficient,
+ * we parallelize over kpoint and band first, then consider
+ * parallelization over domain. There are two cases: np <= or > Nk * Ns.
+ * Case 1: np <= Nk * Ns. We fix npdomain = 1:10 and find the best
+ * parameters for each given npdomain; we always prefer fewer npdomain
+ * if possible.
+ * Case 2: np > Nk * Ns. We try Nk*Ns ./ [1:10] processes for kpoint and
+ * band parallelization, and pick the best combination. Again we prefer
+ * fewer npdomain (more for K & B) if the workloads are similar.
+ *
+ * @param Nk Number of kpoints (after symmetry reduction).
+ * @param Ns Number of states.
+ * @param gridsizes Number of grid points in all three directions.
+ * @param np Number of processors available.
+ * @param np1 (OUT) Number of kpoint groups.
+ * @param np2 (OUT) Number of band groups.
+ * @param np3 (OUT) Number of domain groups.
+ **/
+void dims_divide_kbd(
+    const int Nk, const int Ns, const int *gridsizes,
+    const int np, int *np1, int *np2, int *np3)
+{
+#define LEN_NPDM 8
+
+    // SPARC_Dims_create(nproc, ndims, gridsizes, minsize, int *dims, int *ierr);
+    int cur1 = min(np,Nk);
+    int cur2 = min(np/cur1,Ns);
+    int cur3 = np / (cur1 * cur2);
+    double best_weight = work_load_efficiency(Nk,Ns,np,cur1,cur2,cur3);
+    if (best_weight > 0.97) { // prefer kpoint to band
+        *np1 = cur1; *np2 = cur2; *np3 = cur3;
+        return;
+    }
+
+    int npkpt, npband, npdm;
+    double weight;
+
+    int npdm_list[LEN_NPDM];
+    int npdm_count = LEN_NPDM;
+    if (np > Nk * Ns) {
+        int npdm_base = np / (Nk * Ns);
+        int count = 0;
+        for (int i = 1; i <= LEN_NPDM; i++) {
+            if (npdm_base * i <= np) {
+                npdm_list[count] = npdm_base * i;
+                count++;
+            }
+        }
+        npdm_count = count;
+    } else {
+        int count = 0;
+        for (int i = 1; i <= LEN_NPDM; i++) {
+            npdm_list[count] = i;
+            count++;
+        }
+        npdm_count = count;
+    }
+
+    for (int i = 0; i < npdm_count; i++) {
+        // # TODO: check domain parallelization first
+        // # npNx,npNy,npNz = domain_paral(Nx,Ny,Nz,npdomain)
+        // # if npNx*npNy*npNz != npdomain:
+        // #     continue
+        npdm = npdm_list[i];
+        int np_2d = np / npdm;
+        if (np_2d < 1) continue;
+
+        dims_divide_2d(Nk, Ns, np_2d, &npkpt, &npband);
+        npdm = np / (npkpt * npband);
+        weight = work_load_efficiency(Nk,Ns,np,npkpt,npband,npdm);
+        if (weight > best_weight) {
+            cur1 = npkpt;
+            cur2 = npband;
+            cur3 = npdm;
+            best_weight = weight;
+        }
+        if (weight > 0.95) break;
+    }
+
+    *np1 = cur1;
+    *np2 = cur2;
+    *np3 = cur3;
+}
+
+
+// piecewise-linear model of band-parallelization efficiency vs. number of band groups
+double skbd_band_efficiency(const int x) {
+    double eff = 1.00;
+    double x_table[5] = {1, 8, 27, 125, 512};
+    double y_table[5] = {1.00, 0.95, 0.75, 0.55, 0.45};
+    int len = 5;
+    for (int i = 0; i < len - 1; i++) {
+        if (x >= x_table[i] && x <= x_table[i+1]) {
+            double slope = (y_table[i+1]-y_table[i])/(x_table[i+1]-x_table[i]);
+            eff = y_table[i] + (x - x_table[i]) * slope;
+        }
+    }
+
+    if (x > x_table[len-1]) eff = 0.44;
+    return eff;
+}
+
+
+// piecewise-linear model of domain-parallelization efficiency vs. number of domain groups
+double skbd_domain_efficiency(const int x) {
+    double eff = 1.00;
+    double x_table[7] = {1, 2, 8, 20, 27, 120, 200};
+    double y_table[7] = {1.00, 0.80, 0.75, 0.65, 0.60, 0.40, 0.30};
+    int len = 7;
+    for (int i = 0; i < len - 1; i++) {
+        if (x >= x_table[i] && x <= x_table[i+1]) {
+            double slope = (y_table[i+1]-y_table[i])/(x_table[i+1]-x_table[i]);
+            eff = y_table[i] + (x - x_table[i]) * slope;
+        }
+    }
+
+    if (x > x_table[len-1]) eff = 0.29;
+    return eff;
+}
+
+
+// workload efficiency weighted by the band and domain efficiency models above
+double skbd_weighted_efficiency(
+    const int Nspin, const int Nk, const int Ns,
+    const int *gridsizes, const int np,
+    const int nps, const int npk, const int npb, const int npd)
+{
+    int ncols = workload_assigned(Nspin*Nk, Ns, nps*npk, npb);
+    double eff = Nspin * Nk * Ns * npd / (double)(ncols * np);
+    // apply weight for band
+    eff *= skbd_band_efficiency(npb);
+    // apply weight for domain
+    eff *= skbd_domain_efficiency(npd);
+    return eff;
+}
+
+
+/**
+ * @brief Calculate a division of processors for spin, kpoint, band, and
+ *        domain (SKBD)
+ *        parallelization.
+ *
+ * @param Nspin Number of spin channels, 1 or 2.
+ * @param Nk Number of kpoints (after symmetry reduction).
+ * @param Ns Number of states.
+ * @param gridsizes Number of grid points in all three directions.
+ * @param np Number of processors available.
+ * @param nps (OUT) Number of spin groups.
+ * @param npk (OUT) Number of kpoint groups.
+ * @param npb (OUT) Number of band groups.
+ * @param npd (OUT) Number of domain groups.
+ **/
+void dims_divide_skbd(
+    const int Nspin, const int Nk, const int Ns,
+    const int *gridsizes, const int np,
+    int *nps, int *npk, int *npb, int *npd)
+{
+    if (Nspin != 1 && Nspin != 2)
+        exit(1);
+
+    int nps_cur, npk_cur, npb_cur, npd_cur;
+    double weight_cur;
+    // start from the naive assignment
+    nps_cur = min(Nspin,np);
+    npk_cur = min(Nk,np/nps_cur);
+    npb_cur = min(Ns,np/(nps_cur*npk_cur));
+    npd_cur = np/(nps_cur*npk_cur*npb_cur);
+    weight_cur = skbd_weighted_efficiency(Nspin,Nk,Ns,gridsizes,np,nps_cur,npk_cur,npb_cur,npd_cur);
+
+
+    // try npspin = 1
+    int nps_1, npk_1, npb_1, npd_1;
+    nps_1 = 1;
+    dims_divide_kbd(Nk,Ns,gridsizes,np/(nps_1),&npk_1,&npb_1,&npd_1);
+    // double weight_1 = work_load_efficiency(Nspin*Nk,Ns,np,nps_1*npk_1,npb_1,npd_1);
+    double weight_1;
+    weight_1 = skbd_weighted_efficiency(Nspin,Nk,Ns,gridsizes,np,nps_1,npk_1,npb_1,npd_1);
+    if (weight_1 > weight_cur) {
+        nps_cur = nps_1;
+        npk_cur = npk_1;
+        npb_cur = npb_1;
+        npd_cur = npd_1;
+        weight_cur = weight_1;
+    }
+
+    // if spin-polarized, also try npspin = 2 and see if it gives a better result
+    if (Nspin > 1 && np > 1) {
+        int nps_2, npk_2, npb_2, npd_2;
+        nps_2 = 2;
+        dims_divide_kbd(Nk,Ns,gridsizes,np/(nps_2),&npk_2,&npb_2,&npd_2);
+        double weight_2;
+        weight_2 = skbd_weighted_efficiency(Nspin,Nk,Ns,gridsizes,np,nps_2,npk_2,npb_2,npd_2);
+        if (weight_2 >= weight_cur) {
+            nps_cur = nps_2;
+            npk_cur = npk_2;
+            npb_cur = npb_2;
+            npd_cur = npd_2;
+            weight_cur = weight_2;
+        }
+    }
+
+    *nps = nps_cur;
+    *npk = npk_cur;
+    *npb = npb_cur;
+    *npd = npd_cur;
+}
+
+
+
+
+
 /**
  * @brief This function sets up the input structures needed for the function D2D(),
  *        which transfers data from one domain decomposition to another domain
diff --git a/src/stress.c b/src/stress.c
index e4a96c15..2e0f51aa 100644
--- a/src/stress.c
+++ b/src/stress.c
@@ -1163,10 +1163,6 @@ void Calculate_nonlocal_kinetic_stress(SPARC_OBJ *pSPARC)
                 stress_k[3] += dpsi2_dpsi2 * g_nk;
             }
-            stress_k[0] *= -(2.0/pSPARC->Nspin);
-            stress_k[1] *= -(2.0/pSPARC->Nspin);
-            stress_k[3] *= -(2.0/pSPARC->Nspin);
-
             Gradient_vectors_dir(pSPARC, DMnd, pSPARC->DMVertices_dmcomm, ncol, 0.0, pSPARC->Xorb+spn_i*size_s, dpsi_full+spn_i*size_s, 2, pSPARC->dmcomm);
             for(n = 0; n < ncol; n++){
                 dpsi_ptr = pSPARC->Yorb + spn_i * size_s + n * DMnd; // dpsi_1
@@ -1180,8 +1176,6 @@ void Calculate_nonlocal_kinetic_stress(SPARC_OBJ *pSPARC)
                 stress_k[2] += dpsi1_dpsi3 * g_nk;
                 stress_k[5] += dpsi3_dpsi3 * g_nk;
             }
-            stress_k[2] *= -(2.0/pSPARC->Nspin);
-            stress_k[5] *= -(2.0/pSPARC->Nspin);
         } else if(dim == 1){
             for(n = 0; n < ncol; n++){
                 dpsi_ptr = pSPARC->Yorb + spn_i * size_s + n * DMnd; // dpsi_2
@@ -1192,12 +1186,18 @@ void Calculate_nonlocal_kinetic_stress(SPARC_OBJ *pSPARC)
                 }
                 stress_k[4] += dpsi2_dpsi3 * pSPARC->occ[spn_i*Ns + n + pSPARC->band_start_indx];
             }
-            stress_k[4] *= -(2.0/pSPARC->Nspin);
         }
         count++;
     }
 }
 
+    stress_k[0] *= -(2.0/pSPARC->Nspin);
+    stress_k[1] *= -(2.0/pSPARC->Nspin);
+    stress_k[2] *= -(2.0/pSPARC->Nspin);
+    stress_k[3] *= -(2.0/pSPARC->Nspin);
+    stress_k[4] *= -(2.0/pSPARC->Nspin);
+    stress_k[5] *= -(2.0/pSPARC->Nspin);
+
     free(dpsi_full);
 
     if (pSPARC->npNd > 1) {
diff --git a/src/tools.c b/src/tools.c
index 79d4ddd9..f5b41c89 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -290,6 +290,39 @@ Factors *factor( int num, Factors *fctrs)
 
 
 
+void sorted_factor( int num, Factors *fctrs) {
+    // call factor() to compute the factor list
+    factor(num, fctrs);
+
+    short len = fctrs->count;
+    int *new_list = (int *)malloc(len * sizeof(int));
+
+    // the returned list comes in pairs (f1,f2), (f3,f4), ..., with even indices
+    // increasing and odd indices decreasing; merge them into ascending order
+    // copy the even-indexed factors into the 1st half
+    for (int i = 0; i < (len+1)/2; i++) {
+        new_list[i] = fctrs->list[2*i];
+    }
+
+    short ind = len-1;
+    // copy the odd-indexed factors into the 2nd half, in reverse
+    for (int i = 1; i < len; i+=2) {
+        new_list[ind--] = fctrs->list[i];
+    }
+
+    free(fctrs->list);
+    fctrs->list = new_list;
+
+}
+
+
+// Equivalent to ceil(x / (double)y), where x, y are positive integers
+int ceil_div(const unsigned int x, const unsigned int y)
+{
+    return (x + y - 1) / y;
+}
+
+
 /**
  * @brief Calculates derivatives of a tabulated function required for spline interpolation.
  */
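For reference, a minimal usage sketch (not part of the patch) of the new auto-selection entry point. The prototype follows the declaration added to parallelization.h; the driver and its problem sizes are hypothetical, and the program must be linked against parallelization.c and its dependencies. Setup_Comms performs this call automatically when npspin, npkpt, npband, and the domain dimensions are all left at 0.

    #include <stdio.h>

    /* prototype from src/include/parallelization.h (added by this patch) */
    void dims_divide_skbd(const int Nspin, const int Nk, const int Ns,
                          const int *gridsizes, const int np,
                          int *nps, int *npk, int *npb, int *npd);

    int main(void)
    {
        int gridsizes[3] = {64, 64, 64};   /* hypothetical real-space grid */
        int nps, npk, npb, npd;
        /* spin-polarized run: 2 spin channels, 4 k-points, 20 states, 48 processes */
        dims_divide_skbd(2, 4, 20, gridsizes, 48, &nps, &npk, &npb, &npd);
        printf("npspin=%d npkpt=%d npband=%d npdomain=%d\n", nps, npk, npb, npd);
        return 0;
    }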