#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif

#include "CombBLAS/CombBLAS.h"
// Semiring used by the SpMV calls below: "multiply" selects its second operand
// and "add" takes the minimum, so an SpMV over it returns, for every vertex,
// the minimum value of the input vector over that vertex's neighbors.
template <typename T1, typename T2>
struct Select2ndMinSR
{
    typedef typename promote_trait<T1, T2>::T_promote T_promote;

    static T_promote id() { return std::numeric_limits<T_promote>::max(); };
    static MPI_Op mpi_op() { return MPI_MIN; };

    static T_promote add(const T_promote& arg1, const T_promote& arg2)
    {
        return std::min(arg1, arg2);
    }
    static T_promote multiply(const T1& arg1, const T2& arg2)
    {
        return static_cast<T_promote>(arg2);
    }
};
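// Illustrative only: a minimal sketch (not part of CombBLAS) of what one SpMV
// step over this (min, select2nd) semiring computes on a toy CSR graph. For
// every vertex i the result is the minimum x[j] over its neighbors j, which is
// the "minimum neighboring parent" the hooking steps below rely on. The names
// toy_min_neighbor, rowptr and colidx are hypothetical.
#include <vector>
#include <limits>
#include <algorithm>
#include <cstdint>

static std::vector<int64_t> toy_min_neighbor(const std::vector<int64_t>& rowptr,
                                             const std::vector<int64_t>& colidx,
                                             const std::vector<int64_t>& x)
{
    std::vector<int64_t> y(x.size(), std::numeric_limits<int64_t>::max()); // semiring id()
    for (size_t i = 0; i + 1 < rowptr.size(); ++i)
        for (int64_t k = rowptr[i]; k < rowptr[i + 1]; ++k)
            y[i] = std::min(y[i], x[colidx[k]]);   // add = min, multiply = select2nd
    return y;
}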
// Shared-memory prefix sum of A into B, block-partitioned across OpenMP threads:
// each thread scans its own block, the per-block offsets are combined serially,
// then every block is shifted by its offset.
template <class T, class I>
void omp_par_scan(T* A, T* B, I cnt)
{
    int p = omp_get_max_threads();
    I step_size = cnt / p;

#pragma omp parallel for
    for (int i = 0; i < p; i++)
    {
        int start = i * step_size;
        int end   = start + step_size;
        for (I j = start + 1; j < end; j++)
            B[j] = B[j - 1] + A[j - 1];
    }

    std::vector<T> sum(p);
    sum[0] = 0;
    for (int i = 1; i < p; i++)
        sum[i] = sum[i - 1] + B[i * step_size - 1] + A[i * step_size - 1];

#pragma omp parallel for
    for (int i = 1; i < p; i++)
    {
        int start = i * step_size;
        int end   = start + step_size;
        for (I j = start; j < end; j++)
            B[j] += sum[i];
    }
}
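// Illustrative only: a minimal driver for the block scan above, assuming the
// reconstruction of omp_par_scan given here. scan_demo is a hypothetical name.
#include <omp.h>
#include <cstdio>
#include <vector>

static void scan_demo()
{
    omp_set_num_threads(2);                       // two blocks of four elements each
    std::vector<long> a = {3, 1, 4, 1, 5, 9, 2, 6};
    std::vector<long> b(a.size(), 0);             // b must start zeroed; b[0] stays 0
    omp_par_scan(a.data(), b.data(), (long)a.size());
    for (long v : b) std::printf("%ld ", v);      // exclusive prefix sums of a
    std::printf("\n");
}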
// k-way staged personalized all-to-all, used instead of a single MPI_Alltoallv at
// large process counts: every outgoing block is prefixed with its byte count and
// source rank, the blocks are exchanged in log_k(np) sendrecv rounds between
// shrinking process ranges, and finally unpacked into the caller's receive buffer.
template <typename T>
int Mpi_Alltoallv_kway(T* sbuff_, int* s_cnt_, int* sdisp_,
                       T* rbuff_, int* r_cnt_, int* rdisp_, MPI_Comm c, int kway = 2)
{
    int np, pid;
    MPI_Comm_size(c, &np);
    MPI_Comm_rank(c, &pid);
    if (np == 1 || kway < 2)   // nothing to stage: defer to the library call
        return MPI_Alltoallv(sbuff_, s_cnt_, sdisp_, MPIType<T>(),
                             rbuff_, r_cnt_, rdisp_, MPIType<T>(), c);
    MPI_Status status;

    // pack every outgoing block as [byte count, source pid, payload]
    std::vector<int> s_cnt(np);
#pragma omp parallel for
    for (int i = 0; i < np; i++)
        s_cnt[i] = s_cnt_[i] * sizeof(T) + 2 * sizeof(int);
    std::vector<int> sdisp(np);
    sdisp[0] = 0;
    omp_par_scan(&s_cnt[0], &sdisp[0], np);

    char* sbuff = new char[sdisp[np - 1] + s_cnt[np - 1]];
#pragma omp parallel for
    for (int i = 0; i < np; i++)
    {
        ((int*)&sbuff[sdisp[i]])[0] = s_cnt[i];
        ((int*)&sbuff[sdisp[i]])[1] = pid;
        memcpy(&sbuff[sdisp[i]] + 2 * sizeof(int), &sbuff_[sdisp_[i]], s_cnt[i] - 2 * sizeof(int));
    }

    int range[2] = {0, np};
    while (range[1] - range[0] > 1)
    {
        if (kway > range[1] - range[0])
            kway = range[1] - range[0];

        // split the current range into kway sub-ranges and find my class
        std::vector<int> new_range(kway + 1);
        for (int i = 0; i <= kway; i++)
            new_range[i] = (range[0] * (kway - i) + range[1] * i) / kway;
        int p_class = (std::upper_bound(&new_range[0], &new_range[kway], pid) - &new_range[0] - 1);
        int new_np  = new_range[p_class + 1] - new_range[p_class];
        int new_pid = pid - new_range[p_class];

        // exchange message counts with one partner in every sub-range
        std::vector<int> r_cnt    (new_np * kway, 0);
        std::vector<int> r_cnt_ext(new_np * kway, 0);
        for (int i = 0; i < kway; i++)
        {
            int cmp_np  = new_range[i + 1] - new_range[i];
            int partner = (new_pid < cmp_np ? new_range[i] + new_pid : new_range[i + 1] - 1);
            assert((new_pid < cmp_np ? true : new_range[i] + new_pid == new_range[i + 1]));
            MPI_Sendrecv(&s_cnt[new_range[i] - new_range[0]], cmp_np, MPI_INT, partner, 0,
                         &r_cnt[new_np * i], new_np, MPI_INT, partner, 0, c, &status);

            // extra exchange when the partner sub-range is larger than ours
            if (new_pid == new_np - 1 && cmp_np > new_np)
            {
                int partner = new_range[i + 1] - 1;
                std::vector<int> s_cnt_ext(cmp_np, 0);
                MPI_Sendrecv(&s_cnt_ext[0], cmp_np, MPI_INT, partner, 0,
                             &r_cnt_ext[new_np * i], new_np, MPI_INT, partner, 0, c, &status);
            }
        }

        // allocate receive buffers from the exchanged counts
        std::vector<int> rdisp    (new_np * kway, 0);
        std::vector<int> rdisp_ext(new_np * kway, 0);
        omp_par_scan(&r_cnt[0], &rdisp[0], new_np * kway);
        omp_par_scan(&r_cnt_ext[0], &rdisp_ext[0], new_np * kway);
        int rbuff_size, rbuff_size_ext;
        char *rbuff, *rbuff_ext;
        rbuff_size     = rdisp    [new_np * kway - 1] + r_cnt    [new_np * kway - 1];
        rbuff_size_ext = rdisp_ext[new_np * kway - 1] + r_cnt_ext[new_np * kway - 1];
        rbuff     = new char[rbuff_size];
        rbuff_ext = new char[rbuff_size_ext];

        // exchange the data, pairing sub-ranges so both partners agree on the order
        int my_block = kway;
        while (pid < new_range[my_block]) my_block--;
        for (int i_ = 0; i_ <= kway / 2; i_++)
        {
            int i1 = (my_block + i_) % kway;
            int i2 = (my_block + kway - i_) % kway;
            for (int j = 0; j < (i_ == 0 || i_ == kway / 2 ? 1 : 2); j++)
            {
                int i = (i_ == 0 ? i1 : ((j + my_block / i_) % 2 ? i1 : i2));
                int cmp_np  = new_range[i + 1] - new_range[i];
                int partner = (new_pid < cmp_np ? new_range[i] + new_pid : new_range[i + 1] - 1);

                int send_dsp      = sdisp[new_range[i    ] - new_range[0]    ];
                int send_dsp_last = sdisp[new_range[i + 1] - new_range[0] - 1];
                int send_cnt      = s_cnt[new_range[i + 1] - new_range[0] - 1] + send_dsp_last - send_dsp;
                MPI_Sendrecv(&sbuff[send_dsp], send_cnt, MPI_BYTE, partner, 0,
                             &rbuff[rdisp[new_np * i]],
                             r_cnt[new_np * (i + 1) - 1] + rdisp[new_np * (i + 1) - 1] - rdisp[new_np * i],
                             MPI_BYTE, partner, 0, c, &status);

                if (new_pid == new_np - 1 && cmp_np > new_np)
                {
                    int partner = new_range[i + 1] - 1;
                    std::vector<int> s_cnt_ext(cmp_np, 0);
                    MPI_Sendrecv(NULL, 0, MPI_BYTE, partner, 0,
                                 &rbuff_ext[rdisp_ext[new_np * i]],
                                 r_cnt_ext[new_np * (i + 1) - 1] + rdisp_ext[new_np * (i + 1) - 1] - rdisp_ext[new_np * i],
                                 MPI_BYTE, partner, 0, c, &status);
                }
            }
        }

        // regroup received blocks by destination within the new (smaller) range
        if (sbuff != NULL) delete[] sbuff;
        sbuff = new char[rbuff_size + rbuff_size_ext];

        std::vector<int> cnt_new (2 * new_np * kway, 0);
        std::vector<int> disp_new(2 * new_np * kway, 0);
        for (int i = 0; i < new_np; i++)
            for (int j = 0; j < kway; j++)
            {
                cnt_new[(i * 2    ) * kway + j] = r_cnt    [j * new_np + i];
                cnt_new[(i * 2 + 1) * kway + j] = r_cnt_ext[j * new_np + i];
            }
        omp_par_scan(&cnt_new[0], &disp_new[0], 2 * new_np * kway);

#pragma omp parallel for
        for (int i = 0; i < new_np; i++)
            for (int j = 0; j < kway; j++)
            {
                memcpy(&sbuff[disp_new[(i * 2    ) * kway + j]], &rbuff    [rdisp    [j * new_np + i]], r_cnt    [j * new_np + i]);
                memcpy(&sbuff[disp_new[(i * 2 + 1) * kway + j]], &rbuff_ext[rdisp_ext[j * new_np + i]], r_cnt_ext[j * new_np + i]);
            }
        if (rbuff     != NULL) delete[] rbuff;
        if (rbuff_ext != NULL) delete[] rbuff_ext;

        // shrink the staging counts/displacements to the new range
        s_cnt.resize(new_np, 0);
        sdisp.resize(new_np);
        for (int i = 0; i < new_np; i++)
        {
            s_cnt[i] = 0;
            for (int j = 0; j < 2 * kway; j++)
                s_cnt[i] += cnt_new[i * 2 * kway + j];
            sdisp[i] = disp_new[i * 2 * kway];
        }
        range[0] = new_range[p_class];
        range[1] = new_range[p_class + 1];
    }

    // unpack: every block carries its byte count and source pid in its header
    std::vector<char*> buff_ptr(np);
    char* tmp_ptr = sbuff;
    for (int i = 0; i < np; i++)
    {
        int& blk_size = ((int*)tmp_ptr)[0];
        buff_ptr[i] = tmp_ptr;
        tmp_ptr += blk_size;
    }
#pragma omp parallel for
    for (int i = 0; i < np; i++)
    {
        int& blk_size = ((int*)buff_ptr[i])[0];
        int& src_pid  = ((int*)buff_ptr[i])[1];
        assert(blk_size - 2 * sizeof(int) <= r_cnt_[src_pid] * sizeof(T));
        memcpy(&rbuff_[rdisp_[src_pid]], buff_ptr[i] + 2 * sizeof(int), blk_size - 2 * sizeof(int));
    }
    if (sbuff != NULL) delete[] sbuff;
    return 0;
}
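// Illustrative only: the counts/displacements contract that Mpi_Alltoallv_kway
// (and plain MPI_Alltoallv) expects, shown with plain MPI. Every process sends
// one int to every rank; kway_demo is a hypothetical name.
#include <mpi.h>
#include <vector>

static void kway_demo(MPI_Comm comm)
{
    int np, pid;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &pid);

    std::vector<int> s_cnt(np, 1), r_cnt(np, 1);        // one element per partner
    std::vector<int> sdisp(np, 0), rdisp(np, 0);
    for (int i = 1; i < np; ++i) {                      // exclusive prefix sums
        sdisp[i] = sdisp[i - 1] + s_cnt[i - 1];
        rdisp[i] = rdisp[i - 1] + r_cnt[i - 1];
    }
    std::vector<int> sendbuf(np, pid), recvbuf(np, -1);
    MPI_Alltoallv(sendbuf.data(), s_cnt.data(), sdisp.data(), MPI_INT,
                  recvbuf.data(), r_cnt.data(), rdisp.data(), MPI_INT, comm);
    // recvbuf[i] now holds rank i's payload; the k-way variant produces the same
    // result but exchanges the data in log_k(np) sendrecv rounds.
}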
// Dispatcher: chooses between a sparse point-to-point exchange and a dense
// MPI_Alltoallv, based on how many nonzero messages actually need to be exchanged.
template <typename T>
int Mpi_Alltoallv(T* sbuff, int* s_cnt, int* sdisp,
                  T* rbuff, int* r_cnt, int* rdisp, MPI_Comm comm)
{
    int nprocs, rank;
    MPI_Comm_size(comm, &nprocs);
    MPI_Comm_rank(comm, &rank);

    int commCnt = 0;
    for (int i = 0; i < nprocs; i++)
    {
        if (i == rank) continue;
        if (s_cnt[i] > 0) commCnt++;
        if (r_cnt[i] > 0) commCnt++;
    }
    int totalCommCnt = 0;
    MPI_Allreduce(&commCnt, &totalCommCnt, 1, MPI_INT, MPI_SUM, comm);

    if (totalCommCnt < 2 * log2(nprocs))   // few messages overall: go point-to-point
        return Mpi_Alltoallv_sparse(sbuff, s_cnt, sdisp, rbuff, r_cnt, rdisp, comm);

    return MPI_Alltoallv(sbuff, s_cnt, sdisp, MPIType<T>(),
                         rbuff, r_cnt, rdisp, MPIType<T>(), comm);
}
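// Illustrative only: a minimal sketch, in plain MPI, of the "sparse" strategy the
// dispatcher above can fall back to: post receives and sends only for partners
// with nonzero counts. sparse_exchange_demo is a hypothetical name.
#include <mpi.h>
#include <vector>

static void sparse_exchange_demo(int* sendbuf, const int* s_cnt, const int* sdisp,
                                 int* recvbuf, const int* r_cnt, const int* rdisp,
                                 MPI_Comm comm)
{
    int np, rank;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);

    std::vector<MPI_Request> reqs;
    for (int i = 0; i < np; ++i)                   // receives first
        if (i != rank && r_cnt[i] > 0) {
            reqs.emplace_back();
            MPI_Irecv(recvbuf + rdisp[i], r_cnt[i], MPI_INT, i, 0, comm, &reqs.back());
        }
    for (int i = 0; i < np; ++i)                   // then sends
        if (i != rank && s_cnt[i] > 0) {
            reqs.emplace_back();
            MPI_Isend(sendbuf + sdisp[i], s_cnt[i], MPI_INT, i, 0, comm, &reqs.back());
        }
    for (int k = 0; k < s_cnt[rank] && k < r_cnt[rank]; ++k)   // local part is a copy
        recvbuf[rdisp[rank] + k] = sendbuf[sdisp[rank] + k];
    MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
}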
// Decides whether servicing the requested indices is cheaper via per-owner
// broadcasts of whole local arrays; if so, every owner starts a nonblocking
// broadcast of its data and all processes keep a copy. Returns the number of
// broadcasts that were started.
template <class IT, class NT>
int replicate(const FullyDistVec<IT, NT> dense, FullyDistSpVec<IT, IT> ri,
              vector<vector<NT>>& bcastBuffer)
{
    std::shared_ptr<CommGrid> commGrid = dense.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int myrank;
    MPI_Comm_rank(World, &myrank);

    vector<int> sendcnt(nprocs, 0);
    vector<int> recvcnt(nprocs, 0);

    std::vector<IT> rinum = ri.GetLocalNum();
    IT riloclen = rinum.size();
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = dense.Owner(rinum[i], locind);
        sendcnt[owner]++;
    }
    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    IT totrecv = std::accumulate(recvcnt.begin(), recvcnt.end(), static_cast<IT>(0));

    // advertise my local length only if a broadcast would be cheaper than
    // answering totrecv individual requests
    double broadcast_cost = dense.MyLocLength() * log2(nprocs);
    vector<IT> bcastcnt(nprocs, 0);
    IT bcastsize = 0;
    if (broadcast_cost < totrecv)
        bcastsize = dense.MyLocLength();
    MPI_Allgather(&bcastsize, 1, MPIType<IT>(), bcastcnt.data(), 1, MPIType<IT>(), World);

    int nbcast = 0;
    for (int i = 0; i < nprocs; i++)
        if (bcastcnt[i] > 0) nbcast++;

    MPI_Request* requests = new MPI_Request[nbcast];
    MPI_Status*  statuses = new MPI_Status[nbcast];
    int ibcast = 0;
    const NT* arr = dense.GetLocArr();
    for (int i = 0; i < nprocs; i++)
    {
        if (bcastcnt[i] > 0)
        {
            bcastBuffer[i].resize(bcastcnt[i]);
            if (i == myrank)   // root supplies its own local array
                std::copy(arr, arr + bcastcnt[i], bcastBuffer[i].begin());
            MPI_Ibcast(bcastBuffer[i].data(), bcastcnt[i], MPIType<NT>(), i, World, &requests[ibcast++]);
        }
    }
    MPI_Waitall(nbcast, requests, statuses);
    DeleteAll(requests, statuses);
    return nbcast;
}
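// Illustrative only: the broadcast pattern replicate() relies on, in plain MPI.
// Every rank starts a nonblocking broadcast of its whole block and all ranks
// keep every block. bcast_all_blocks is a hypothetical name.
#include <mpi.h>
#include <vector>

static std::vector<std::vector<double>> bcast_all_blocks(const std::vector<double>& myblock,
                                                         MPI_Comm comm)
{
    int np, rank;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);

    int mysize = (int)myblock.size();             // share block sizes first
    std::vector<int> sizes(np);
    MPI_Allgather(&mysize, 1, MPI_INT, sizes.data(), 1, MPI_INT, comm);

    std::vector<std::vector<double>> blocks(np);
    std::vector<MPI_Request> reqs(np);
    for (int i = 0; i < np; ++i) {
        blocks[i].resize(sizes[i]);
        if (i == rank) blocks[i] = myblock;       // root supplies its own data
        MPI_Ibcast(blocks[i].data(), sizes[i], MPI_DOUBLE, i, comm, &reqs[i]);
    }
    MPI_Waitall(np, reqs.data(), MPI_STATUSES_IGNORE);
    return blocks;
}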
// Extract dense[ri]: requested indices are bucketed by owner, the owners look the
// values up locally, and the answers are scattered back to their original positions
// via a reverse map. Owners whose data were already replicated are served locally.
template <class IT, class NT>
FullyDistSpVec<IT, NT> Extract(const FullyDistVec<IT, NT> dense, FullyDistSpVec<IT, IT> ri)
{
    double ts = MPI_Wtime();
    std::ostringstream outs;
    outs << " Extract timing: ";

    std::shared_ptr<CommGrid> commGrid = dense.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    if (!(*commGrid == *(ri.getcommgrid())))
        std::cout << "Grids are not comparable for dense vector subsref" << std::endl;

    vector<vector<NT>> bcastBuffer(nprocs);
    double t1 = MPI_Wtime();
    int nbcast = replicate(dense, ri, bcastBuffer);
    double bcast = MPI_Wtime() - t1;
    outs << "bcast ( " << nbcast << " ): " << bcast << " ";

    std::vector<std::vector<IT>> data_req(nprocs);
    std::vector<std::vector<IT>> revr_map(nprocs);   // where each answer must go

    std::vector<IT> rinum = ri.GetLocalNum();
    IT riloclen = rinum.size();
    std::vector<NT> num(riloclen);
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = dense.Owner(rinum[i], locind);
        if (bcastBuffer[owner].size() == 0)          // owner did not broadcast: request it
        {
            data_req[owner].push_back(locind);
            revr_map[owner].push_back(i);
        }
        else                                         // owner's data are already local
            num[i] = bcastBuffer[owner][locind];
    }

    int* sendcnt = new int[nprocs];
    int* sdispls = new int[nprocs];
    for (int i = 0; i < nprocs; ++i)
        sendcnt[i] = (int)data_req[i].size();
    int* rdispls = new int[nprocs];
    int* recvcnt = new int[nprocs];

    t1 = MPI_Wtime();
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
    double all2ll1 = MPI_Wtime() - t1;
    outs << "all2ll1: " << all2ll1 << " ";

    sdispls[0] = 0;
    rdispls[0] = 0;
    for (int i = 0; i < nprocs - 1; ++i)
    {
        sdispls[i + 1] = sdispls[i] + sendcnt[i];
        rdispls[i + 1] = rdispls[i] + recvcnt[i];
    }
    IT totsend = std::accumulate(sendcnt, sendcnt + nprocs, static_cast<IT>(0));
    IT totrecv = std::accumulate(recvcnt, recvcnt + nprocs, static_cast<IT>(0));

    IT* sendbuf = new IT[totsend];
    for (int i = 0; i < nprocs; ++i)
    {
        std::copy(data_req[i].begin(), data_req[i].end(), sendbuf + sdispls[i]);
        std::vector<IT>().swap(data_req[i]);
    }
    IT* reversemap = new IT[totsend];
    for (int i = 0; i < nprocs; ++i)
    {
        std::copy(revr_map[i].begin(), revr_map[i].end(), reversemap + sdispls[i]);
        std::vector<IT>().swap(revr_map[i]);
    }

    IT* recvbuf = new IT[totrecv];
    t1 = MPI_Wtime();
    Mpi_Alltoallv(sendbuf, sendcnt, sdispls, recvbuf, recvcnt, rdispls, World);
    double all2ll2 = MPI_Wtime() - t1;
    outs << "all2ll2: " << all2ll2 << " ";

    // owners answer the requests from their local array
    NT* databack = new NT[totrecv];
    const NT* arr = dense.GetLocArr();
#pragma omp parallel for
    for (int i = 0; i < totrecv; ++i)
        databack[i] = arr[recvbuf[i]];

    NT* databuf = new NT[totsend];
    t1 = MPI_Wtime();
    Mpi_Alltoallv(databack, recvcnt, rdispls, databuf, sendcnt, sdispls, World);
    double all2ll3 = MPI_Wtime() - t1;
    outs << "all2ll3: " << all2ll3 << " ";

    for (int i = 0; i < totsend; ++i)
        num[reversemap[i]] = databuf[i];
    DeleteAll(sdispls, sendcnt, databuf, reversemap);

    IT globallen = ri.TotalLength();
    double total = MPI_Wtime() - ts;
    outs << "others: " << total - (bcast + all2ll1 + all2ll2 + all2ll3) << " ";
}
// Builds, for every destination process that is cheaper to reach by a reduction
// than by an all-to-all, a full-length buffer of values to be MIN-reduced at that
// destination. Returns the number of nonblocking reductions started.
template <class IT, class NT>
int ReduceAssign(FullyDistSpVec<IT, IT>& ind, FullyDistSpVec<IT, NT>& val,
                 vector<vector<NT>>& reduceBuffer, NT MAX_FOR_REDUCE)
{
    std::shared_ptr<CommGrid> commGrid = ind.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int myrank;
    MPI_Comm_rank(World, &myrank);

    vector<int> sendcnt(nprocs, 0);
    vector<int> recvcnt(nprocs);
    std::vector<std::vector<IT>> indBuf(nprocs);
    std::vector<std::vector<NT>> valBuf(nprocs);

    std::vector<IT> indices = ind.GetLocalNum();
    std::vector<NT> values  = val.GetLocalNum();
    IT riloclen = indices.size();
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = ind.Owner(indices[i], locind);
        indBuf[owner].push_back(locind);
        valBuf[owner].push_back(values[i]);
        sendcnt[owner]++;
    }

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    IT totrecv = std::accumulate(recvcnt.begin(), recvcnt.end(), static_cast<IT>(0));
    double reduceCost = ind.MyLocLength() * log2(nprocs);

    vector<IT> reducecnt(nprocs, 0);
    IT reducesize = 0;
    if (reduceCost < totrecv)            // reduction is cheaper than an all-to-all
        reducesize = ind.MyLocLength();
    MPI_Allgather(&reducesize, 1, MPIType<IT>(), reducecnt.data(), 1, MPIType<IT>(), World);

    int nreduce = 0;
    for (int i = 0; i < nprocs; ++i)
        if (reducecnt[i] > 0) nreduce++;

    MPI_Request* requests = new MPI_Request[nreduce];
    MPI_Status*  statuses = new MPI_Status[nreduce];
    int ireduce = 0;
    for (int i = 0; i < nprocs; ++i)
    {
        if (reducecnt[i] > 0)
        {
            reduceBuffer[i].resize(reducecnt[i], MAX_FOR_REDUCE);
            for (int j = 0; j < sendcnt[i]; j++)   // pre-combine my own updates
                reduceBuffer[i][indBuf[i][j]] = std::min(reduceBuffer[i][indBuf[i][j]], valBuf[i][j]);
            if (i == myrank)
                MPI_Ireduce(MPI_IN_PLACE, reduceBuffer[i].data(), reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
            else
                MPI_Ireduce(reduceBuffer[i].data(), NULL, reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
        }
    }
    MPI_Waitall(nreduce, requests, statuses);
    DeleteAll(requests, statuses);
    return nreduce;
}
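// Illustrative only: the reduce-based assignment idea in plain MPI. Every rank
// builds a full-length buffer for the destination rank, filled with a large
// sentinel except where it has updates, and a MIN reduction at the destination
// combines them. reduce_assign_demo and the sentinel choice are hypothetical.
#include <mpi.h>
#include <vector>
#include <limits>
#include <algorithm>

static std::vector<double> reduce_assign_demo(int dest_len, int dest_rank,
                                              const std::vector<int>& my_idx,
                                              const std::vector<double>& my_val,
                                              MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    const double SENTINEL = std::numeric_limits<double>::max();
    std::vector<double> buf(dest_len, SENTINEL);
    for (size_t k = 0; k < my_idx.size(); ++k)
        buf[my_idx[k]] = std::min(buf[my_idx[k]], my_val[k]);   // local pre-combine

    if (rank == dest_rank)
        MPI_Reduce(MPI_IN_PLACE, buf.data(), dest_len, MPI_DOUBLE, MPI_MIN, dest_rank, comm);
    else
        MPI_Reduce(buf.data(), nullptr, dest_len, MPI_DOUBLE, MPI_MIN, dest_rank, comm);

    return buf;   // meaningful on dest_rank; entries != SENTINEL were assigned
}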
// Same reduction-based assignment, but for a single scalar value.
template <class IT, class NT>
int ReduceAssign(FullyDistSpVec<IT, IT>& ind, NT val,
                 vector<vector<NT>>& reduceBuffer, NT MAX_FOR_REDUCE)
{
    std::shared_ptr<CommGrid> commGrid = ind.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int myrank;
    MPI_Comm_rank(World, &myrank);

    vector<int> sendcnt(nprocs, 0);
    vector<int> recvcnt(nprocs);
    std::vector<std::vector<IT>> indBuf(nprocs);

    std::vector<IT> indices = ind.GetLocalNum();
    IT riloclen = indices.size();
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = ind.Owner(indices[i], locind);
        indBuf[owner].push_back(locind);
        sendcnt[owner]++;
    }

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    IT totrecv = std::accumulate(recvcnt.begin(), recvcnt.end(), static_cast<IT>(0));
    double reduceCost = ind.MyLocLength() * log2(nprocs);

    vector<IT> reducecnt(nprocs, 0);
    IT reducesize = 0;
    if (reduceCost < totrecv)
        reducesize = ind.MyLocLength();
    MPI_Allgather(&reducesize, 1, MPIType<IT>(), reducecnt.data(), 1, MPIType<IT>(), World);

    int nreduce = 0;
    for (int i = 0; i < nprocs; ++i)
        if (reducecnt[i] > 0) nreduce++;

    MPI_Request* requests = new MPI_Request[nreduce];
    MPI_Status*  statuses = new MPI_Status[nreduce];
    int ireduce = 0;
    for (int i = 0; i < nprocs; ++i)
    {
        if (reducecnt[i] > 0)
        {
            reduceBuffer[i].resize(reducecnt[i], MAX_FOR_REDUCE);
            for (int j = 0; j < sendcnt[i]; j++)
                reduceBuffer[i][indBuf[i][j]] = val;
            if (i == myrank)
                MPI_Ireduce(MPI_IN_PLACE, reduceBuffer[i].data(), reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
            else
                MPI_Ireduce(reduceBuffer[i].data(), NULL, reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
        }
    }
    MPI_Waitall(nreduce, requests, statuses);
    DeleteAll(requests, statuses);
    return nreduce;
}
// Assign: scatter (index, value) pairs into a distributed vector. Destinations
// that are cheaper to reach by reduction are handled by ReduceAssign; the rest
// receive their updates through all-to-all exchanges.
template <class IT, class NT>
FullyDistSpVec<IT, NT> Assign(FullyDistSpVec<IT, IT>& ind, FullyDistSpVec<IT, NT>& val)
{
    if (ind.getnnz() != val.getnnz())
        SpParHelper::Print("Assign error: Index and value vectors have different size !!!\n");

    IT globallen = ind.TotalLength();
    if (maxInd >= globallen)    // maxInd: largest requested global index
    {
        std::cout << "At least one requested index is larger than the global length" << std::endl;
    }

    double ts = MPI_Wtime();
    std::ostringstream outs;
    outs << " Assign timing: ";

    std::shared_ptr<CommGrid> commGrid = ind.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int* rdispls = new int[nprocs + 1];
    int* recvcnt = new int[nprocs];
    int* sendcnt = new int[nprocs]();    // zero-initialized
    int* sdispls = new int[nprocs + 1];

    vector<vector<NT>> reduceBuffer(nprocs);
    double t1 = MPI_Wtime();
    NT MAX_FOR_REDUCE = static_cast<NT>(globallen);
    int nreduce = ReduceAssign(ind, val, reduceBuffer, MAX_FOR_REDUCE);
    double reduce = MPI_Wtime() - t1;
    outs << "reduce (" << nreduce << "): " << reduce << " ";

    std::vector<std::vector<IT>> indBuf(nprocs);
    std::vector<std::vector<NT>> valBuf(nprocs);

    std::vector<IT> indices = ind.GetLocalNum();
    std::vector<NT> values  = val.GetLocalNum();
    IT riloclen = indices.size();
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = ind.Owner(indices[i], locind);
        if (reduceBuffer[owner].size() == 0)     // not handled by the reduction path
        {
            indBuf[owner].push_back(locind);
            valBuf[owner].push_back(values[i]);
            sendcnt[owner]++;
        }
    }

    t1 = MPI_Wtime();
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
    double all2ll1 = MPI_Wtime() - t1;
    outs << "all2ll1: " << all2ll1 << " ";

    sdispls[0] = 0;
    rdispls[0] = 0;
    for (int i = 0; i < nprocs; ++i)
    {
        sdispls[i + 1] = sdispls[i] + sendcnt[i];
        rdispls[i + 1] = rdispls[i] + recvcnt[i];
    }
    IT totsend = sdispls[nprocs];
    IT totrecv = rdispls[nprocs];

    vector<IT> sendInd(totsend);
    vector<NT> sendVal(totsend);
    for (int i = 0; i < nprocs; ++i)
    {
        std::copy(indBuf[i].begin(), indBuf[i].end(), sendInd.begin() + sdispls[i]);
        std::vector<IT>().swap(indBuf[i]);
        std::copy(valBuf[i].begin(), valBuf[i].end(), sendVal.begin() + sdispls[i]);
        std::vector<NT>().swap(valBuf[i]);
    }

    vector<IT> recvInd(totrecv);
    vector<NT> recvVal(totrecv);
    t1 = MPI_Wtime();
    Mpi_Alltoallv(sendInd.data(), sendcnt, sdispls, recvInd.data(), recvcnt, rdispls, World);
    double all2ll2 = MPI_Wtime() - t1;
    outs << "all2ll2: " << all2ll2 << " ";

    t1 = MPI_Wtime();
    Mpi_Alltoallv(sendVal.data(), sendcnt, sdispls, recvVal.data(), recvcnt, rdispls, World);
    double all2ll3 = MPI_Wtime() - t1;
    outs << "all2ll3: " << all2ll3 << " ";
    DeleteAll(sdispls, rdispls, sendcnt, recvcnt);

    // updates that arrived via the reduction path: every slot that was actually
    // assigned (smaller than the sentinel) becomes a local (index, value) pair
    int myrank;
    MPI_Comm_rank(World, &myrank);
    if (reduceBuffer[myrank].size() > 0)
    {
        for (int i = 0; i < reduceBuffer[myrank].size(); i++)
        {
            if (reduceBuffer[myrank][i] < MAX_FOR_REDUCE)
            {
                recvInd.push_back(i);
                recvVal.push_back(reduceBuffer[myrank][i]);
            }
        }
    }

    double total = MPI_Wtime() - ts;
    outs << "others: " << total - (reduce + all2ll1 + all2ll2 + all2ll3) << " ";
}
// Assign overload for a single value: every requested index receives the same
// scalar val, so only the indices need to be communicated.
template <class IT, class NT>
FullyDistSpVec<IT, NT> Assign(FullyDistSpVec<IT, IT>& ind, NT val)
{
    IT globallen = ind.TotalLength();
    if (maxInd >= globallen)    // maxInd: largest requested global index
    {
        std::cout << "At least one requested index is larger than the global length" << std::endl;
    }

    double ts = MPI_Wtime();
    std::ostringstream outs;
    outs << " Assign timing: ";

    std::shared_ptr<CommGrid> commGrid = ind.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int* rdispls = new int[nprocs + 1];
    int* recvcnt = new int[nprocs];
    int* sendcnt = new int[nprocs]();    // zero-initialized
    int* sdispls = new int[nprocs + 1];

    vector<vector<NT>> reduceBuffer(nprocs);
    double t1 = MPI_Wtime();
    NT MAX_FOR_REDUCE = static_cast<NT>(globallen);
    int nreduce = ReduceAssign(ind, val, reduceBuffer, MAX_FOR_REDUCE);
    double reduce = MPI_Wtime() - t1;
    outs << "reduce ( " << nreduce << " ): " << reduce << " ";

    std::vector<std::vector<IT>> indBuf(nprocs);
    std::vector<IT> indices = ind.GetLocalNum();
    IT riloclen = indices.size();
    for (IT i = 0; i < riloclen; ++i)
    {
        IT locind;
        int owner = ind.Owner(indices[i], locind);
        if (reduceBuffer[owner].size() == 0)     // not handled by the reduction path
        {
            indBuf[owner].push_back(locind);
            sendcnt[owner]++;
        }
    }

    t1 = MPI_Wtime();
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
    double all2ll1 = MPI_Wtime() - t1;
    outs << "all2ll1: " << all2ll1 << " ";

    sdispls[0] = 0;
    rdispls[0] = 0;
    for (int i = 0; i < nprocs; ++i)
    {
        sdispls[i + 1] = sdispls[i] + sendcnt[i];
        rdispls[i + 1] = rdispls[i] + recvcnt[i];
    }
    IT totsend = sdispls[nprocs];
    IT totrecv = rdispls[nprocs];

    vector<IT> sendInd(totsend);
    for (int i = 0; i < nprocs; ++i)
    {
        std::copy(indBuf[i].begin(), indBuf[i].end(), sendInd.begin() + sdispls[i]);
        std::vector<IT>().swap(indBuf[i]);
    }

    vector<IT> recvInd(totrecv);
    t1 = MPI_Wtime();
    Mpi_Alltoallv(sendInd.data(), sendcnt, sdispls, recvInd.data(), recvcnt, rdispls, World);
    double all2ll2 = MPI_Wtime() - t1;
    outs << "all2ll2: " << all2ll2 << " ";
    outs << "all2ll3: " << 0 << " ";
    DeleteAll(sdispls, rdispls, sendcnt, recvcnt);

    // entries that arrived through the reduction path become local (index, val) pairs
    int myrank;
    MPI_Comm_rank(World, &myrank);
    vector<NT> recvVal(totrecv);
    if (reduceBuffer[myrank].size() > 0)
    {
        for (int i = 0; i < reduceBuffer[myrank].size(); i++)
        {
            if (reduceBuffer[myrank][i] < MAX_FOR_REDUCE)
            {
                recvInd.push_back(i);
                recvVal.push_back(val);
            }
        }
    }

    double total = MPI_Wtime() - ts;
    outs << "others: " << total - (reduce + all2ll1 + all2ll2) << " ";
}
// Cheaper star check used right after hooking: only the vertices touched by the
// hooks (and vertices pointing to them) can change their star status.
template <typename IT, typename NT, typename DER>
void StarCheckAfterHooking(const SpParMat<IT, NT, DER>& A, FullyDistVec<IT, IT>& parent,
                           FullyDistVec<IT, short>& star, FullyDistSpVec<IT, IT> condhooks,
                           bool isStar2StarHookPossible)
{
    if (isStar2StarHookPossible)

        // select the parent of every vertex (tail of an EWiseApply over stars and parents)
        [](short isStar, IT p) { return p; },
        [](short isStar, IT p) { return true; },
        false, static_cast<short>(0));

    star.Set(isParentStar);
}
// Full star check used at the start of an iteration: compute each vertex's
// grandparent, mark vertices (and the roots above them) whose grandparent differs
// from their parent as NONSTAR, and let level-1 vertices inherit the verdict of
// their parents.
template <typename IT>
void StarCheck(FullyDistVec<IT, IT>& parents, FullyDistVec<IT, short>& stars)
{
        [](short isStar, IT p) { return p; },
        [](short isStar, IT p) { return true; },
        false, static_cast<short>(0));

        [](short isStar, IT p) { return p; },
        [](short isStar, IT p) { return true; },
        false, static_cast<short>(0));

    // keep only vertices whose grandparent differs from their parent
    gpOfNonStars_pindexed = EWiseApply<IT>(temp, gpOfNonStars_pindexed,
        [](IT p, IT gp) { return gp; },
        [](IT p, IT gp) { return p != gp; },
        false, false, static_cast<IT>(0), static_cast<IT>(0));

    // those vertices are not in stars
    stars.EWiseApply(gpOfNonStars_pindexed,
        [](short isStar, IT idx) { return static_cast<short>(NONSTAR); },

        [](IT p, short isStar) { return p; },
        [](IT p, short isStar) { return isStar == NONSTAR; },
        false, static_cast<IT>(0));

    stars.Set(rootsOfNonStarsIdx);

        [](short s, short isStar) { return static_cast<IT>(s); },
        [](short s, short isStar) { return isStar == STAR; },
        false, static_cast<short>(0));

    // level-1 vertices inherit their parents' star status
    pOflevel1V = EWiseApply<IT>(pOflevel1V, parents,
        [](IT s, IT p) { return p; },
        [](IT s, IT p) { return true; },
        false, static_cast<IT>(0));

    stars.Set(isParentStar);
}
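// Illustrative only: the star test on a plain parent array, done serially. A tree
// is a star when every member points directly to a self-looping root; any vertex
// at depth two or more rules out its whole tree. star_check_demo is hypothetical.
#include <vector>

static std::vector<bool> star_check_demo(const std::vector<int>& parent)
{
    const int n = (int)parent.size();
    std::vector<bool> rootIsStar(n, true);
    for (int v = 0; v < n; ++v) {
        int p = parent[v], gp = parent[p];
        if (gp != p)                    // v sits at depth >= 2; for a depth-2 vertex
            rootIsStar[gp] = false;     // gp is the root, which therefore cannot
    }                                   // head a star
    std::vector<bool> star(n);
    for (int v = 0; v < n; ++v)
        star[v] = (parent[parent[v]] == parent[v]) && rootIsStar[parent[v]];
    return star;
}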
// Conditional hooking: star roots adopt a neighboring parent when that parent
// has a strictly smaller id (the p > mnp filter below).
template <typename IT, typename NT, typename DER>
FullyDistSpVec<IT, IT> ConditionalHook(const SpParMat<IT, NT, DER>& A,
                                       FullyDistVec<IT, IT>& parent,
                                       FullyDistVec<IT, short> stars, int iteration)
{
    double t1 = MPI_Wtime();

    // minimum parent over the neighbors of every vertex
    minNeighborparent = SpMV<Select2ndMinSR<NT, IT>>(A, parent);
    double tspmv = MPI_Wtime() - t1;

    hooksMNP = EWiseApply<IT>(hooksMNP, minNeighborparent,
        [](IT x, IT mnp) { return mnp; },
        [](IT x, IT mnp) { return true; },
        false, static_cast<IT>(0));
    // keep only hooks that strictly decrease the parent
    hooksMNP = EWiseApply<IT>(hooksMNP, parent,
        [](IT mnp, IT p) { return mnp; },
        [](IT mnp, IT p) { return p > mnp; },
        false, static_cast<IT>(0));

    finalhooks = hooksMNP;

        [](IT mnp, IT p) { return true; },
        false, static_cast<IT>(0));
    finalhooks = Assign(hooksP, hooksMNP);

    parent.Set(finalhooks);

    double tall = MPI_Wtime() - t1;
    std::ostringstream outs;
    outs << " Conditional Hooking Time: SpMV: " << tspmv << " Other: " << tall - tspmv;
    return finalhooks;
}
// Unconditional hooking: remaining star roots are hooked onto neighboring trees so
// that progress is guaranteed. A sparse SpMV over just the non-star parents is used
// when non-stars are rare, a dense SpMV over a masked parent vector otherwise.
template <typename IT, typename NT, typename DER>
FullyDistSpVec<IT, IT> UnconditionalHook2(const SpParMat<IT, NT, DER>& A,
                                          FullyDistVec<IT, IT>& parents,
                                          FullyDistVec<IT, short> stars)
{
    double ts = MPI_Wtime();
    double tspmv = 0;
    double t1 = MPI_Wtime();
    string spmv = "dense";
    IT nNonStars = stars.Reduce(std::plus<IT>(), static_cast<IT>(0),
                                [](short isStar) { return static_cast<IT>(isStar == NONSTAR); });
    IT nv = A.getnrow();

    if (nNonStars * 50 < nv)     // few non-stars: sparse SpMV on just their parents
    {
        spmv = "sparse";

            [](short isStar, IT p) { return p; },
            [](short isStar, IT p) { return true; },
            false, static_cast<IT>(0));

        t1 = MPI_Wtime();
        SpMV<Select2ndMinSR<NT, IT>>(A, pOfNonStars, hooks, false);
        tspmv = MPI_Wtime() - t1;

        hooks = EWiseApply<IT>(hooks, stars,
            [](IT mnp, short isStar) { return mnp; },
            [](IT mnp, short isStar) { return isStar == STAR; },
            false, static_cast<IT>(0));
    }
    else                         // many non-stars: dense SpMV on a masked parent vector
    {
        parents1.EWiseApply(stars, [nv](IT p, short isStar) { return isStar == STAR ? nv : p; });

        t1 = MPI_Wtime();
        minNeighborParent = SpMV<Select2ndMinSR<NT, IT>>(A, parents1);
        tspmv = MPI_Wtime() - t1;

        hooks = minNeighborParent.Find([nv](IT mnf) { return mnf != nv; });
        hooks = EWiseApply<IT>(hooks, stars,
            [](IT mnp, short isStar) { return mnp; },
            [](IT mnp, short isStar) { return isStar == STAR; },
            false, static_cast<IT>(0));
    }

        [](IT mnp, IT p) { return true; },
        false, static_cast<IT>(0));
    parents.Set(finalHooks);

    double tall = MPI_Wtime() - ts;
    std::ostringstream outs;
    outs << " Unconditional Hooking Time " << spmv << " : " << tspmv << " Other: " << tall - tspmv;
    return finalHooks;
}
// Shortcutting (pointer jumping): every vertex's parent is replaced by its
// grandparent, halving the depth of every tree.
template <typename IT>
void Shortcut(FullyDistVec<IT, IT>& parent)
{
    FullyDistVec<IT, IT> grandparent = parent(parent);
    parent = grandparent;
}
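// Illustrative only: the same pointer-jumping step on a plain array; applying it
// repeatedly flattens every tree into a star. shortcut_demo is a hypothetical name.
#include <vector>

static void shortcut_demo(std::vector<int>& parent)
{
    std::vector<int> grandparent(parent.size());
    for (size_t v = 0; v < parent.size(); ++v)
        grandparent[v] = parent[parent[v]];      // one jump per call, as in Shortcut()
    parent.swap(grandparent);
}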
// Shortcut restricted to the non-star vertices: gather their grandparents and
// write them back as the new parents.
template <typename IT>

        [](short isStar, IT p) { return p; },
        [](short isStar, IT p) { return true; },
        false, static_cast<short>(0));

    parents.Set(grandParentsOfNonStars);
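// Illustrative only: what the hooking routines above compute, written serially on
// an edge list. Each vertex finds the minimum parent among its neighbors (the SpMV
// over the min/select2nd semiring) and, if it beats the vertex's own parent, the
// root of that vertex is re-parented. conditional_hook_demo is a hypothetical name.
#include <vector>
#include <utility>
#include <algorithm>

static void conditional_hook_demo(const std::vector<std::pair<int,int>>& edges,
                                  std::vector<int>& parent,
                                  const std::vector<bool>& star)
{
    const int n = (int)parent.size();
    std::vector<int> mnp(n, n);                     // n acts as the "no neighbor" sentinel
    for (auto [u, v] : edges) {
        mnp[u] = std::min(mnp[u], parent[v]);
        mnp[v] = std::min(mnp[v], parent[u]);
    }
    for (int u = 0; u < n; ++u)
        if (star[u] && mnp[u] < parent[u])          // hook only star vertices, only downward
            parent[parent[u]] = mnp[u];             // the star's root adopts the smaller parent
}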
// Verification helper: true iff every vertex's minimum neighbor label equals its own.
template <typename IT, typename NT, typename DER>
bool neigborsInSameCC(const SpParMat<IT, NT, DER>& A, FullyDistVec<IT, IT>& cclabel)
{
    FullyDistVec<IT, IT> minNeighborCCLabel = SpMV<Select2ndMinSR<NT, IT>>(A, cclabel);
    return minNeighborCCLabel == cclabel;
}
// Debug check: every stored nonzero A(i,j) must connect two vertices with the same
// component label; violations are printed with their parents and labels.
template <typename IT, typename NT, typename DER>
void Correctness(const SpParMat<IT, NT, DER>& A, FullyDistVec<IT, IT>& cclabel, IT nCC,
                 FullyDistVec<IT, IT> parent)
{
    DER* spSeq = A.seqptr();   // local sequential matrix
    for (auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT j = colit.colid();
        for (auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
        {
            IT i = nzit.rowid();
            if (cclabel[i] != cclabel[j])
            {
                std::cout << i << " (" << parent[i] << ", " << cclabel[i] << ") & "
                          << j << " (" << parent[j] << ", " << cclabel[j] << ")\n";
            }
        }
    }
}
// Turn the parent array into component labels: roots (value equal to their own
// index) are marked first, then every vertex takes the label of its parent.
template <typename IT>
IT LabelCC(FullyDistVec<IT, IT>& parent, FullyDistVec<IT, IT>& cclabel)
{
    cclabel = parent;
    cclabel.ApplyInd([](IT val, IT ind) { return val == ind ? -1 : val; });

    cclabel.Set(labelOfParents);
}
// Main driver: starting from parent(v) = v, repeat conditional hooking,
// unconditional hooking, shortcutting and star checks until every vertex has
// converged, then derive the component labels.
template <typename IT, typename NT, typename DER>
FullyDistVec<IT, IT> CC(SpParMat<IT, NT, DER>& A, IT& nCC)
{
    IT nrows = A.getnrow();
    FullyDistVec<IT, IT> parent(A.getcommgrid());
    parent.iota(nrows, 0);                        // every vertex starts as its own parent
    std::ostringstream outs;

    // isolated vertices (degree 0) are converged from the start
    stars.EWiseApply(degree, [](short isStar, double degree) { return degree == 0.0 ? CONVERGED : isStar; });

    nthreads = omp_get_num_threads();

    double t1 = MPI_Wtime();
    double t_cond_hook = MPI_Wtime() - t1;        // conditional hooking

    stars.Set(pNonStar);
    double t_starcheck1 = MPI_Wtime() - t1;

    double t_uncond_hook = MPI_Wtime() - t1;      // unconditional hooking

    // vertices touched by unconditional hooks are no longer stars
    stars.EWiseApply(uncondHooks, [](short isStar, IT x) { return static_cast<short>(NONSTAR); },

    IT nconverged = stars.Reduce(std::plus<IT>(), static_cast<IT>(0),
                                 [](short isStar) { return static_cast<IT>(isStar == CONVERGED); });
    if (nconverged == nrows)
    {
        outs << "Iteration: " << iteration << " converged: " << nrows << " stars: 0" << " nonstars: 0";
    }
    double t_starcheck2 = MPI_Wtime() - t1;

    double t_shortcut = MPI_Wtime() - t1;         // shortcutting (pointer jumping)
    double t_starcheck = MPI_Wtime() - t1;

    IT nonstars = stars.Reduce(std::plus<IT>(), static_cast<IT>(0),
                               [](short isStar) { return static_cast<IT>(isStar == NONSTAR); });
    IT nstars = nrows - (nonstars + nconverged);

    double t2 = MPI_Wtime();
    outs << "Iteration: " << iteration << " converged: " << nconverged
         << " stars: " << nstars << " nonstars: " << nonstars;
}
// Print the members of every component, one component label at a time.
template <typename IT>
void PrintCC(FullyDistVec<IT, IT> CC, IT nCC)
{
    for (IT i = 0; i < nCC; i++)
}
// Report the sizes of the first four components.
template <typename IT>
void First4Clust(FullyDistVec<IT, IT>& cc)
{
    std::ostringstream outs;
    outs << "Size of the first component: "  << cc1.getnnz() << std::endl;
    outs << "Size of the second component: " << cc2.getnnz() << std::endl;
    outs << "Size of the third component: "  << cc3.getnnz() << std::endl;
    outs << "Size of the fourth component: " << cc4.getnnz() << std::endl;
}
// Histogram of component sizes: each process bins its local component sizes, the
// bins are summed at rank 0, and the result is appended to hist.txt.
template <typename IT>
void HistCC(FullyDistVec<IT, IT> CC, IT nCC)
{
    for (IT i = 0; i < nCC; i++)

    std::vector<IT> localHist(numBins, 0);
        IT bin = (locCCSizes[i] * (numBins - 1)) / largestCCSise;

    std::vector<IT> globalHist(numBins, 0);
    MPI_Comm world = CC.getcommgrid()->GetWorld();
    MPI_Reduce(localHist.data(), globalHist.data(), numBins, MPIType<IT>(), MPI_SUM, 0, world);

    int myrank;
    MPI_Comm_rank(world, &myrank);
    if (myrank == 0)
    {
        std::cout << "The largest component size: " << largestCCSise << std::endl;
        std::ofstream output;
        output.open("hist.txt", std::ios_base::app);
        std::copy(globalHist.begin(), globalHist.end(), std::ostream_iterator<IT>(output, " "));
    }
}