41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
52 #include "Tpetra_Details_radixSort.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
81 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
82 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
87 namespace MatrixMatrix{
95 template <
class Scalar,
105 bool call_FillComplete_on_result,
106 const std::string& label,
107 const Teuchos::RCP<Teuchos::ParameterList>& params)
113 typedef LocalOrdinal LO;
114 typedef GlobalOrdinal GO;
122 #ifdef HAVE_TPETRA_MMM_TIMINGS
123 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
124 using Teuchos::TimeMonitor;
127 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
130 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
135 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
136 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
141 RCP<const crs_matrix_type> Aprime =
null;
145 RCP<const crs_matrix_type> Bprime =
null;
155 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
157 bool use_optimized_ATB =
false;
158 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
159 use_optimized_ATB =
true;
161 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
162 use_optimized_ATB =
false;
165 using Teuchos::ParameterList;
166 RCP<ParameterList> transposeParams (
new ParameterList);
167 transposeParams->set (
"sort",
false);
169 if (!use_optimized_ATB && transposeA) {
170 transposer_type transposer (rcpFromRef (A));
171 Aprime = transposer.createTranspose (transposeParams);
174 Aprime = rcpFromRef(A);
178 transposer_type transposer (rcpFromRef (B));
179 Bprime = transposer.createTranspose (transposeParams);
182 Bprime = rcpFromRef(B);
192 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
193 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
194 "must match for matrix-matrix product. op(A) is "
195 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
201 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
202 prefix <<
"ERROR, dimensions of result C must "
204 <<
" rows, should have at least " << Aouter << std::endl);
216 int numProcs = A.
getComm()->getSize();
220 crs_matrix_struct_type Aview;
221 crs_matrix_struct_type Bview;
223 RCP<const map_type> targetMap_A = Aprime->getRowMap();
224 RCP<const map_type> targetMap_B = Bprime->getRowMap();
226 #ifdef HAVE_TPETRA_MMM_TIMINGS
228 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
234 if (!use_optimized_ATB) {
235 RCP<const import_type> dummyImporter;
236 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
242 targetMap_B = Aprime->getColMap();
245 if (!use_optimized_ATB)
246 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label, params);
248 #ifdef HAVE_TPETRA_MMM_TIMINGS
251 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
255 if (use_optimized_ATB) {
256 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258 }
else if (call_FillComplete_on_result && newFlag) {
259 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261 }
else if (call_FillComplete_on_result) {
262 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
268 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
273 #ifdef HAVE_TPETRA_MMM_TIMINGS
274 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
283 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
288 template <
class Scalar,
297 bool call_FillComplete_on_result,
298 const std::string& label,
299 const Teuchos::RCP<Teuchos::ParameterList>& params)
303 typedef LocalOrdinal LO;
304 typedef GlobalOrdinal GO;
311 #ifdef HAVE_TPETRA_MMM_TIMINGS
312 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
313 using Teuchos::TimeMonitor;
314 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
317 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
322 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
323 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
325 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
326 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
335 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
336 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
337 "must match for matrix-matrix product. op(A) is "
338 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
344 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
345 prefix <<
"ERROR, dimensions of result C must "
347 <<
" rows, should have at least "<< Aouter << std::endl);
359 int numProcs = A.
getComm()->getSize();
363 crs_matrix_struct_type Aview;
364 crs_matrix_struct_type Bview;
366 RCP<const map_type> targetMap_A = Aprime->getRowMap();
367 RCP<const map_type> targetMap_B = Bprime->getRowMap();
369 #ifdef HAVE_TPETRA_MMM_TIMINGS
370 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
376 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
377 if(!params.is_null()) {
378 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
379 int mm_optimization_core_count=0;
380 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
382 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
383 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
384 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
385 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
386 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
387 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
388 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
389 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
393 RCP<const import_type> dummyImporter;
394 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
false, label,importParams1);
399 targetMap_B = Aprime->getColMap();
404 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
405 if(!params.is_null()) {
406 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
408 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
410 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
411 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
412 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
413 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
414 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
415 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
416 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
417 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
420 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label,importParams2);
422 #ifdef HAVE_TPETRA_MMM_TIMINGS
424 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
428 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
431 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
433 if (call_FillComplete_on_result && newFlag) {
434 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436 }
else if (call_FillComplete_on_result) {
437 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
440 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
463 template <
class Scalar,
474 using Teuchos::Array;
478 typedef LocalOrdinal LO;
479 typedef GlobalOrdinal GO;
484 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
486 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
487 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
488 "(Result matrix B is not required to be isFillComplete()).");
489 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
490 prefix <<
"ERROR, input matrix B must not be fill complete!");
491 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
492 prefix <<
"ERROR, input matrix B must not have static graph!");
494 prefix <<
"ERROR, input matrix B must not be locally indexed!");
496 using Teuchos::ParameterList;
497 RCP<ParameterList> transposeParams (
new ParameterList);
498 transposeParams->set (
"sort",
false);
500 RCP<const crs_matrix_type> Aprime =
null;
502 transposer_type transposer (rcpFromRef (A));
503 Aprime = transposer.createTranspose (transposeParams);
506 Aprime = rcpFromRef(A);
514 if (scalarB != Teuchos::ScalarTraits<SC>::one())
519 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
520 for (LO i = 0; (size_t)i < numMyRows; ++i) {
521 row = B.
getRowMap()->getGlobalElement(i);
522 Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
524 if (scalarA != Teuchos::ScalarTraits<SC>::one())
525 for (
size_t j = 0; j < a_numEntries; ++j)
526 a_vals[j] *= scalarA;
536 namespace ColMapFunctors
538 template<
typename ByteView,
typename GView>
541 UnionEntries(ByteView entryUnion_,
const GView gids_) : entryUnion(entryUnion_), gids(gids_) {}
542 KOKKOS_INLINE_FUNCTION
void operator()(
const size_t i)
const
544 entryUnion(gids(i)) = 1;
550 template<
typename LView,
typename GView>
551 struct ConvertGlobalToLocal
553 ConvertGlobalToLocal(
const LView gtol_,
const GView gids_, LView lids_) : gtol(gtol_), gids(gids_), lids(lids_) {}
554 KOKKOS_INLINE_FUNCTION
void operator()(
const size_t i)
const
556 lids(i) = gtol(gids(i));
566 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
567 Teuchos::RCP<Map<LocalOrdinal, GlobalOrdinal, Node> > AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
568 makeColMapAndConvertGids(GlobalOrdinal ncols,
569 const typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::global_col_inds_array& gids,
570 typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::col_inds_array& lids,
571 const Teuchos::RCP<
const Teuchos::Comm<int>>& comm)
573 using namespace ColMapFunctors;
576 typedef Kokkos::View<char*, device_type> ByteView;
577 typedef global_col_inds_array GView;
578 typedef col_inds_array LView;
580 auto nentries = gids.extent(0);
582 ByteView entryUnion(
"entry union", ncols);
583 UnionEntries<ByteView, GView> ue(entryUnion, gids);
584 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_unionEntries", range_type(0, nentries), ue);
586 LView gtol(
"global col -> local col", ncols + 1);
587 ::Tpetra::Details::computeOffsetsFromCounts<decltype(gtol), decltype(entryUnion)>(gtol, entryUnion);
589 ConvertGlobalToLocal<LView, GView> cgtl(gtol, gids, lids);
590 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertGlobalToLocal", range_type(0, gids.extent(0)), cgtl);
592 execution_space().fence();
593 GView colmap(
"column map", gtol(ncols));
594 size_t localIter = 0;
595 execution_space().fence();
596 for(
size_t i = 0; i < entryUnion.extent(0); i++)
598 if(entryUnion(i) != 0)
600 colmap(localIter++) = i;
603 execution_space().fence();
605 return rcp(
new map_type(Teuchos::OrdinalTraits<GlobalOrdinal>::invalid(), colmap, 0, comm));
608 template <
class Scalar,
612 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
614 const bool transposeA,
617 const bool transposeB,
621 const Teuchos::RCP<Teuchos::ParameterList>& params)
627 Teuchos::RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(CrowMap, 0));
629 add(alpha,transposeA,A,beta,transposeB,B,*C,domainMap,rangeMap,params);
633 template <
class Scalar,
639 const bool transposeA,
642 const bool transposeB,
647 const Teuchos::RCP<Teuchos::ParameterList>& params)
651 using Teuchos::rcpFromRef;
652 using Teuchos::rcp_implicit_cast;
653 using Teuchos::rcp_dynamic_cast;
654 using Teuchos::TimeMonitor;
656 typedef LocalOrdinal LO;
657 typedef GlobalOrdinal GO;
664 typedef AddDetails::AddKernels<SC,LO,GO,NO> AddKern;
665 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
666 constexpr
bool debug =
false;
668 #ifdef HAVE_TPETRA_MMM_TIMINGS
669 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
673 std::ostringstream os;
674 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
675 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
676 std::cerr << os.str ();
679 TEUCHOS_TEST_FOR_EXCEPTION
681 prefix_mmm <<
"A and B must both be fill complete.");
682 #ifdef HAVE_TPETRA_DEBUG
685 const bool domainMapsSame =
686 (! transposeA && ! transposeB &&
688 (! transposeA && transposeB &&
690 ( transposeA && ! transposeB &&
692 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
693 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
695 const bool rangeMapsSame =
696 (! transposeA && ! transposeB &&
698 (! transposeA && transposeB &&
700 ( transposeA && ! transposeB &&
702 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
703 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
705 #endif // HAVE_TPETRA_DEBUG
707 using Teuchos::ParameterList;
708 RCP<ParameterList> transposeParams (
new ParameterList);
709 transposeParams->set (
"sort",
false);
713 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
715 transposer_type transposer (Aprime);
716 Aprime = transposer.createTranspose (transposeParams);
719 #ifdef HAVE_TPETRA_DEBUG
720 TEUCHOS_TEST_FOR_EXCEPTION
721 (Aprime.is_null (), std::logic_error,
722 prefix_mmm <<
"Failed to compute Op(A). "
723 "Please report this bug to the Tpetra developers.");
724 #endif // HAVE_TPETRA_DEBUG
727 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
730 std::ostringstream os;
731 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
732 <<
"Form explicit xpose of B" << std::endl;
733 std::cerr << os.str ();
735 transposer_type transposer (Bprime);
736 Bprime = transposer.createTranspose (transposeParams);
738 #ifdef HAVE_TPETRA_DEBUG
739 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
740 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
741 TEUCHOS_TEST_FOR_EXCEPTION(
742 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
743 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
744 "Please report this bug to the Tpetra developers.");
745 #endif // HAVE_TPETRA_DEBUG
746 RCP<const map_type> CDomainMap = domainMap;
747 RCP<const map_type> CRangeMap = rangeMap;
748 if(CDomainMap.is_null())
750 CDomainMap = Bprime->getDomainMap();
752 if(CRangeMap.is_null())
754 CRangeMap = Bprime->getRangeMap();
756 assert(!(CDomainMap.is_null()));
757 assert(!(CRangeMap.is_null()));
758 typedef typename AddKern::values_array values_array;
759 typedef typename AddKern::row_ptrs_array row_ptrs_array;
760 typedef typename AddKern::col_inds_array col_inds_array;
761 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
762 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
764 row_ptrs_array rowptrs;
765 col_inds_array colinds;
766 auto acolmap = Aprime->getColMap()->getMyGlobalIndices();
767 auto bcolmap = Bprime->getColMap()->getMyGlobalIndices();
768 #ifdef HAVE_TPETRA_MMM_TIMINGS
770 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
772 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
775 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
781 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
783 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
784 bool sorted = AGraphSorted && BGraphSorted;
785 RCP<const map_type> CrowMap;
786 RCP<const map_type> CcolMap;
787 RCP<const import_type> Cimport = Teuchos::null;
788 RCP<export_type> Cexport = Teuchos::null;
790 if(!matchingColMaps && !(CDomainMap->isContiguous()))
793 #ifdef HAVE_TPETRA_MMM_TIMINGS
795 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"fallback to CrsMatrix::add"))));
798 std::ostringstream os;
799 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
800 <<
"Call Bprime->add(...)" << std::endl;
801 std::cerr << os.str ();
803 Teuchos::RCP<crs_matrix_type> C_ = Teuchos::rcp_static_cast<crs_matrix_type>(Bprime->add(alpha, *Aprime, beta, CDomainMap, CRangeMap, params));
805 C.
setAllValues(C_->getLocalMatrix().graph.row_map,C_->getLocalMatrix().graph.entries,C_->getLocalMatrix().values);
806 C.
expertStaticFillComplete(CDomainMap, CRangeMap, C_->getGraph()->getImporter(), C_->getGraph()->getExporter(), params);
809 else if(!matchingColMaps)
811 #ifdef HAVE_TPETRA_MMM_TIMINGS
813 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
816 auto Acolmap = Aprime->getColMap()->getMyGlobalIndices();
817 auto Bcolmap = Bprime->getColMap()->getMyGlobalIndices();
818 typename AddKern::global_col_inds_array globalColinds(
"", 0);
820 std::ostringstream os;
821 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
822 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
823 std::cerr << os.str ();
825 AddKern::convertToGlobalAndAdd(
826 Aprime->getLocalMatrix(), alpha, Bprime->getLocalMatrix(), beta, Acolmap, Bcolmap,
827 CRangeMap->getMinGlobalIndex(), Aprime->getGlobalNumCols(), vals, rowptrs, globalColinds);
828 colinds = col_inds_array(
"C colinds", globalColinds.extent(0));
830 std::ostringstream os;
831 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
832 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
833 std::cerr << os.str ();
835 #ifdef HAVE_TPETRA_MMM_TIMINGS
837 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"building optimized column map"))));
839 CrowMap = Bprime->getRowMap();
842 CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
847 auto localA = Aprime->getLocalMatrix();
848 auto localB = Bprime->getLocalMatrix();
849 auto Avals = localA.values;
850 auto Bvals = localB.values;
851 auto Arowptrs = localA.graph.row_map;
852 auto Browptrs = localB.graph.row_map;
853 auto Acolinds = localA.graph.entries;
854 auto Bcolinds = localB.graph.entries;
858 #ifdef HAVE_TPETRA_MMM_TIMINGS
860 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
863 std::ostringstream os;
864 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
865 <<
"Call AddKern::addSorted(...)" << std::endl;
866 std::cerr << os.str ();
868 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
873 #ifdef HAVE_TPETRA_MMM_TIMINGS
875 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
878 std::ostringstream os;
879 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
880 <<
"Call AddKern::addUnsorted(...)" << std::endl;
881 std::cerr << os.str ();
883 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
885 CrowMap = Bprime->getRowMap();
886 CcolMap = Bprime->getColMap();
889 #ifdef HAVE_TPETRA_MMM_TIMINGS
891 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs constructor"))));
896 #ifdef HAVE_TPETRA_MMM_TIMINGS
898 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
900 if(!CDomainMap->isSameAs(*CcolMap))
903 std::ostringstream os;
904 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
905 <<
"Create Cimport" << std::endl;
906 std::cerr << os.str ();
908 Cimport = rcp(
new import_type(CDomainMap, CcolMap));
910 if(!CrowMap->isSameAs(*CRangeMap))
913 std::ostringstream os;
914 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
915 <<
"Create Cexport" << std::endl;
916 std::cerr << os.str ();
918 Cexport = rcp(
new export_type(CrowMap, CRangeMap));
922 std::ostringstream os;
923 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
924 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
925 std::cerr << os.str ();
931 template <
class Scalar,
944 using Teuchos::Array;
945 using Teuchos::ArrayRCP;
946 using Teuchos::ArrayView;
949 using Teuchos::rcp_dynamic_cast;
950 using Teuchos::rcpFromRef;
951 using Teuchos::tuple;
954 typedef Teuchos::ScalarTraits<Scalar> STS;
962 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
964 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
965 prefix <<
"The case C == null does not actually work. Fixing this will require an interface change.");
967 TEUCHOS_TEST_FOR_EXCEPTION(
969 prefix <<
"Both input matrices must be fill complete before calling this function.");
971 #ifdef HAVE_TPETRA_DEBUG
973 const bool domainMapsSame =
977 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
978 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
980 const bool rangeMapsSame =
984 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
985 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
987 #endif // HAVE_TPETRA_DEBUG
989 using Teuchos::ParameterList;
990 RCP<ParameterList> transposeParams (
new ParameterList);
991 transposeParams->set (
"sort",
false);
994 RCP<const crs_matrix_type> Aprime;
996 transposer_type theTransposer (rcpFromRef (A));
997 Aprime = theTransposer.createTranspose (transposeParams);
1000 Aprime = rcpFromRef (A);
1003 #ifdef HAVE_TPETRA_DEBUG
1004 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1005 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1006 #endif // HAVE_TPETRA_DEBUG
1009 RCP<const crs_matrix_type> Bprime;
1011 transposer_type theTransposer (rcpFromRef (B));
1012 Bprime = theTransposer.createTranspose (transposeParams);
1015 Bprime = rcpFromRef (B);
1018 #ifdef HAVE_TPETRA_DEBUG
1019 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1020 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1021 #endif // HAVE_TPETRA_DEBUG
1024 if (! C.is_null ()) {
1025 C->setAllToScalar (STS::zero ());
1033 if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
1034 RCP<const map_type> rowMap = Aprime->getRowMap ();
1036 RCP<const crs_graph_type> A_graph =
1037 rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
1038 #ifdef HAVE_TPETRA_DEBUG
1039 TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
1040 "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
1041 "Please report this bug to the Tpetra developers.");
1043 RCP<const crs_graph_type> B_graph =
1044 rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
1045 #ifdef HAVE_TPETRA_DEBUG
1046 TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
1047 "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
1048 "Please report this bug to the Tpetra developers.");
1050 RCP<const map_type> A_colMap = A_graph->getColMap ();
1051 #ifdef HAVE_TPETRA_DEBUG
1052 TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
1053 "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
1054 "Please report this bug to the Tpetra developers.");
1056 RCP<const map_type> B_colMap = B_graph->getColMap ();
1057 #ifdef HAVE_TPETRA_DEBUG
1058 TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
1059 "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
1060 "Please report this bug to the Tpetra developers.");
1061 TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
1063 "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
1064 "Please report this bug to the Tpetra developers.");
1065 TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
1067 "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
1068 "Please report this bug to the Tpetra developers.");
1072 RCP<const import_type> sumImport =
1073 A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
1074 RCP<const map_type> C_colMap = sumImport->getTargetMap ();
1082 ArrayView<const LocalOrdinal> A_local_ind;
1083 Array<GlobalOrdinal> A_global_ind;
1084 ArrayView<const LocalOrdinal> B_local_ind;
1085 Array<GlobalOrdinal> B_global_ind;
1087 const size_t localNumRows = rowMap->getNodeNumElements ();
1088 ArrayRCP<size_t> numEntriesPerRow (localNumRows);
1091 size_t maxNumEntriesPerRow = 0;
1092 for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
1094 A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
1095 const size_type A_numEnt = A_local_ind.size ();
1096 if (A_numEnt > A_global_ind.size ()) {
1097 A_global_ind.resize (A_numEnt);
1100 for (size_type k = 0; k < A_numEnt; ++k) {
1101 A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1105 B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
1106 const size_type B_numEnt = B_local_ind.size ();
1107 if (B_numEnt > B_global_ind.size ()) {
1108 B_global_ind.resize (B_numEnt);
1111 for (size_type k = 0; k < B_numEnt; ++k) {
1112 B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1116 const size_t curNumEntriesPerRow =
1117 keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
1118 B_global_ind.begin (), B_global_ind.end ());
1119 numEntriesPerRow[localRow] = curNumEntriesPerRow;
1120 maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
1127 C = rcp (
new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
1132 ArrayView<const Scalar> A_val;
1133 ArrayView<const Scalar> B_val;
1135 Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
1136 Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
1137 Array<Scalar> AplusB_val (maxNumEntriesPerRow);
1139 for (
size_t localRow = 0; localRow < localNumRows; ++localRow) {
1141 Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
1143 for (size_type k = 0; k < A_local_ind.size (); ++k) {
1144 A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1148 Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
1150 for (size_type k = 0; k < B_local_ind.size (); ++k) {
1151 B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1154 const size_t curNumEntries = numEntriesPerRow[localRow];
1155 ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
1156 ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
1157 ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
1161 A_val.begin (), A_val.end (),
1162 B_global_ind.begin (), B_global_ind.end (),
1163 B_val.begin (), B_val.end (),
1164 C_global_ind.begin (), C_val.begin (),
1165 std::plus<Scalar> ());
1167 for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
1168 C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
1171 C->replaceLocalValues (localRow, C_local_ind, C_val);
1176 RCP<const map_type> domainMap = A_graph->getDomainMap ();
1177 RCP<const map_type> rangeMap = A_graph->getRangeMap ();
1178 C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
1186 C = rcp (
new crs_matrix_type (Aprime->getRowMap (),
null));
1193 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), 0));
1197 #ifdef HAVE_TPETRA_DEBUG
1198 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1199 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1200 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1201 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1202 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1203 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1204 #endif // HAVE_TPETRA_DEBUG
1206 Array<RCP<const crs_matrix_type> > Mat =
1207 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1208 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1211 for (
int k = 0; k < 2; ++k) {
1212 Array<GlobalOrdinal> Indices;
1213 Array<Scalar> Values;
1221 #ifdef HAVE_TPETRA_DEBUG
1222 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1223 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1224 #endif // HAVE_TPETRA_DEBUG
1225 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1226 #ifdef HAVE_TPETRA_DEBUG
1227 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1228 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1229 #endif // HAVE_TPETRA_DEBUG
1231 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1232 for (
size_t i = 0; i < localNumRows; ++i) {
1233 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1234 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1235 if (numEntries > 0) {
1236 Indices.resize (numEntries);
1237 Values.resize (numEntries);
1238 Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1240 if (scalar[k] != STS::one ()) {
1241 for (
size_t j = 0; j < numEntries; ++j) {
1242 Values[j] *= scalar[k];
1246 if (C->isFillComplete ()) {
1247 C->sumIntoGlobalValues (globalRow, Indices, Values);
1249 C->insertGlobalValues (globalRow, Indices, Values);
1256 template<
typename SC,
typename LO,
typename GO,
typename NO>
1257 void AddDetails::AddKernels<SC, LO, GO, NO>::
1259 const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1260 const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1261 const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1262 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1263 const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1264 const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1265 const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1266 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1267 typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1268 typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1269 typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1271 using Teuchos::TimeMonitor;
1272 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
1273 auto nrows = Arowptrs.extent(0) - 1;
1274 Crowptrs = row_ptrs_array(
"C row ptrs", nrows + 1);
1275 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, impl_scalar_type,
1276 execution_space, memory_space, memory_space> KKH;
1278 handle.create_spadd_handle(
true);
1279 auto addHandle = handle.get_spadd_handle();
1280 #ifdef HAVE_TPETRA_MMM_TIMINGS
1281 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
1283 KokkosSparse::Experimental::spadd_symbolic
1285 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
1286 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
1287 row_ptrs_array, col_inds_array>
1288 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1289 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
1290 Ccolinds = col_inds_array(
"C colinds", addHandle->get_max_result_nnz());
1291 #ifdef HAVE_TPETRA_MMM_TIMINGS
1293 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
1295 KokkosSparse::Experimental::spadd_numeric(&handle,
1296 Arowptrs, Acolinds, Avals, scalarA,
1297 Browptrs, Bcolinds, Bvals, scalarB,
1298 Crowptrs, Ccolinds, Cvals);
1301 template<
typename SC,
typename LO,
typename GO,
typename NO>
1302 void AddDetails::AddKernels<SC, LO, GO, NO>::
1304 const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1305 const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1306 const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1307 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1308 const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1309 const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1310 const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1311 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1313 typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1314 typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1315 typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1317 using Teuchos::TimeMonitor;
1318 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
1319 auto nrows = Arowptrs.extent(0) - 1;
1320 Crowptrs = row_ptrs_array(
"C row ptrs", nrows + 1);
1321 typedef AddDetails::AddKernels<SC, LO, GO, NO> AddKern;
1322 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
1323 AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
1325 handle.create_spadd_handle(
false);
1326 auto addHandle = handle.get_spadd_handle();
1327 #ifdef HAVE_TPETRA_MMM_TIMINGS
1328 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
1330 KokkosSparse::Experimental::spadd_symbolic
1332 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
1333 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
1334 row_ptrs_array, col_inds_array>
1335 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1336 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
1337 Ccolinds = col_inds_array(
"C colinds", addHandle->get_max_result_nnz());
1338 #ifdef HAVE_TPETRA_MMM_TIMINGS
1340 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
1342 KokkosSparse::Experimental::spadd_numeric(&handle,
1343 Arowptrs, Acolinds, Avals, scalarA,
1344 Browptrs, Bcolinds, Bvals, scalarB,
1345 Crowptrs, Ccolinds, Cvals);
1348 template<
typename GO,
1349 typename LocalIndicesType,
1350 typename GlobalIndicesType,
1351 typename ColMapType>
1352 struct ConvertColIndsFunctor
1354 ConvertColIndsFunctor (
const GO minGlobal_,
1355 const LocalIndicesType& colindsOrig_,
1356 const GlobalIndicesType& colindsConverted_,
1357 const ColMapType& colmap_) :
1358 minGlobal (minGlobal_),
1359 colindsOrig (colindsOrig_),
1360 colindsConverted (colindsConverted_),
1363 KOKKOS_INLINE_FUNCTION
void
1364 operator() (
const size_t& i)
const
1366 colindsConverted[i] = colmap[colindsOrig[i]];
1369 LocalIndicesType colindsOrig;
1370 GlobalIndicesType colindsConverted;
1374 template<
typename SC,
typename LO,
typename GO,
typename NO>
1375 void AddDetails::AddKernels<SC, LO, GO, NO>::
1376 convertToGlobalAndAdd(
1377 const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
1378 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1379 const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
1380 const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1381 const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
1382 const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
1385 typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1386 typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1387 typename AddDetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
1389 using Teuchos::TimeMonitor;
1391 const values_array& Avals = A.values;
1392 const values_array& Bvals = B.values;
1393 const col_inds_array& Acolinds = A.graph.entries;
1394 const col_inds_array& Bcolinds = B.graph.entries;
1395 auto Arowptrs = A.graph.row_map;
1396 auto Browptrs = B.graph.row_map;
1397 global_col_inds_array AcolindsConverted(
"A colinds (converted)", Acolinds.extent(0));
1398 global_col_inds_array BcolindsConverted(
"B colinds (converted)", Bcolinds.extent(0));
1399 #ifdef HAVE_TPETRA_MMM_TIMINGS
1400 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
1402 ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(minGlobalCol, Acolinds, AcolindsConverted, AcolMap);
1403 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
1404 ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(minGlobalCol, Bcolinds, BcolindsConverted, BcolMap);
1405 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
1406 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, GO, impl_scalar_type,
1407 execution_space, memory_space, memory_space> KKH;
1409 handle.create_spadd_handle(
false);
1410 auto addHandle = handle.get_spadd_handle();
1411 #ifdef HAVE_TPETRA_MMM_TIMINGS
1413 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
1415 auto nrows = Arowptrs.extent(0) - 1;
1416 Crowptrs = row_ptrs_array(
"C row ptrs", nrows + 1);
1417 KokkosSparse::Experimental::spadd_symbolic
1418 <KKH,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
1419 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
1420 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
1421 Ccolinds = global_col_inds_array(
"C colinds", addHandle->get_max_result_nnz());
1422 #ifdef HAVE_TPETRA_MMM_TIMINGS
1424 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
1426 KokkosSparse::Experimental::spadd_numeric(&handle,
1427 Arowptrs, AcolindsConverted, Avals, scalarA,
1428 Browptrs, BcolindsConverted, Bvals, scalarB,
1429 Crowptrs, Ccolinds, Cvals);
1434 namespace MMdetails{
1438 template <
class TransferType>
1439 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer,
const std::string &label) {
1440 if (Transfer.is_null())
1443 const Distributor & Distor = Transfer->getDistributor();
1444 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1446 size_t rows_send = Transfer->getNumExportIDs();
1447 size_t rows_recv = Transfer->getNumRemoteIDs();
1449 size_t round1_send = Transfer->getNumExportIDs() *
sizeof(size_t);
1450 size_t round1_recv = Transfer->getNumRemoteIDs() *
sizeof(size_t);
1451 size_t num_send_neighbors = Distor.getNumSends();
1452 size_t num_recv_neighbors = Distor.getNumReceives();
1453 size_t round2_send, round2_recv;
1454 Distor.getLastDoStatistics(round2_send,round2_recv);
1456 int myPID = Comm->getRank();
1457 int NumProcs = Comm->getSize();
1464 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1465 size_t gstats_min[8], gstats_max[8];
1467 double lstats_avg[8], gstats_avg[8];
1468 for(
int i=0; i<8; i++)
1469 lstats_avg[i] = ((
double)lstats[i])/NumProcs;
1471 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1472 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1473 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1476 printf(
"%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1477 (
int)gstats_min[0],gstats_avg[0],(
int)gstats_max[0], (
int)gstats_min[2],gstats_avg[2],(
int)gstats_max[2],
1478 (
int)gstats_min[4],gstats_avg[4],(
int)gstats_max[4], (
int)gstats_min[6],gstats_avg[6],(
int)gstats_max[6]);
1479 printf(
"%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1480 (
int)gstats_min[1],gstats_avg[1],(
int)gstats_max[1], (
int)gstats_min[3],gstats_avg[3],(
int)gstats_max[3],
1481 (
int)gstats_min[5],gstats_avg[5],(
int)gstats_max[5], (
int)gstats_min[7],gstats_avg[7],(
int)gstats_max[7]);
1486 template<
class Scalar,
1488 class GlobalOrdinal,
1490 void mult_AT_B_newmatrix(
1491 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1492 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1493 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1494 const std::string & label,
1495 const Teuchos::RCP<Teuchos::ParameterList>& params)
1500 typedef LocalOrdinal LO;
1501 typedef GlobalOrdinal GO;
1503 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1504 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1506 #ifdef HAVE_TPETRA_MMM_TIMINGS
1507 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1508 using Teuchos::TimeMonitor;
1509 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1510 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1516 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1518 using Teuchos::ParameterList;
1519 RCP<ParameterList> transposeParams (
new ParameterList);
1520 transposeParams->set (
"sort",
false);
1521 if(! params.is_null ()) {
1522 transposeParams->set (
"compute global constants",
1523 params->get (
"compute global constants: temporaries",
1526 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1527 transposer.createTransposeLocal (transposeParams);
1532 #ifdef HAVE_TPETRA_MMM_TIMINGS
1534 MM = rcp (
new TimeMonitor
1535 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1539 crs_matrix_struct_type Aview;
1540 crs_matrix_struct_type Bview;
1541 RCP<const Import<LO, GO, NO> > dummyImporter;
1544 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1545 if (! params.is_null ()) {
1546 importParams1->set (
"compute global constants",
1547 params->get (
"compute global constants: temporaries",
1549 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1550 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1551 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1552 int mm_optimization_core_count =
1554 mm_optimization_core_count =
1555 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1556 int mm_optimization_core_count2 =
1557 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1558 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1559 mm_optimization_core_count = mm_optimization_core_count2;
1561 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1562 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1563 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1564 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1567 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1568 Aview, dummyImporter,
true,
1569 label, importParams1);
1571 RCP<ParameterList> importParams2 (
new ParameterList);
1572 if (! params.is_null ()) {
1573 importParams2->set (
"compute global constants",
1574 params->get (
"compute global constants: temporaries",
1576 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1577 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1578 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1579 int mm_optimization_core_count =
1581 mm_optimization_core_count =
1582 slist.get (
"MM_TAFC_OptimizationCoreCount",
1583 mm_optimization_core_count);
1584 int mm_optimization_core_count2 =
1585 params->get (
"MM_TAFC_OptimizationCoreCount",
1586 mm_optimization_core_count);
1587 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1588 mm_optimization_core_count = mm_optimization_core_count2;
1590 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1591 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1592 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1593 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1596 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1597 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1600 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1603 #ifdef HAVE_TPETRA_MMM_TIMINGS
1605 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1608 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1611 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1612 if (needs_final_export) {
1616 Ctemp = rcp (&C,
false);
1619 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1624 #ifdef HAVE_TPETRA_MMM_TIMINGS
1626 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1629 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1631 if (needs_final_export) {
1632 ParameterList labelList;
1633 labelList.set(
"Timer Label", label);
1634 if(!params.is_null()) {
1635 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1636 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1638 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1639 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1640 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1641 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1642 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1643 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1645 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1646 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1647 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1648 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1650 Ctemp->exportAndFillComplete (Crcp,
1651 *Ctemp->getGraph ()->getExporter (),
1654 rcp (&labelList,
false));
1656 #ifdef HAVE_TPETRA_MMM_STATISTICS
1657 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1663 template<
class Scalar,
1665 class GlobalOrdinal,
1668 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1669 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1670 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1671 const std::string& ,
1672 const Teuchos::RCP<Teuchos::ParameterList>& )
1674 using Teuchos::Array;
1675 using Teuchos::ArrayRCP;
1676 using Teuchos::ArrayView;
1677 using Teuchos::OrdinalTraits;
1678 using Teuchos::null;
1680 typedef Teuchos::ScalarTraits<Scalar> STS;
1682 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1683 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1685 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1686 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1688 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1689 ArrayView<const GlobalOrdinal> bcols_import =
null;
1690 if (Bview.importColMap !=
null) {
1691 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1692 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1694 bcols_import = Bview.importColMap->getNodeElementList();
1697 size_t C_numCols = C_lastCol - C_firstCol +
1698 OrdinalTraits<LocalOrdinal>::one();
1699 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1700 OrdinalTraits<LocalOrdinal>::one();
1702 if (C_numCols_import > C_numCols)
1703 C_numCols = C_numCols_import;
1705 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1706 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1707 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1709 Array<Scalar> C_row_i = dwork;
1710 Array<GlobalOrdinal> C_cols = iwork;
1711 Array<size_t> c_index = iwork2;
1712 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1713 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1715 size_t C_row_i_length, j, k, last_index;
1718 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1719 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1720 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1721 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1723 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1724 Aview.colMap->getMaxLocalIndex(); i++)
1729 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1730 Aview.colMap->getMaxLocalIndex(); i++) {
1731 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1732 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1733 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1734 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1744 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1745 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1746 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1747 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1748 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1749 ArrayView<const Scalar> Avals, Bvals, Ivals;
1750 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1751 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1752 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1753 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1754 if(!Bview.importMatrix.is_null()) {
1755 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1756 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1759 bool C_filled = C.isFillComplete();
1761 for (
size_t i = 0; i < C_numCols; i++)
1762 c_index[i] = OrdinalTraits<size_t>::invalid();
1765 size_t Arows = Aview.rowMap->getNodeNumElements();
1766 for(
size_t i=0; i<Arows; ++i) {
1770 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1776 C_row_i_length = OrdinalTraits<size_t>::zero();
1778 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1779 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1780 const Scalar Aval = Avals[k];
1781 if (Aval == STS::zero())
1784 if (Ak == LO_INVALID)
1787 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1788 LocalOrdinal col = Bcolind[j];
1791 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1794 C_row_i[C_row_i_length] = Aval*Bvals[j];
1795 C_cols[C_row_i_length] = col;
1796 c_index[col] = C_row_i_length;
1800 C_row_i[c_index[col]] += Aval*Bvals[j];
1805 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1806 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1807 C_cols[ii] = bcols[C_cols[ii]];
1808 combined_index[ii] = C_cols[ii];
1809 combined_values[ii] = C_row_i[ii];
1811 last_index = C_row_i_length;
1817 C_row_i_length = OrdinalTraits<size_t>::zero();
1819 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1820 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1821 const Scalar Aval = Avals[k];
1822 if (Aval == STS::zero())
1825 if (Ak!=LO_INVALID)
continue;
1827 Ak = Acol2Irow[Acolind[k]];
1828 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1829 LocalOrdinal col = Icolind[j];
1832 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1835 C_row_i[C_row_i_length] = Aval*Ivals[j];
1836 C_cols[C_row_i_length] = col;
1837 c_index[col] = C_row_i_length;
1842 C_row_i[c_index[col]] += Aval*Ivals[j];
1847 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1848 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1849 C_cols[ii] = bcols_import[C_cols[ii]];
1850 combined_index[last_index] = C_cols[ii];
1851 combined_values[last_index] = C_row_i[ii];
1858 C.sumIntoGlobalValues(
1860 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1861 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1863 C.insertGlobalValues(
1865 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1866 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1872 template<
class Scalar,
1874 class GlobalOrdinal,
1876 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1877 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1878 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1880 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1881 Mview.maxNumRowEntries = Mview.indices[0].size();
1883 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1884 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1885 Mview.maxNumRowEntries = Mview.indices[i].size();
1890 template<
class CrsMatrixType>
1891 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1893 size_t Aest = 100, Best=100;
1894 if (A.getNodeNumEntries() > 0)
1895 Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1896 if (B.getNodeNumEntries() > 0)
1897 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1899 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1900 nnzperrow *= nnzperrow;
1902 return (
size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1912 template<
class Scalar,
1914 class GlobalOrdinal,
1916 void mult_A_B_newmatrix(
1917 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1918 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1919 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1920 const std::string& label,
1921 const Teuchos::RCP<Teuchos::ParameterList>& params)
1923 using Teuchos::Array;
1924 using Teuchos::ArrayRCP;
1925 using Teuchos::ArrayView;
1930 typedef LocalOrdinal LO;
1931 typedef GlobalOrdinal GO;
1933 typedef Import<LO,GO,NO> import_type;
1934 typedef Map<LO,GO,NO> map_type;
1937 typedef typename map_type::local_map_type local_map_type;
1939 typedef typename KCRS::StaticCrsGraphType graph_t;
1940 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1941 typedef typename NO::execution_space execution_space;
1942 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1943 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1945 #ifdef HAVE_TPETRA_MMM_TIMINGS
1946 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1947 using Teuchos::TimeMonitor;
1948 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1950 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1953 RCP<const import_type> Cimport;
1954 RCP<const map_type> Ccolmap;
1955 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1956 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1957 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1958 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1959 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1960 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1961 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1962 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1969 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1971 if (Bview.importMatrix.is_null()) {
1974 Ccolmap = Bview.colMap;
1975 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getNodeNumElements());
1977 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1978 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1979 KOKKOS_LAMBDA(
const LO i) {
1992 if (!Bimport.is_null() && !Iimport.is_null()) {
1993 Cimport = Bimport->setUnion(*Iimport);
1995 else if (!Bimport.is_null() && Iimport.is_null()) {
1996 Cimport = Bimport->setUnion();
1998 else if (Bimport.is_null() && !Iimport.is_null()) {
1999 Cimport = Iimport->setUnion();
2002 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
2004 Ccolmap = Cimport->getTargetMap();
2009 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2010 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2017 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2018 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2019 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2020 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2022 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2023 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2032 C.replaceColMap(Ccolmap);
2050 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2051 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2053 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2054 GO aidx = Acolmap_local.getGlobalElement(i);
2055 LO B_LID = Browmap_local.getLocalElement(aidx);
2056 if (B_LID != LO_INVALID) {
2057 targetMapToOrigRow(i) = B_LID;
2058 targetMapToImportRow(i) = LO_INVALID;
2060 LO I_LID = Irowmap_local.getLocalElement(aidx);
2061 targetMapToOrigRow(i) = LO_INVALID;
2062 targetMapToImportRow(i) = I_LID;
2069 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2076 template<
class Scalar,
2078 class GlobalOrdinal,
2080 class LocalOrdinalViewType>
2081 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2082 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2083 const LocalOrdinalViewType & targetMapToOrigRow,
2084 const LocalOrdinalViewType & targetMapToImportRow,
2085 const LocalOrdinalViewType & Bcol2Ccol,
2086 const LocalOrdinalViewType & Icol2Ccol,
2087 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2088 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2089 const std::string& label,
2090 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2091 #ifdef HAVE_TPETRA_MMM_TIMINGS
2092 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2093 using Teuchos::TimeMonitor;
2094 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
2097 using Teuchos::Array;
2098 using Teuchos::ArrayRCP;
2099 using Teuchos::ArrayView;
2105 typedef typename KCRS::StaticCrsGraphType graph_t;
2106 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2107 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2108 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2109 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2112 typedef LocalOrdinal LO;
2113 typedef GlobalOrdinal GO;
2115 typedef Map<LO,GO,NO> map_type;
2116 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2117 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2118 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2121 RCP<const map_type> Ccolmap = C.getColMap();
2122 size_t m = Aview.origMatrix->getNodeNumRows();
2123 size_t n = Ccolmap->getNodeNumElements();
2124 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2127 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2128 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2130 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2131 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2132 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2134 c_lno_view_t Irowptr;
2135 lno_nnz_view_t Icolind;
2136 scalar_view_t Ivals;
2137 if(!Bview.importMatrix.is_null()) {
2138 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2139 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2140 Ivals = Bview.importMatrix->getLocalMatrix().values;
2141 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2144 #ifdef HAVE_TPETRA_MMM_TIMINGS
2145 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2155 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2156 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2157 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2158 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2168 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2169 std::vector<size_t> c_status(n, ST_INVALID);
2179 size_t CSR_ip = 0, OLD_ip = 0;
2180 for (
size_t i = 0; i < m; i++) {
2183 Crowptr[i] = CSR_ip;
2186 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2187 LO Aik = Acolind[k];
2188 const SC Aval = Avals[k];
2189 if (Aval == SC_ZERO)
2192 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2199 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2202 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2203 LO Bkj = Bcolind[j];
2204 LO Cij = Bcol2Ccol[Bkj];
2206 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2208 c_status[Cij] = CSR_ip;
2209 Ccolind[CSR_ip] = Cij;
2210 Cvals[CSR_ip] = Aval*Bvals[j];
2214 Cvals[c_status[Cij]] += Aval*Bvals[j];
2225 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2226 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2227 LO Ikj = Icolind[j];
2228 LO Cij = Icol2Ccol[Ikj];
2230 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2232 c_status[Cij] = CSR_ip;
2233 Ccolind[CSR_ip] = Cij;
2234 Cvals[CSR_ip] = Aval*Ivals[j];
2237 Cvals[c_status[Cij]] += Aval*Ivals[j];
2244 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2246 Kokkos::resize(Ccolind,CSR_alloc);
2247 Kokkos::resize(Cvals,CSR_alloc);
2252 Crowptr[m] = CSR_ip;
2255 Kokkos::resize(Ccolind,CSR_ip);
2256 Kokkos::resize(Cvals,CSR_ip);
2258 #ifdef HAVE_TPETRA_MMM_TIMINGS
2260 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
2264 if (params.is_null() || params->get(
"sort entries",
true))
2265 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2266 C.setAllValues(Crowptr,Ccolind, Cvals);
2269 #ifdef HAVE_TPETRA_MMM_TIMINGS
2271 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
2283 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2284 labelList->set(
"Timer Label",label);
2285 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2286 RCP<const Export<LO,GO,NO> > dummyExport;
2287 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2288 #ifdef HAVE_TPETRA_MMM_TIMINGS
2290 MM2 = Teuchos::null;
2300 template<
class Scalar,
2302 class GlobalOrdinal,
2304 void mult_A_B_reuse(
2305 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2306 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2307 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2308 const std::string& label,
2309 const Teuchos::RCP<Teuchos::ParameterList>& params)
2311 using Teuchos::Array;
2312 using Teuchos::ArrayRCP;
2313 using Teuchos::ArrayView;
2318 typedef LocalOrdinal LO;
2319 typedef GlobalOrdinal GO;
2321 typedef Import<LO,GO,NO> import_type;
2322 typedef Map<LO,GO,NO> map_type;
2325 typedef typename map_type::local_map_type local_map_type;
2327 typedef typename KCRS::StaticCrsGraphType graph_t;
2328 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2329 typedef typename NO::execution_space execution_space;
2330 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2331 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2333 #ifdef HAVE_TPETRA_MMM_TIMINGS
2334 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2335 using Teuchos::TimeMonitor;
2336 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
2338 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2341 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2342 RCP<const map_type> Ccolmap = C.getColMap();
2343 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2344 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2345 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2346 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2347 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2348 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2351 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2355 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2356 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2359 if (!Bview.importMatrix.is_null()) {
2360 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2361 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2363 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2364 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2365 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2371 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2372 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2373 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2374 GO aidx = Acolmap_local.getGlobalElement(i);
2375 LO B_LID = Browmap_local.getLocalElement(aidx);
2376 if (B_LID != LO_INVALID) {
2377 targetMapToOrigRow(i) = B_LID;
2378 targetMapToImportRow(i) = LO_INVALID;
2380 LO I_LID = Irowmap_local.getLocalElement(aidx);
2381 targetMapToOrigRow(i) = LO_INVALID;
2382 targetMapToImportRow(i) = I_LID;
2389 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2393 template<
class Scalar,
2395 class GlobalOrdinal,
2397 class LocalOrdinalViewType>
2398 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2399 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2400 const LocalOrdinalViewType & targetMapToOrigRow,
2401 const LocalOrdinalViewType & targetMapToImportRow,
2402 const LocalOrdinalViewType & Bcol2Ccol,
2403 const LocalOrdinalViewType & Icol2Ccol,
2404 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2405 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2406 const std::string& label,
2407 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2408 #ifdef HAVE_TPETRA_MMM_TIMINGS
2409 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2410 using Teuchos::TimeMonitor;
2411 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2412 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2422 typedef typename KCRS::StaticCrsGraphType graph_t;
2423 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2424 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2425 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2428 typedef LocalOrdinal LO;
2429 typedef GlobalOrdinal GO;
2431 typedef Map<LO,GO,NO> map_type;
2432 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2433 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2434 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2437 RCP<const map_type> Ccolmap = C.getColMap();
2438 size_t m = Aview.origMatrix->getNodeNumRows();
2439 size_t n = Ccolmap->getNodeNumElements();
2442 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2443 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2444 const KCRS & Cmat = C.getLocalMatrix();
2446 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2447 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2448 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2449 scalar_view_t Cvals = Cmat.values;
2451 c_lno_view_t Irowptr;
2452 lno_nnz_view_t Icolind;
2453 scalar_view_t Ivals;
2454 if(!Bview.importMatrix.is_null()) {
2455 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2456 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2457 Ivals = Bview.importMatrix->getLocalMatrix().values;
2460 #ifdef HAVE_TPETRA_MMM_TIMINGS
2461 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2473 std::vector<size_t> c_status(n, ST_INVALID);
2476 size_t CSR_ip = 0, OLD_ip = 0;
2477 for (
size_t i = 0; i < m; i++) {
2480 OLD_ip = Crowptr[i];
2481 CSR_ip = Crowptr[i+1];
2482 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2483 c_status[Ccolind[k]] = k;
2489 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2490 LO Aik = Acolind[k];
2491 const SC Aval = Avals[k];
2492 if (Aval == SC_ZERO)
2495 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2497 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2499 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2500 LO Bkj = Bcolind[j];
2501 LO Cij = Bcol2Ccol[Bkj];
2503 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2504 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2505 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2507 Cvals[c_status[Cij]] += Aval * Bvals[j];
2512 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2513 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2514 LO Ikj = Icolind[j];
2515 LO Cij = Icol2Ccol[Ikj];
2517 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2518 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2519 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2521 Cvals[c_status[Cij]] += Aval * Ivals[j];
2527 #ifdef HAVE_TPETRA_MMM_TIMINGS
2528 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2531 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2537 template<
class Scalar,
2539 class GlobalOrdinal,
2541 void jacobi_A_B_newmatrix(
2543 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2544 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2545 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2546 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2547 const std::string& label,
2548 const Teuchos::RCP<Teuchos::ParameterList>& params)
2550 using Teuchos::Array;
2551 using Teuchos::ArrayRCP;
2552 using Teuchos::ArrayView;
2556 typedef LocalOrdinal LO;
2557 typedef GlobalOrdinal GO;
2560 typedef Import<LO,GO,NO> import_type;
2561 typedef Map<LO,GO,NO> map_type;
2562 typedef typename map_type::local_map_type local_map_type;
2566 typedef typename KCRS::StaticCrsGraphType graph_t;
2567 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2568 typedef typename NO::execution_space execution_space;
2569 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2570 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2573 #ifdef HAVE_TPETRA_MMM_TIMINGS
2574 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2575 using Teuchos::TimeMonitor;
2576 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2578 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2581 RCP<const import_type> Cimport;
2582 RCP<const map_type> Ccolmap;
2583 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2584 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2585 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2586 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2587 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2588 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2589 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2590 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2597 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2599 if (Bview.importMatrix.is_null()) {
2602 Ccolmap = Bview.colMap;
2606 Kokkos::RangePolicy<execution_space, LO> range (0,
static_cast<LO
> (Bview.colMap->getNodeNumElements ()));
2607 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2608 Bcol2Ccol(i) =
static_cast<LO
> (i);
2619 if (!Bimport.is_null() && !Iimport.is_null()){
2620 Cimport = Bimport->setUnion(*Iimport);
2621 Ccolmap = Cimport->getTargetMap();
2623 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2624 Cimport = Bimport->setUnion();
2626 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2627 Cimport = Iimport->setUnion();
2630 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2632 Ccolmap = Cimport->getTargetMap();
2634 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2635 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2642 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2643 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2644 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2645 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2647 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2648 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2657 C.replaceColMap(Ccolmap);
2675 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2676 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2677 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2678 GO aidx = Acolmap_local.getGlobalElement(i);
2679 LO B_LID = Browmap_local.getLocalElement(aidx);
2680 if (B_LID != LO_INVALID) {
2681 targetMapToOrigRow(i) = B_LID;
2682 targetMapToImportRow(i) = LO_INVALID;
2684 LO I_LID = Irowmap_local.getLocalElement(aidx);
2685 targetMapToOrigRow(i) = LO_INVALID;
2686 targetMapToImportRow(i) = I_LID;
2693 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2702 template<
class Scalar,
2704 class GlobalOrdinal,
2706 class LocalOrdinalViewType>
2707 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2708 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2709 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2710 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2711 const LocalOrdinalViewType & targetMapToOrigRow,
2712 const LocalOrdinalViewType & targetMapToImportRow,
2713 const LocalOrdinalViewType & Bcol2Ccol,
2714 const LocalOrdinalViewType & Icol2Ccol,
2715 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2716 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2717 const std::string& label,
2718 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2720 #ifdef HAVE_TPETRA_MMM_TIMINGS
2721 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2722 using Teuchos::TimeMonitor;
2723 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2726 using Teuchos::Array;
2727 using Teuchos::ArrayRCP;
2728 using Teuchos::ArrayView;
2734 typedef typename KCRS::StaticCrsGraphType graph_t;
2735 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2736 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2737 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2738 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2741 typedef typename scalar_view_t::memory_space scalar_memory_space;
2744 typedef LocalOrdinal LO;
2745 typedef GlobalOrdinal GO;
2748 typedef Map<LO,GO,NO> map_type;
2749 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2750 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2753 RCP<const map_type> Ccolmap = C.getColMap();
2754 size_t m = Aview.origMatrix->getNodeNumRows();
2755 size_t n = Ccolmap->getNodeNumElements();
2756 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2759 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2760 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2762 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2763 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2764 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2766 c_lno_view_t Irowptr;
2767 lno_nnz_view_t Icolind;
2768 scalar_view_t Ivals;
2769 if(!Bview.importMatrix.is_null()) {
2770 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2771 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2772 Ivals = Bview.importMatrix->getLocalMatrix().values;
2773 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2777 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2785 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2786 Array<size_t> c_status(n, ST_INVALID);
2795 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2796 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2797 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2798 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2799 size_t CSR_ip = 0, OLD_ip = 0;
2801 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2815 for (
size_t i = 0; i < m; i++) {
2818 Crowptr[i] = CSR_ip;
2819 SC minusOmegaDval = -omega*Dvals(i,0);
2822 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2823 Scalar Bval = Bvals[j];
2824 if (Bval == SC_ZERO)
2826 LO Bij = Bcolind[j];
2827 LO Cij = Bcol2Ccol[Bij];
2830 c_status[Cij] = CSR_ip;
2831 Ccolind[CSR_ip] = Cij;
2832 Cvals[CSR_ip] = Bvals[j];
2837 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2838 LO Aik = Acolind[k];
2839 const SC Aval = Avals[k];
2840 if (Aval == SC_ZERO)
2843 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2845 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2847 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2848 LO Bkj = Bcolind[j];
2849 LO Cij = Bcol2Ccol[Bkj];
2851 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2853 c_status[Cij] = CSR_ip;
2854 Ccolind[CSR_ip] = Cij;
2855 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2859 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2865 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2866 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2867 LO Ikj = Icolind[j];
2868 LO Cij = Icol2Ccol[Ikj];
2870 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2872 c_status[Cij] = CSR_ip;
2873 Ccolind[CSR_ip] = Cij;
2874 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2877 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2884 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2886 Kokkos::resize(Ccolind,CSR_alloc);
2887 Kokkos::resize(Cvals,CSR_alloc);
2891 Crowptr[m] = CSR_ip;
2894 Kokkos::resize(Ccolind,CSR_ip);
2895 Kokkos::resize(Cvals,CSR_ip);
2898 #ifdef HAVE_TPETRA_MMM_TIMINGS
2899 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2906 C.replaceColMap(Ccolmap);
2913 if (params.is_null() || params->get(
"sort entries",
true))
2914 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2915 C.setAllValues(Crowptr,Ccolind, Cvals);
2918 #ifdef HAVE_TPETRA_MMM_TIMINGS
2919 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2930 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2931 labelList->set(
"Timer Label",label);
2932 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2933 RCP<const Export<LO,GO,NO> > dummyExport;
2934 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2942 template<
class Scalar,
2944 class GlobalOrdinal,
2946 void jacobi_A_B_reuse(
2948 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2949 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2950 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2951 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2952 const std::string& label,
2953 const Teuchos::RCP<Teuchos::ParameterList>& params)
2955 using Teuchos::Array;
2956 using Teuchos::ArrayRCP;
2957 using Teuchos::ArrayView;
2961 typedef LocalOrdinal LO;
2962 typedef GlobalOrdinal GO;
2965 typedef Import<LO,GO,NO> import_type;
2966 typedef Map<LO,GO,NO> map_type;
2969 typedef typename map_type::local_map_type local_map_type;
2971 typedef typename KCRS::StaticCrsGraphType graph_t;
2972 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2973 typedef typename NO::execution_space execution_space;
2974 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2975 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2977 #ifdef HAVE_TPETRA_MMM_TIMINGS
2978 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2979 using Teuchos::TimeMonitor;
2980 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2982 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2985 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2986 RCP<const map_type> Ccolmap = C.getColMap();
2987 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2988 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2989 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2990 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2991 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2992 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2995 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2999 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
3000 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
3003 if (!Bview.importMatrix.is_null()) {
3004 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
3005 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
3007 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
3008 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
3009 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
3015 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
3016 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
3017 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
3018 GO aidx = Acolmap_local.getGlobalElement(i);
3019 LO B_LID = Browmap_local.getLocalElement(aidx);
3020 if (B_LID != LO_INVALID) {
3021 targetMapToOrigRow(i) = B_LID;
3022 targetMapToImportRow(i) = LO_INVALID;
3024 LO I_LID = Irowmap_local.getLocalElement(aidx);
3025 targetMapToOrigRow(i) = LO_INVALID;
3026 targetMapToImportRow(i) = I_LID;
3031 #ifdef HAVE_TPETRA_MMM_TIMINGS
3037 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
3043 template<
class Scalar,
3045 class GlobalOrdinal,
3047 class LocalOrdinalViewType>
3048 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
3049 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
3050 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3051 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3052 const LocalOrdinalViewType & targetMapToOrigRow,
3053 const LocalOrdinalViewType & targetMapToImportRow,
3054 const LocalOrdinalViewType & Bcol2Ccol,
3055 const LocalOrdinalViewType & Icol2Ccol,
3056 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
3057 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
3058 const std::string& label,
3059 const Teuchos::RCP<Teuchos::ParameterList>& ) {
3060 #ifdef HAVE_TPETRA_MMM_TIMINGS
3061 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
3062 using Teuchos::TimeMonitor;
3063 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
3064 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3073 typedef typename KCRS::StaticCrsGraphType graph_t;
3074 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3075 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3076 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3077 typedef typename scalar_view_t::memory_space scalar_memory_space;
3080 typedef LocalOrdinal LO;
3081 typedef GlobalOrdinal GO;
3083 typedef Map<LO,GO,NO> map_type;
3084 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3085 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3086 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3089 RCP<const map_type> Ccolmap = C.getColMap();
3090 size_t m = Aview.origMatrix->getNodeNumRows();
3091 size_t n = Ccolmap->getNodeNumElements();
3094 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
3095 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
3096 const KCRS & Cmat = C.getLocalMatrix();
3098 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3099 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3100 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3101 scalar_view_t Cvals = Cmat.values;
3103 c_lno_view_t Irowptr;
3104 lno_nnz_view_t Icolind;
3105 scalar_view_t Ivals;
3106 if(!Bview.importMatrix.is_null()) {
3107 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
3108 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
3109 Ivals = Bview.importMatrix->getLocalMatrix().values;
3113 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
3115 #ifdef HAVE_TPETRA_MMM_TIMINGS
3116 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
3124 std::vector<size_t> c_status(n, ST_INVALID);
3127 size_t CSR_ip = 0, OLD_ip = 0;
3128 for (
size_t i = 0; i < m; i++) {
3132 OLD_ip = Crowptr[i];
3133 CSR_ip = Crowptr[i+1];
3134 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
3135 c_status[Ccolind[k]] = k;
3141 SC minusOmegaDval = -omega*Dvals(i,0);
3144 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3145 Scalar Bval = Bvals[j];
3146 if (Bval == SC_ZERO)
3148 LO Bij = Bcolind[j];
3149 LO Cij = Bcol2Ccol[Bij];
3151 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3152 std::runtime_error,
"Trying to insert a new entry into a static graph");
3154 Cvals[c_status[Cij]] = Bvals[j];
3158 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3159 LO Aik = Acolind[k];
3160 const SC Aval = Avals[k];
3161 if (Aval == SC_ZERO)
3164 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3166 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
3168 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3169 LO Bkj = Bcolind[j];
3170 LO Cij = Bcol2Ccol[Bkj];
3172 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3173 std::runtime_error,
"Trying to insert a new entry into a static graph");
3175 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3180 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
3181 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3182 LO Ikj = Icolind[j];
3183 LO Cij = Icol2Ccol[Ikj];
3185 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3186 std::runtime_error,
"Trying to insert a new entry into a static graph");
3188 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3194 #ifdef HAVE_TPETRA_MMM_TIMINGS
3195 MM2 = Teuchos::null;
3196 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
3199 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3200 #ifdef HAVE_TPETRA_MMM_TIMINGS
3201 MM2 = Teuchos::null;
3210 template<
class Scalar,
3212 class GlobalOrdinal,
3214 void import_and_extract_views(
3215 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3216 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3217 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3218 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3219 bool userAssertsThereAreNoRemotes,
3220 const std::string& label,
3221 const Teuchos::RCP<Teuchos::ParameterList>& params)
3223 using Teuchos::Array;
3224 using Teuchos::ArrayView;
3227 using Teuchos::null;
3230 typedef LocalOrdinal LO;
3231 typedef GlobalOrdinal GO;
3234 typedef Map<LO,GO,NO> map_type;
3235 typedef Import<LO,GO,NO> import_type;
3236 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3238 #ifdef HAVE_TPETRA_MMM_TIMINGS
3239 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
3240 using Teuchos::TimeMonitor;
3241 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
3249 Aview.deleteContents();
3251 Aview.origMatrix = rcp(&A,
false);
3252 Aview.origRowMap = A.getRowMap();
3253 Aview.rowMap = targetMap;
3254 Aview.colMap = A.getColMap();
3255 Aview.domainMap = A.getDomainMap();
3256 Aview.importColMap =
null;
3259 if (userAssertsThereAreNoRemotes)
3262 RCP<const import_type> importer;
3263 if (params !=
null && params->isParameter(
"importer")) {
3264 importer = params->get<RCP<const import_type> >(
"importer");
3267 #ifdef HAVE_TPETRA_MMM_TIMINGS
3269 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
3274 RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
3275 size_t numRemote = 0;
3277 if (!prototypeImporter.is_null() &&
3278 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3279 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3281 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3282 numRemote = prototypeImporter->getNumRemoteIDs();
3284 Array<GO> remoteRows(numRemote);
3285 for (
size_t i = 0; i < numRemote; i++)
3286 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3288 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3289 rowMap->getIndexBase(), rowMap->getComm()));
3292 }
else if (prototypeImporter.is_null()) {
3294 ArrayView<const GO> rows = targetMap->getNodeElementList();
3295 size_t numRows = targetMap->getNodeNumElements();
3297 Array<GO> remoteRows(numRows);
3298 for(
size_t i = 0; i < numRows; ++i) {
3299 const LO mlid = rowMap->getLocalElement(rows[i]);
3301 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3302 remoteRows[numRemote++] = rows[i];
3304 remoteRows.resize(numRemote);
3305 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3306 rowMap->getIndexBase(), rowMap->getComm()));
3314 const int numProcs = rowMap->getComm()->getSize();
3317 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3318 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3327 #ifdef HAVE_TPETRA_MMM_TIMINGS
3329 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
3333 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3335 if (globalMaxNumRemote > 0) {
3336 #ifdef HAVE_TPETRA_MMM_TIMINGS
3338 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
3342 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3344 importer = rcp(
new import_type(rowMap, remoteRowMap));
3346 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
3350 params->set(
"importer", importer);
3353 if (importer !=
null) {
3354 #ifdef HAVE_TPETRA_MMM_TIMINGS
3356 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
3360 Teuchos::ParameterList labelList;
3361 labelList.set(
"Timer Label", label);
3362 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3365 bool overrideAllreduce =
false;
3368 Teuchos::ParameterList params_sublist;
3369 if(!params.is_null()) {
3370 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3371 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3372 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3373 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3374 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3375 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3376 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3378 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3379 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3380 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3382 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3383 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3389 sprintf(str,
"import_matrix.%d.dat",count);
3394 #ifdef HAVE_TPETRA_MMM_STATISTICS
3395 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3399 #ifdef HAVE_TPETRA_MMM_TIMINGS
3401 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3405 Aview.importColMap = Aview.importMatrix->getColMap();
3406 #ifdef HAVE_TPETRA_MMM_TIMINGS
3418 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3420 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3421 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3422 const LocalOrdinalViewType & Acol2Brow,
3423 const LocalOrdinalViewType & Acol2Irow,
3424 const LocalOrdinalViewType & Bcol2Ccol,
3425 const LocalOrdinalViewType & Icol2Ccol,
3426 const size_t mergedNodeNumCols) {
3430 typedef typename KCRS::StaticCrsGraphType graph_t;
3431 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3432 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3433 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3435 const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3436 const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3439 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3443 RCP<const KCRS> Ik_;
3444 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3445 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3447 if(Ik!=0) Iks = *Ik;
3448 size_t merge_numrows = Ak.numCols();
3450 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3452 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3455 typedef typename Node::execution_space execution_space;
3456 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3457 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3458 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3459 if(
final) Mrowptr(i) = update;
3462 if(Acol2Brow(i)!=LO_INVALID)
3463 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3465 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3468 if(
final && i+1==merge_numrows)
3469 Mrowptr(i+1)=update;
3473 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3474 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3475 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3478 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3479 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3480 if(Acol2Brow(i)!=LO_INVALID) {
3481 size_t row = Acol2Brow(i);
3482 size_t start = Bk.graph.row_map(row);
3483 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3484 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3485 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3489 size_t row = Acol2Irow(i);
3490 size_t start = Iks.graph.row_map(row);
3491 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3492 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3493 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3498 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3521 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3523 void MatrixMatrix::Multiply( \
3524 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3526 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3528 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3529 bool call_FillComplete_on_result, \
3530 const std::string & label, \
3531 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3534 void MatrixMatrix::Jacobi( \
3536 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3537 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3538 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3539 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3540 bool call_FillComplete_on_result, \
3541 const std::string & label, \
3542 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3545 void MatrixMatrix::Add( \
3546 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3549 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3552 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3555 void MatrixMatrix::Add( \
3556 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3559 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3563 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3564 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3565 (const SCALAR & alpha, \
3566 const bool transposeA, \
3567 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3568 const SCALAR & beta, \
3569 const bool transposeB, \
3570 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3571 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3572 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3573 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3575 template struct MatrixMatrix::AddDetails::AddKernels<SCALAR, LO, GO, NODE>;
3579 #endif // TPETRA_MATRIXMATRIX_DEF_HPP