755 *
return 1. / (0.05 + 2. * p.square());
761 *
double Coefficient<dim>::value(
const Point<dim> & p,
762 *
const unsigned int component)
const
764 *
return value<double>(p, component);
771 * <a name=
"Matrixfreeimplementation"></a>
772 * <h3>Matrix-
free implementation</h3>
776 * The following
class, called <code>LaplaceOperator</code>, implements the
777 * differential
operator. For all practical purposes, it is a
matrix, i.e.,
778 * you can ask it
for its size (member functions <code>m(), n()</code>) and
779 * you can apply it to a vector (the <code>vmult()</code> function). The
780 * difference to a real
matrix of course lies in the fact that this class
781 * does not actually store the <i>elements</i> of the
matrix, but only knows
782 * how to compute the action of the operator when applied to a vector.
786 * The infrastructure describing the
matrix size, the initialization from a
788 * through vmult() and Tvmult() methods, is provided by the class
789 * MatrixFreeOperator::Base from which this class derives. The
790 * LaplaceOperator class defined here only has to provide a few interfaces,
791 * namely the actual action of the operator through the apply_add() method
792 * that gets used in the vmult()
functions, and a method to compute the
794 * definition of the multigrid smoother. Since we consider a problem with
795 * variable coefficient, we further implement a method that can fill the
800 * Note that the file <code>include/deal.II/matrix_free/operators.h</code>
801 * already contains an implementation of the Laplacian through the class
803 * operator is re-implemented in this tutorial program, explaining the
804 * ingredients and concepts used there.
808 * This program makes use of the data cache for finite element operator
809 * application that is integrated in deal.II. This data cache class is
810 * called
MatrixFree. It contains mapping information (Jacobians) and index
811 * relations between local and global degrees of freedom. It also contains
812 * constraints like the ones from hanging nodes or Dirichlet boundary
813 * conditions. Moreover, it can issue a
loop over all cells in %
parallel,
814 * making sure that only cells are worked on that do not share any degree of
815 * freedom (this makes the
loop thread-safe when writing into destination
816 * vectors). This is a more advanced strategy compared to the
WorkStream
817 * class described in the @ref threads module. Of course, to not destroy
818 * thread-safety, we have to be careful when writing into class-global
823 * The class implementing the Laplace operator has three template arguments,
824 *
one for the dimension (as many deal.II classes carry),
one for the degree
825 * of the finite element (which we need to enable efficient computations
827 * type. We want to use <code>
double</code>
numbers (i.
e.,
double precision,
828 * 64-bit floating
point) for the final
matrix, but floats (single
830 * matrices (as that is only a preconditioner, and floats can be processed
831 * twice as fast). The class
FEEvaluation also takes a template argument for
832 * the number of quadrature points in
one dimension. In the code below, we
833 * hard-code it to <code>fe_degree+1</code>. If we wanted to change it
834 * independently of the polynomial degree, we would need to add a template
839 * As a sidenote, if we implemented several different operations on the same
840 * grid and degrees of freedom (like a mass
matrix and a Laplace
matrix), we
841 * would define two classes like the current
one for each of the operators
845 * only provide a minimal
set of
functions. This concept allows for writing
846 * complex application codes with many
matrix-
free operations.
851 * requires care: Here, we use the deal.II table class which is prepared to
852 * hold the data with correct alignment. However, storing
e.g. an
853 * <code>std::vector<
VectorizedArray<number> ></code> is not possible with
854 * vectorization:
A certain alignment of the data with the memory address
855 * boundaries is required (essentially, a
VectorizedArray that is 32 bytes
856 *
long in case of AVX needs to start at a memory address that is divisible
857 * by 32). The table class (as well as the
AlignedVector class it is based
858 * on) makes sure that this alignment is respected, whereas std::vector does
859 * not in
general, which may lead to segmentation faults at strange places
860 * for some systems or suboptimal performance for other systems.
863 * template <
int dim,
int fe_degree, typename number>
864 * class LaplaceOperator
869 *
using value_type = number;
873 *
void clear()
override;
875 *
void evaluate_coefficient(
const Coefficient<dim> &coefficient_function);
880 *
virtual void apply_add(
888 *
const std::pair<unsigned int, unsigned int> &cell_range)
const;
890 *
void local_compute_diagonal(
893 *
const unsigned int & dummy,
894 *
const std::pair<unsigned int, unsigned int> &cell_range)
const;
903 * This is the constructor of the @p LaplaceOperator
class. All it does is
904 * to
call the
default constructor of the base
class
906 *
class that asserts that this class is not accessed after going out of scope
907 *
e.g. in a preconditioner.
910 *
template <
int dim,
int fe_degree,
typename number>
911 * LaplaceOperator<dim, fe_degree, number>::LaplaceOperator()
918 *
template <
int dim,
int fe_degree,
typename number>
919 *
void LaplaceOperator<dim, fe_degree, number>::clear()
921 * coefficient.
reinit(0, 0);
931 * <a name=
"Computationofcoefficient"></a>
932 * <h4>Computation of coefficient</h4>
936 * To initialize the coefficient, we directly give it the Coefficient
class
937 * defined above and then select the method
938 * <code>coefficient_function.value</code> with vectorized number (which the
939 * compiler can deduce from the
point data type). The use of the
940 *
FEEvaluation class (and its
template arguments) will be explained below.
943 * template <int dim, int fe_degree, typename number>
944 *
void LaplaceOperator<dim, fe_degree, number>::evaluate_coefficient(
945 *
const Coefficient<dim> &coefficient_function)
947 *
const unsigned int n_cells = this->data->n_cell_batches();
950 * coefficient.reinit(
n_cells, phi.n_q_points);
951 *
for (
unsigned int cell = 0; cell <
n_cells; ++cell)
954 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
955 * coefficient(cell, q) =
956 * coefficient_function.value(phi.quadrature_point(q));
965 * <a name=
"LocalevaluationofLaplaceoperator"></a>
966 * <h4>Local evaluation of Laplace
operator</h4>
970 * Here comes the main
function of
this class, the evaluation of the
971 *
matrix-vector product (or, in
general, a finite element
operator
972 * evaluation). This is done in a
function that takes exactly four
973 * arguments, the
MatrixFree object, the destination and source vectors, and
974 * a range of cells that are to be worked on. The method
975 * <code>cell_loop</code> in the
MatrixFree class will internally
call this
976 *
function with some range of cells that is obtained by checking which
977 * cells are possible to work on simultaneously so that write operations
do
978 * not cause any race condition. Note that the cell range used in the
loop
979 * is not directly the number of (active) cells in the current mesh, but
980 * rather a collection of batches of cells. In other word,
"cell" may be
982 * cells together. This means that in the
loop over quadrature points we are
983 * actually seeing a group of quadrature points of several cells as
one
984 * block. This is done to enable a higher degree of vectorization. The
985 * number of such
"cells" or
"cell batches" is stored in
MatrixFree and can
987 * cell iterators, in
this class all cells are laid out in a plain array
988 * with no direct knowledge of
level or neighborship relations, which makes
989 * it possible to index the cells by
unsigned integers.
993 * The implementation of the Laplace
operator is quite simple: First, we
994 * need to create an
object FEEvaluation that contains the computational
995 * kernels and has data fields to store temporary results (
e.g.
gradients
996 * evaluated on all quadrature points on a collection of a few cells). Note
997 * that temporary results
do not use a lot of memory, and since we specify
998 *
template arguments with the element order, the data is stored on the
999 * stack (without expensive memory allocation). Usually,
one only needs to
1000 *
set two
template arguments, the dimension as a
first argument and the
1001 * degree of the finite element as the
second argument (
this is
equal to the
1002 * number of degrees of freedom per dimension minus
one for FE_Q
1003 * elements). However, here we also want to be able to use
float numbers for
1004 * the multigrid preconditioner, which is the last (fifth)
template
1005 * argument. Therefore, we cannot rely on the
default template arguments and
1006 * must also fill the third and fourth field, consequently. The third
1007 * argument specifies the number of quadrature points per direction and has
1008 * a
default value equal to the degree of the element plus
one. The fourth
1009 * argument sets the number of components (
one can also evaluate
1010 * vector-valued
functions in systems of PDEs, but the
default is a scalar
1011 * element), and
finally the last argument sets the number type.
1015 * Next, we
loop over the given cell range and then we
continue with the
1016 * actual implementation: <ol> <li>Tell the
FEEvaluation object the (macro)
1017 * cell we want to work on. <li>Read in the
values of the source vectors
1018 * (@p read_dof_values), including the resolution of constraints. This
1019 * stores @f$u_\mathrm{cell}@f$ as described in the introduction. <li>Compute
1020 * the unit-cell
gradient (the evaluation of finite element
1022 *
gradient computations, it uses a unified
interface to all kinds of
1023 * derivatives of order between
zero and two. We only want
gradients, no
1024 *
values and no
second derivatives, so we
set the
function arguments to
1026 * (
first slot). There is also a third slot
for the Hessian which is
1027 *
false by
default, so it needs not be given. Note that the
FEEvaluation
1028 *
class internally evaluates shape
functions in an efficient way where
one
1029 * dimension is worked on at a time (
using the tensor product form of shape
1030 *
functions and quadrature points as mentioned in the introduction). This
1031 * gives complexity
equal to @f$\mathcal
O(
d^2 (p+1)^{
d+1})@f$
for polynomial
1032 * degree @f$p@f$ in @f$d@f$ dimensions, compared to the naive approach with loops
1033 * over all local degrees of freedom and quadrature points that is used in
1034 *
FEValues and costs @f$\mathcal
O(
d (p+1)^{2
d})@f$. <li>Next comes the
1035 * application of the Jacobian transformation, the multiplication by the
1036 * variable coefficient and the quadrature weight.
FEEvaluation has an
1037 * access
function @p get_gradient that applies the Jacobian and returns the
1038 * gradient in real space. Then, we just need to multiply by the (scalar)
1039 * coefficient, and let the
function @p submit_gradient apply the
second
1040 * Jacobian (
for the test
function) and the quadrature weight and Jacobian
1042 * data field as where it is read from in @p get_gradient. Therefore, you
1043 * need to make sure to not read from the same quadrature
point again after
1044 * having called @p submit_gradient on that particular quadrature
point. In
1045 *
general, it is a good idea to
copy the result of @p get_gradient when it
1046 * is used more often than once. <li>Next follows the summation over
1047 * quadrature points
for all test
functions that corresponds to the actual
1048 * integration step. For the Laplace
operator, we just multiply by the
1049 *
gradient, so we
call the integrate
function with the respective argument
1050 *
set. If you have an equation where you test by both the
values of the
1052 * to
true. Calling
first the integrate
function for values and then
1054 *
call will internally overwrite the results from the
first call. Note that
1055 * there is no
function argument
for the
second derivative
for integrate
1056 * step. <li>Eventually, the local contributions in the vector
1057 * @f$v_\mathrm{cell}@f$ as mentioned in the introduction need to be added into
1058 * the result vector (and constraints are applied). This is done with a
call
1059 * to @p distribute_local_to_global, the same name as the corresponding
1061 * in the
FEEvaluation object, as are the indices between local and global
1062 * degrees of freedom). </ol>
1065 *
template <
int dim,
int fe_degree,
typename number>
1066 *
void LaplaceOperator<dim, fe_degree, number>::local_apply(
1070 *
const std::pair<unsigned int, unsigned int> & cell_range)
const
1074 *
for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
1080 * phi.read_dof_values(src);
1082 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1083 * phi.submit_gradient(coefficient(cell, q) * phi.get_gradient(q), q);
1085 * phi.distribute_local_to_global(dst);
1093 * This
function implements the
loop over all cells
for the
1094 * Base::apply_add() interface. This is done with the @p cell_loop of the
1095 *
MatrixFree class, which takes the operator() of this class with arguments
1096 *
MatrixFree, OutVector, InVector, cell_range. When working with MPI
1097 * parallelization (but no threading) as is done in this tutorial program,
1098 * the cell
loop corresponds to the following three lines of code:
1102 * <div class=CodeFragmentInTutorialComment>
1104 * src.update_ghost_values();
1105 * local_apply(*this->data, dst, src, std::make_pair(0
U,
1106 * data.n_cell_batches()));
1113 * Here, the two calls update_ghost_values() and
compress() perform the data
1114 * exchange on processor boundaries for MPI, once for the source vector
1115 * where we need to read from entries owned by remote processors, and once
1116 * for the destination vector where we have accumulated parts of the
1117 * residuals that need to be added to the respective entry of the owner
1118 * processor. However,
MatrixFree::cell_loop does not only abstract away
1119 * those two calls, but also performs some additional optimizations. On the
1120 *
one hand, it will
split the update_ghost_values() and
compress() calls in
1121 * a way to allow for overlapping communication and computation. The
1122 * local_apply function is then called with three cell ranges representing
1123 * partitions of the cell range from 0 to
MatrixFree::n_cell_batches(). On
1124 * the other hand, cell_loop also supports thread parallelism in which case
1125 * the cell ranges are
split into smaller chunks and scheduled in an
1126 * advanced way that avoids access to the same vector entry by several
1127 * threads. That feature is explained in @ref step_48 "step-48".
1131 * Note that after the cell
loop, the constrained degrees of freedom need to
1132 * be touched once more for sensible vmult() operators: Since the assembly
1133 *
loop automatically resolves constraints (just as the
1135 * compute any contribution for constrained degrees of freedom, leaving the
1136 * respective entries
zero. This would represent a
matrix that had empty
1137 * rows and columns for constrained degrees of freedom. However, iterative
1138 * solvers like CG only work for non-singular matrices. The easiest way to
1139 * do that is to
set the sub-block of the
matrix that corresponds to
1140 * constrained DoFs to an
identity matrix, in which case application of the
1141 *
matrix would simply
copy the elements of the right hand side vector into
1142 * the left hand side. Fortunately, the vmult() implementations
1144 * apply_add() function, so we do not need to take further action here.
1149 * with MPI, there is
one aspect to be careful about — the indexing
1150 * used for accessing the vector. For performance reasons,
MatrixFree and
1151 *
FEEvaluation are designed to access vectors in MPI-local index space also
1152 * when working with multiple processors. Working in local index space means
1153 * that no index
translation needs to be performed at the place the vector
1154 * access happens, apart from the unavoidable indirect addressing. However,
1155 * local index spaces are ambiguous: While it is standard convention to
1156 * access the locally owned range of a vector with indices between 0 and the
1157 * local size, the numbering is not so clear for the ghosted entries and
1158 * somewhat arbitrary. For the
matrix-vector product, only the indices
1159 * appearing on locally owned cells (plus those referenced via hanging node
1160 * constraints) are necessary. However, in deal.II we often
set all the
1161 * degrees of freedom on ghosted elements as ghosted vector entries, called
1162 * the @ref GlossLocallyRelevantDof "locally relevant DoFs described in the
1163 * glossary". In that case, the MPI-local index of a ghosted vector entry
1164 * can in
general be different in the two possible ghost sets, despite
1165 * referring to the same global index. To avoid problems,
FEEvaluation
1166 * checks that the partitioning of the vector used for the
matrix-vector
1167 * product does indeed match with the partitioning of the indices in
1171 * mechanism to fit the ghost
set to the correct layout. This happens in the
1172 * ghost region of the vector, so keep in mind that the ghost region might
1173 * be modified in both the destination and source vector after a
call to a
1174 * vmult() method. This is legitimate because the ghost region of a
1175 * distributed deal.II vector is a mutable section and filled on
1176 * demand. Vectors used in
matrix-vector products must not be ghosted upon
1177 * entry of vmult()
functions, so no information gets lost.
1180 * template <
int dim,
int fe_degree, typename number>
1181 *
void LaplaceOperator<dim, fe_degree, number>::apply_add(
1185 * this->data->cell_loop(&LaplaceOperator::local_apply,
this, dst, src);
1192 * The following
function implements the computation of the
diagonal of the
1194 * turns out to be more complicated than evaluating the
1195 *
operator. Fundamentally, we could obtain a
matrix representation of the
1196 *
operator by applying the
operator on <i>all</i> unit vectors. Of course,
1197 * that would be very inefficient since we would need to perform <i>n</i>
1198 *
operator evaluations to retrieve the whole
matrix. Furthermore,
this
1199 * approach would completely ignore the
matrix sparsity. On an individual
1200 * cell, however,
this is the way to go and actually not that inefficient as
1201 * there usually is a coupling between all degrees of freedom inside the
1207 * layout. This vector is encapsulated in a member called
1208 * inverse_diagonal_entries of type
DiagonalMatrix in the base
class
1210 * need to initialize and then get the vector representing the
diagonal
1211 * entries in the
matrix. As to the actual
diagonal computation, we again
1212 * use the cell_loop infrastructure of
MatrixFree to invoke a local worker
1213 * routine called local_compute_diagonal(). Since we will only write into a
1214 * vector but not have any source vector, we put a dummy argument of type
1215 * <tt>
unsigned int</tt> in place of the source vector to confirm with the
1216 * cell_loop interface. After the
loop, we need to
set the vector entries
1217 * subject to Dirichlet boundary conditions to
one (either those on the
1219 * the indices at the interface between different grid levels in adaptive
1220 * multigrid). This is done through the
function
1222 * with the setting in the
matrix-vector product provided by the Base
1223 * operator. Finally, we need to
invert the
diagonal entries which is the
1224 * form required by the Chebyshev smoother based on the Jacobi iteration. In
1225 * the
loop, we assert that all entries are non-
zero, because they should
1226 * either have obtained a positive contribution from integrals or be
1227 * constrained and treated by @p set_constrained_entries_to_one() following
1231 * template <
int dim,
int fe_degree, typename number>
1238 * this->data->initialize_dof_vector(inverse_diagonal);
1239 *
unsigned int dummy = 0;
1240 * this->data->cell_loop(&LaplaceOperator::local_compute_diagonal,
1250 *
ExcMessage(
"No diagonal entry in a positive definite operator "
1251 *
"should be zero"));
1262 * columns in the local
matrix and putting the entry 1 in the <i>i</i>th
1263 * slot and a
zero entry in all other slots, i.e., we apply the cell-wise
1264 * differential
operator on
one unit vector at a time. The inner part
1266 * FEEvalution::integrate, is exactly the same as in the local_apply
1267 *
function. Afterwards, we pick out the <i>i</i>th entry of the local
1268 * result and put it to a temporary storage (as we overwrite all entries in
1270 * iteration). Finally, the temporary storage is written to the destination
1272 *
FEEvaluation::submit_dof_value() to read and write to the data field that
1273 *
FEEvaluation uses for the integration on the
one hand and writes into the
1274 * global vector on the other hand.
1278 * Given that we are only interested in the
matrix diagonal, we simply throw
1279 * away all other entries of the local
matrix that have been computed along
1280 * the way. While it might seem wasteful to compute the complete cell
matrix
1281 * and then throw away everything but the
diagonal, the integration are so
1282 * efficient that the computation does not take too much time. Note that the
1283 * complexity of operator evaluation per element is @f$\mathcal
1284 *
O((p+1)^{
d+1})@f$
for polynomial degree @f$k@f$, so computing the whole
matrix
1285 * costs us @f$\mathcal
O((p+1)^{2
d+1})@f$ operations, not too far away from
1286 * @f$\mathcal
O((p+1)^{2
d})@f$ complexity
for computing the
diagonal with
1288 * vectorization and other optimizations, the
diagonal computation with
this
1289 *
function is actually the fastest (simple) variant. (It would be possible
1290 * to compute the
diagonal with
sum factorization techniques in @f$\mathcal
1291 *
O((p+1)^{
d+1})@f$ operations involving specifically adapted
1292 * kernels—but since such kernels are only useful in that particular
1293 * context and the
diagonal computation is typically not on the critical
1294 * path, they have not been implemented in deal.II.)
1298 * Note that the code that calls distribute_local_to_global on the vector to
1299 * accumulate the
diagonal entries into the global
matrix has some
1300 * limitations. For operators with hanging node constraints that distribute
1301 * an integral contribution of a constrained DoF to several other entries
1302 * inside the distribute_local_to_global
call, the vector
interface used
1303 * here does not exactly compute the
diagonal entries, but lumps some
1306 * result is correct up to discretization accuracy as explained in <a
1307 * href=
"http://dx.doi.org/10.4208/cicp.101214.021015a">Kormann (2016),
1308 * section 5.3</a>, but not mathematically
equal. In
this tutorial program,
1309 * no harm can happen because the
diagonal is only used
for the multigrid
1310 *
level matrices where no hanging node constraints appear.
1313 *
template <
int dim,
int fe_degree,
typename number>
1314 *
void LaplaceOperator<dim, fe_degree, number>::local_compute_diagonal(
1317 *
const unsigned int &,
1318 *
const std::pair<unsigned int, unsigned int> &cell_range)
const
1324 *
for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
1330 *
for (
unsigned int i = 0; i < phi.dofs_per_cell; ++i)
1332 *
for (
unsigned int j = 0; j < phi.dofs_per_cell; ++j)
1334 * phi.submit_dof_value(make_vectorized_array<number>(1.), i);
1337 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1338 * phi.submit_gradient(coefficient(cell, q) * phi.get_gradient(q),
1341 *
diagonal[i] = phi.get_dof_value(i);
1343 *
for (
unsigned int i = 0; i < phi.dofs_per_cell; ++i)
1344 * phi.submit_dof_value(
diagonal[i], i);
1345 * phi.distribute_local_to_global(dst);
1354 * <a name=
"LaplaceProblemclass"></a>
1355 * <h3>LaplaceProblem
class</h3>
1359 * This
class is based on the
one in @ref step_16 "step-16". However, we replaced the
1361 * that we can also skip the sparsity patterns. Notice that we define the
1362 * LaplaceOperator
class with the degree of finite element as template
1363 * argument (the value is defined at the top of the file), and that we use
1368 * The
class also has a member variable to keep track of all the detailed
1369 * timings
for setting up the entire chain of data before we actually go
1370 * about solving the problem. In addition, there is an output stream (that
1371 * is disabled by
default) that can be used to output details
for the
1372 * individual setup operations instead of the summary only that is printed
1377 * Since
this program is designed to be used with MPI, we also provide the
1378 * usual @p pcout output stream that only prints the information of the
1379 * processor with MPI rank 0. The grid used
for this programs can either be
1380 * a distributed
triangulation based on p4est (in
case deal.II is configured
1381 * to use p4est), otherwise it is a serial grid that only runs without MPI.
1384 *
template <
int dim>
1385 *
class LaplaceProblem
1392 *
void setup_system();
1393 *
void assemble_rhs();
1395 *
void output_results(
const unsigned int cycle)
const;
1409 *
using SystemMatrixType =
1410 * LaplaceOperator<dim, degree_finite_element, double>;
1411 * SystemMatrixType system_matrix;
1414 *
using LevelMatrixType = LaplaceOperator<dim, degree_finite_element, float>;
1420 *
double setup_time;
1429 * When we initialize the finite element, we of course have to use the
1430 * degree specified at the top of the file as well (otherwise, an exception
1431 * will be thrown at some
point, since the computational kernel defined in
1432 * the templated LaplaceOperator
class and the information from the finite
1433 * element read out by
MatrixFree will not match). The constructor of the
1435 * conform to the 2:1 cell balance over
vertices, which is needed
for the
1436 * convergence of the geometric multigrid routines. For the distributed
1437 * grid, we also need to specifically enable the multigrid hierarchy.
1440 *
template <
int dim>
1441 * LaplaceProblem<dim>::LaplaceProblem()
1453 * fe(degree_finite_element)
1460 * The LaplaceProblem
class holds an additional output stream that
1461 * collects detailed timings about the setup phase. This stream, called
1462 * time_details, is disabled by
default through the @p
false argument
1463 * specified here. For detailed timings, removing the @p
false argument
1464 * prints all the details.
1467 * time_details(std::cout,
1476 * <a name=
"LaplaceProblemsetup_system"></a>
1477 * <h4>LaplaceProblem::setup_system</h4>
1481 * The setup stage is in analogy to @ref step_16
"step-16" with relevant changes due to the
1483 * including the degrees of freedom
for the multigrid levels, and to
1484 * initialize constraints from hanging nodes and homogeneous Dirichlet
1485 * conditions. Since we intend to use
this programs in %
parallel with MPI,
1486 * we need to make sure that the constraints get to know the locally
1487 * relevant degrees of freedom, otherwise the storage would explode when
1488 *
using more than a few hundred millions of degrees of freedom, see
1489 * @ref step_40
"step-40".
1493 * Once we have created the multigrid dof_handler and the constraints, we
1495 * each
level of the multigrid scheme. The main action is to
set up the
1496 * <code>
MatrixFree </code> instance
for the problem. The base
class of the
1498 * initialized with a shared pointer to
MatrixFree object. This way, we can
1499 * simply create it here and then pass it on to the system
matrix and
level
1500 * matrices, respectively. For setting up
MatrixFree, we need to activate
1501 * the update flag in the AdditionalData field of
MatrixFree that enables
1502 * the storage of quadrature
point coordinates in real space (by
default, it
1503 * only caches data
for gradients (inverse transposed Jacobians) and JxW
1505 *
level (i.e., giving <code>
level = numbers::invalid_unsigned_int</code>),
1506 *
MatrixFree constructs a
loop over the active cells. In
this tutorial, we
1507 *
do not use threads in addition to MPI, which is why we explicitly disable
1510 * and vectors are initialized as explained above.
1513 *
template <
int dim>
1514 *
void LaplaceProblem<dim>::setup_system()
1519 * system_matrix.clear();
1520 * mg_matrices.clear_elements();
1522 * dof_handler.distribute_dofs(fe);
1523 * dof_handler.distribute_mg_dofs();
1525 * pcout <<
"Number of degrees of freedom: " << dof_handler.n_dofs()
1531 * constraints.clear();
1532 * constraints.reinit(locally_relevant_dofs);
1536 * constraints.close();
1538 * time_details <<
"Distribute DoFs & B.C. (CPU/wall) " << time.
cpu_time()
1539 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1548 * std::shared_ptr<MatrixFree<dim, double>> system_mf_storage(
1550 * system_mf_storage->reinit(mapping,
1555 * system_matrix.initialize(system_mf_storage);
1558 * system_matrix.evaluate_coefficient(Coefficient<dim>());
1560 * system_matrix.initialize_dof_vector(solution);
1561 * system_matrix.initialize_dof_vector(system_rhs);
1564 * time_details <<
"Setup matrix-free system (CPU/wall) " << time.
cpu_time()
1565 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1570 * Next, initialize the matrices
for the multigrid method on all the
1572 * the indices subject to boundary conditions as well as the indices on
1573 * edges between different refinement levels as described in the @ref step_16
"step-16"
1574 * tutorial program. We then go through the levels of the mesh and
1575 * construct the constraints and matrices on each
level. These follow
1576 * closely the construction of the system
matrix on the original mesh,
1577 * except the slight difference in naming when accessing information on
1578 * the levels rather than the active cells.
1581 *
const unsigned int nlevels =
triangulation.n_global_levels();
1582 * mg_matrices.resize(0, nlevels - 1);
1584 * std::set<types::boundary_id> dirichlet_boundary;
1585 * dirichlet_boundary.insert(0);
1586 * mg_constrained_dofs.initialize(dof_handler);
1587 * mg_constrained_dofs.make_zero_boundary_constraints(dof_handler,
1588 * dirichlet_boundary);
1597 * level_constraints.
reinit(relevant_dofs);
1599 * mg_constrained_dofs.get_boundary_indices(
level));
1600 * level_constraints.
close();
1608 * std::shared_ptr<MatrixFree<dim, float>> mg_mf_storage_level(
1610 * mg_mf_storage_level->reinit(mapping,
1612 * level_constraints,
1616 * mg_matrices[
level].initialize(mg_mf_storage_level,
1617 * mg_constrained_dofs,
1619 * mg_matrices[
level].evaluate_coefficient(Coefficient<dim>());
1622 * time_details <<
"Setup matrix-free levels (CPU/wall) " << time.
cpu_time()
1623 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1631 * <a name=
"LaplaceProblemassemble_rhs"></a>
1632 * <h4>LaplaceProblem::assemble_rhs</h4>
1636 * The
assemble function is very simple since all we have to
do is to
1638 * cached in the
MatrixFree class, which we query from
1641 * alternative), we must not forget to
call compress() at the
end of the
1642 * assembly to send all the contributions of the right hand side to the
1643 * owner of the respective degree of freedom.
1646 * template <
int dim>
1647 *
void LaplaceProblem<dim>::assemble_rhs()
1653 * *system_matrix.get_matrix_free());
1654 *
for (
unsigned int cell = 0;
1655 * cell < system_matrix.get_matrix_free()->n_cell_batches();
1659 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1660 * phi.submit_value(make_vectorized_array<double>(1.0), q);
1662 * phi.distribute_local_to_global(system_rhs);
1667 * time_details <<
"Assemble right hand side (CPU/wall) " << time.
cpu_time()
1668 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1676 * <a name=
"LaplaceProblemsolve"></a>
1677 * <h4>LaplaceProblem::solve</h4>
1681 * The solution process is similar as in @ref step_16
"step-16". We start with the setup of
1684 * interpolation between the grid levels with the same fast
sum
1685 * factorization kernels that get also used in
FEEvaluation.
1688 *
template <
int dim>
1689 *
void LaplaceProblem<dim>::solve()
1693 * mg_transfer.build(dof_handler);
1695 * time_details <<
"MG build transfer time (CPU/wall) " << time.
cpu_time()
1701 * As a smoother,
this tutorial program uses a Chebyshev iteration instead
1702 * of SOR in @ref step_16
"step-16". (SOR would be very difficult to implement because we
1703 *
do not have the
matrix elements available explicitly, and it is
1704 * difficult to make it work efficiently in %
parallel.) The smoother is
1705 * initialized with our
level matrices and the mandatory additional data
1706 *
for the Chebyshev smoother. We use a relatively high degree here (5),
1707 * since
matrix-vector products are comparably cheap. We choose to smooth
1708 * out a range of @f$[1.2 \hat{\lambda}_{
\max}/15,1.2 \hat{\lambda}_{
\max}]@f$
1709 * in the smoother where @f$\hat{\lambda}_{
\max}@f$ is an estimate of the
1710 * largest eigenvalue (the factor 1.2 is applied inside
1712 * Chebyshev initialization performs a few steps of a CG algorithm
1713 * without preconditioner. Since the highest eigenvalue is usually the
1714 * easiest
one to find and a rough estimate is enough, we choose 10
1715 * iterations. Finally, we also
set the inner preconditioner type in the
1716 * Chebyshev method which is a Jacobi iteration. This is represented by
1718 * by our LaplaceOperator
class.
1722 * On
level zero, we initialize the smoother differently because we want
1724 * allows the user to
switch to solver mode where the number of iterations
1725 * is internally chosen to the correct
value. In the additional data
1726 * object,
this setting is activated by choosing the polynomial degree to
1729 *
matrix. The number of steps in the Chebyshev smoother are chosen such
1730 * that the Chebyshev convergence estimates guarantee to reduce the
1731 * residual by the number specified in the variable @p
1732 * smoothing_range. Note that
for solving, @p smoothing_range is a
1733 * relative tolerance and chosen smaller than
one, in
this case, we select
1734 * three orders of magnitude, whereas it is a number larger than 1 when
1739 * From a computational
point of view, the Chebyshev iteration is a very
1740 * attractive coarse grid solver as
long as the coarse size is
1741 * moderate. This is because the Chebyshev method performs only
1742 *
matrix-vector products and vector updates, which typically parallelize
1743 * better to the largest cluster size with more than a few tens of
1744 * thousands of cores than inner product involved in other iterative
1745 * methods. The former involves only local communication between neighbors
1746 * in the (coarse) mesh, whereas the latter requires global communication
1747 * over all processors.
1750 *
using SmootherType =
1763 * smoother_data[
level].smoothing_range = 15.;
1764 * smoother_data[
level].degree = 5;
1765 * smoother_data[
level].eig_cg_n_iterations = 10;
1769 * smoother_data[0].smoothing_range = 1
e-3;
1771 * smoother_data[0].eig_cg_n_iterations = mg_matrices[0].m();
1773 * mg_matrices[
level].compute_diagonal();
1774 * smoother_data[
level].preconditioner =
1775 * mg_matrices[
level].get_matrix_diagonal_inverse();
1777 * mg_smoother.initialize(mg_matrices, smoother_data);
1785 * The next step is to
set up the
interface matrices that are needed for the
1786 *
case with hanging nodes. The adaptive multigrid realization in deal.II
1787 * implements an approach called local smoothing. This means that the
1788 * smoothing on the finest
level only covers the local part of the mesh
1789 * defined by the fixed (finest) grid
level and ignores parts of the
1790 * computational domain where the terminal cells are coarser than
this
1791 *
level. As the method progresses to coarser levels, more and more of the
1792 * global mesh will be covered. At some coarser
level, the whole mesh will
1793 * be covered. Since all
level matrices in the multigrid method cover a
1794 * single
level in the mesh, no hanging nodes appear on the
level matrices.
1795 * At the
interface between multigrid levels, homogeneous Dirichlet boundary
1796 * conditions are
set while smoothing. When the residual is transferred to
1797 * the next coarser
level, however, the coupling over the multigrid
1798 *
interface needs to be taken into account. This is done by the so-called
1799 * interface (or edge) matrices that compute the part of the residual that
1801 * homogeneous Dirichlet conditions. We refer to the @ref mg_paper
1802 *
"Multigrid paper by Janssen and Kanschat" for more details.
1806 * For the implementation of those
interface matrices, there is already a
1810 *
vmult() and @p
Tvmult() operations (that were originally written for
1811 * matrices, hence expecting those names). Note that vmult_interface_down
1812 * is used during the restriction phase of the multigrid
V-cycle, whereas
1813 * vmult_interface_up is used during the prolongation phase.
1818 * preconditioner infrastructure in complete analogy to @ref step_16 "step-16" to obtain
1819 * a @p preconditioner
object that can be applied to a
matrix.
1826 * mg_interface_matrices;
1827 * mg_interface_matrices.resize(0,
triangulation.n_global_levels() - 1);
1832 * mg_interface_matrices);
1835 * mg_matrix, mg_coarse, mg_transfer, mg_smoother, mg_smoother);
1836 *
mg.set_edge_matrices(mg_interface, mg_interface);
1841 * preconditioner(dof_handler,
mg, mg_transfer);
1845 * The setup of the multigrid routines is quite easy and
one cannot see
1846 * any difference in the solve process as compared to @ref step_16 "step-16". All the
1847 * magic is hidden behind the implementation of the LaplaceOperator::
vmult
1848 * operation. Note that we print out the solve time and the accumulated
1849 * setup time through standard out, i.
e., in any case, whereas detailed
1850 * times for the setup operations are only printed in case the flag for
1851 * detail_times in the constructor is changed.
1859 * setup_time += time.wall_time();
1860 * time_details << "MG build smoother time (CPU/wall) " << time.cpu_time()
1861 * << "s/" << time.wall_time() << "s\n";
1862 * pcout << "Total setup time (wall) " << setup_time << "s\n";
1866 * constraints.set_zero(solution);
1867 * cg.solve(system_matrix, solution, system_rhs, preconditioner);
1869 * constraints.distribute(solution);
1871 * pcout << "Time solve (" << solver_control.last_step() << " iterations)"
1872 * << (solver_control.last_step() < 10 ? " " : " ") << "(CPU/wall) "
1873 * << time.cpu_time() << "s/" << time.wall_time() << "s\n";
1881 * <a name="LaplaceProblemoutput_results"></a>
1882 * <h4>LaplaceProblem::output_results</h4>
1886 * Here is the data output, which is a simplified version of @ref step_5 "step-5". We use
1887 * the standard VTU (= compressed VTK) output for each grid produced in the
1888 * refinement process. In addition, we use a compression algorithm that is
1889 * optimized for speed rather than disk usage. The default setting (which
1890 * optimizes for disk usage) makes saving the output take about 4 times as
1891 *
long as running the linear solver, while setting
1893 *
DataOutBase::VtkFlags::best_speed lowers this to only
one fourth the time
1894 * of the linear solve.
1898 * We disable the output when the mesh gets too large.
A variant of this
1899 * program has been
run on hundreds of thousands MPI ranks with as many as
1900 * 100 billion grid cells, which is not directly accessible to classical
1901 * visualization tools.
1904 * template <
int dim>
1905 *
void LaplaceProblem<dim>::output_results(const
unsigned int cycle) const
1913 * solution.update_ghost_values();
1922 *
"./",
"solution", cycle, MPI_COMM_WORLD, 3);
1924 * time_details <<
"Time write output (CPU/wall) " << time.
cpu_time()
1933 * <a name=
"LaplaceProblemrun"></a>
1938 * The
function that runs the program is very similar to the
one in
1939 * @ref step_16
"step-16". We
do few refinement steps in 3D compared to 2D, but that
's
1944 * Before we run the program, we output some information about the detected
1945 * vectorization level as discussed in the introduction.
1948 * template <int dim>
1949 * void LaplaceProblem<dim>::run()
1952 * const unsigned int n_vect_doubles = VectorizedArray<double>::size();
1953 * const unsigned int n_vect_bits = 8 * sizeof(double) * n_vect_doubles;
1955 * pcout << "Vectorization over " << n_vect_doubles
1956 * << " doubles = " << n_vect_bits << " bits ("
1957 * << Utilities::System::get_current_vectorization_level() << ")"
1961 * for (unsigned int cycle = 0; cycle < 9 - dim; ++cycle)
1963 * pcout << "Cycle " << cycle << std::endl;
1967 * GridGenerator::hyper_cube(triangulation, 0., 1.);
1968 * triangulation.refine_global(3 - dim);
1970 * triangulation.refine_global(1);
1974 * output_results(cycle);
1975 * pcout << std::endl;
1978 * } // namespace Step37
1985 * <a name="Thecodemaincodefunction"></a>
1986 * <h3>The <code>main</code> function</h3>
1990 * Apart from the fact that we set up the MPI framework according to @ref step_40 "step-40",
1991 * there are no surprises in the main function.
1994 * int main(int argc, char *argv[])
1998 * using namespace Step37;
2000 * Utilities::MPI::MPI_InitFinalize mpi_init(argc, argv, 1);
2002 * LaplaceProblem<dimension> laplace_problem;
2003 * laplace_problem.run();
2005 * catch (std::exception &exc)
2007 * std::cerr << std::endl
2009 * << "----------------------------------------------------"
2011 * std::cerr << "Exception on processing: " << std::endl
2012 * << exc.what() << std::endl
2013 * << "Aborting!" << std::endl
2014 * << "----------------------------------------------------"
2020 * std::cerr << std::endl
2022 * << "----------------------------------------------------"
2024 * std::cerr << "Unknown exception!" << std::endl
2025 * << "Aborting!" << std::endl
2026 * << "----------------------------------------------------"
2034 <a name="Results"></a><h1>Results</h1>
2037 <a name="Programoutput"></a><h3>Program output</h3>
2040 Since this example solves the same problem as @ref step_5 "step-5" (except for
2041 a different coefficient), there is little to say about the
2042 solution. We show a picture anyway, illustrating the size of the
2043 solution through both isocontours and volume rendering:
2045 <img src="https://www.dealii.org/images/steps/developer/step-37.solution.png" alt="">
2047 Of more interest is to evaluate some aspects of the multigrid solver.
2048 When we run this program in 2D for quadratic (@f$Q_2@f$) elements, we get the
2049 following output (when run on one core in release mode):
2051 Vectorization over 2 doubles = 128 bits (SSE2)
2053 Number of degrees of freedom: 81
2054 Total setup time (wall) 0.00159788s
2055 Time solve (6 iterations) (CPU/wall) 0.000951s/0.000951052s
2058 Number of degrees of freedom: 289
2059 Total setup time (wall) 0.00114608s
2060 Time solve (6 iterations) (CPU/wall) 0.000935s/0.000934839s
2063 Number of degrees of freedom: 1089
2064 Total setup time (wall) 0.00244665s
2065 Time solve (6 iterations) (CPU/wall) 0.00207s/0.002069s
2068 Number of degrees of freedom: 4225
2069 Total setup time (wall) 0.00678205s
2070 Time solve (6 iterations) (CPU/wall) 0.005616s/0.00561595s
2073 Number of degrees of freedom: 16641
2074 Total setup time (wall) 0.0241671s
2075 Time solve (6 iterations) (CPU/wall) 0.019543s/0.0195441s
2078 Number of degrees of freedom: 66049
2079 Total setup time (wall) 0.0967851s
2080 Time solve (6 iterations) (CPU/wall) 0.07457s/0.0745709s
2083 Number of degrees of freedom: 263169
2084 Total setup time (wall) 0.346374s
2085 Time solve (6 iterations) (CPU/wall) 0.260042s/0.265033s
2088 As in @ref step_16 "step-16", we see that the number of CG iterations remains constant with
2089 increasing number of degrees of freedom. A constant number of iterations
2090 (together with optimal computational properties) means that the computing time
2091 approximately quadruples as the problem size quadruples from one cycle to the
2092 next. The code is also very efficient in terms of storage. Around 2-4 million
2093 degrees of freedom fit into 1 GB of memory, see also the MPI results below. An
2094 interesting fact is that solving one linear system is cheaper than the setup,
2095 despite not building a matrix (approximately half of which is spent in the
2096 DoFHandler::distribute_dofs() and DoFHandler::distribute_mg_dofs()
2097 calls). This shows the high efficiency of this approach, but also that the
2098 deal.II data structures are quite expensive to set up and the setup cost must
2099 be amortized over several system solves.
2101 Not much changes if we run the program in three spatial dimensions. Since we
2102 use uniform mesh refinement, we get eight times as many elements and
2103 approximately eight times as many degrees of freedom with each cycle:
2106 Vectorization over 2 doubles = 128 bits (SSE2)
2108 Number of degrees of freedom: 125
2109 Total setup time (wall) 0.00231099s
2110 Time solve (6 iterations) (CPU/wall) 0.000692s/0.000922918s
2113 Number of degrees of freedom: 729
2114 Total setup time (wall) 0.00289083s
2115 Time solve (6 iterations) (CPU/wall) 0.001534s/0.0024128s
2118 Number of degrees of freedom: 4913
2119 Total setup time (wall) 0.0143182s
2120 Time solve (6 iterations) (CPU/wall) 0.010785s/0.0107841s
2123 Number of degrees of freedom: 35937
2124 Total setup time (wall) 0.087064s
2125 Time solve (6 iterations) (CPU/wall) 0.063522s/0.06545s
2128 Number of degrees of freedom: 274625
2129 Total setup time (wall) 0.596306s
2130 Time solve (6 iterations) (CPU/wall) 0.427757s/0.431765s
2133 Number of degrees of freedom: 2146689
2134 Total setup time (wall) 4.96491s
2135 Time solve (6 iterations) (CPU/wall) 3.53126s/3.56142s
2138 Since it is so easy, we look at what happens if we increase the polynomial
2139 degree. When selecting the degree as four in 3D, i.e., on @f$\mathcal Q_4@f$
2140 elements, by changing the line <code>const unsigned int
2141 degree_finite_element=4;</code> at the top of the program, we get the
2142 following program output:
2145 Vectorization over 2 doubles = 128 bits (SSE2)
2147 Number of degrees of freedom: 729
2148 Total setup time (wall) 0.00633097s
2149 Time solve (6 iterations) (CPU/wall) 0.002829s/0.00379395s
2152 Number of degrees of freedom: 4913
2153 Total setup time (wall) 0.0174279s
2154 Time solve (6 iterations) (CPU/wall) 0.012255s/0.012254s
2157 Number of degrees of freedom: 35937
2158 Total setup time (wall) 0.082655s
2159 Time solve (6 iterations) (CPU/wall) 0.052362s/0.0523629s
2162 Number of degrees of freedom: 274625
2163 Total setup time (wall) 0.507943s
2164 Time solve (6 iterations) (CPU/wall) 0.341811s/0.345788s
2167 Number of degrees of freedom: 2146689
2168 Total setup time (wall) 3.46251s
2169 Time solve (7 iterations) (CPU/wall) 3.29638s/3.3265s
2172 Number of degrees of freedom: 16974593
2173 Total setup time (wall) 27.8989s
2174 Time solve (7 iterations) (CPU/wall) 26.3705s/27.1077s
2177 Since @f$\mathcal Q_4@f$ elements on a certain mesh correspond to @f$\mathcal Q_2@f$
2178 elements on half the mesh size, we can compare the run time at cycle 4 with
2179 fourth degree polynomials with cycle 5 using quadratic polynomials, both at
2180 2.1 million degrees of freedom. The surprising effect is that the solver for
2181 @f$\mathcal Q_4@f$ element is actually slightly faster than for the quadratic
2182 case, despite using one more linear iteration. The effect that higher-degree
2183 polynomials are similarly fast or even faster than lower degree ones is one of
2184 the main strengths of matrix-free operator evaluation through sum
2185 factorization, see the <a
2186 href="http://dx.doi.org/10.1016/j.compfluid.2012.04.012">matrix-free
2187 paper</a>. This is fundamentally different to matrix-based methods that get
2188 more expensive per unknown as the polynomial degree increases and the coupling
2191 In addition, also the setup gets a bit cheaper for higher order, which is
2192 because fewer elements need to be set up.
2194 Finally, let us look at the timings with degree 8, which corresponds to
2195 another round of mesh refinement in the lower order methods:
2198 Vectorization over 2 doubles = 128 bits (SSE2)
2200 Number of degrees of freedom: 4913
2201 Total setup time (wall) 0.0842004s
2202 Time solve (8 iterations) (CPU/wall) 0.019296s/0.0192959s
2205 Number of degrees of freedom: 35937
2206 Total setup time (wall) 0.327048s
2207 Time solve (8 iterations) (CPU/wall) 0.07517s/0.075999s
2210 Number of degrees of freedom: 274625
2211 Total setup time (wall) 2.12335s
2212 Time solve (8 iterations) (CPU/wall) 0.448739s/0.453698s
2215 Number of degrees of freedom: 2146689
2216 Total setup time (wall) 16.1743s
2217 Time solve (8 iterations) (CPU/wall) 3.95003s/3.97717s
2220 Number of degrees of freedom: 16974593
2221 Total setup time (wall) 130.8s
2222 Time solve (8 iterations) (CPU/wall) 31.0316s/31.767s
2225 Here, the initialization seems considerably slower than before, which is
2226 mainly due to the computation of the diagonal of the matrix, which actually
2227 computes a 729 x 729 matrix on each cell and throws away everything but the
2228 diagonal. The solver times, however, are again very close to the quartic case,
2229 showing that the linear increase with the polynomial degree that is
2230 theoretically expected is almost completely offset by better computational
2231 characteristics and the fact that higher order methods have a smaller share of
2232 degrees of freedom living on several cells that add to the evaluation
2235 <a name="Comparisonwithasparsematrix"></a><h3>Comparison with a sparse matrix</h3>
2238 In order to understand the capabilities of the matrix-free implementation, we
2239 compare the performance of the 3d example above with a sparse matrix
2240 implementation based on TrilinosWrappers::SparseMatrix by measuring both the
2241 computation times for the initialization of the problem (distribute DoFs,
2242 setup and assemble matrices, setup multigrid structures) and the actual
2243 solution for the matrix-free variant and the variant based on sparse
2244 matrices. We base the preconditioner on float numbers and the actual matrix
2245 and vectors on double numbers, as shown above. Tests are run on an Intel Core
2246 i7-5500U notebook processor (two cores and <a
2247 href="http://en.wikipedia.org/wiki/Advanced_Vector_Extensions">AVX</a>
2248 support, i.e., four operations on doubles can be done with one CPU
2249 instruction, which is heavily used in FEEvaluation), optimized mode, and two
2252 <table align="center" class="doxtable">
2255 <th colspan="2">Sparse matrix</th>
2256 <th colspan="2">Matrix-free implementation</th>
2260 <th>Setup + assemble</th>
2261 <th> Solve </th>
2262 <th>Setup + assemble</th>
2263 <th> Solve </th>
2266 <td align="right">125</td>
2267 <td align="center">0.0042s</td>
2268 <td align="center">0.0012s</td>
2269 <td align="center">0.0022s</td>
2270 <td align="center">0.00095s</td>
2273 <td align="right">729</td>
2274 <td align="center">0.012s</td>
2275 <td align="center">0.0040s</td>
2276 <td align="center">0.0027s</td>
2277 <td align="center">0.0021s</td>
2280 <td align="right">4,913</td>
2281 <td align="center">0.082s</td>
2282 <td align="center">0.012s</td>
2283 <td align="center">0.011s</td>
2284 <td align="center">0.0057s</td>
2287 <td align="right">35,937</td>
2288 <td align="center">0.73s</td>
2289 <td align="center">0.13s</td>
2290 <td align="center">0.048s</td>
2291 <td align="center">0.040s</td>
2294 <td align="right">274,625</td>
2295 <td align="center">5.43s</td>
2296 <td align="center">1.01s</td>
2297 <td align="center">0.33s</td>
2298 <td align="center">0.25s</td>
2301 <td align="right">2,146,689</td>
2302 <td align="center">43.8s</td>
2303 <td align="center">8.24s</td>
2304 <td align="center">2.42s</td>
2305 <td align="center">2.06s</td>
2309 The table clearly shows that the matrix-free implementation is more than twice
2310 as fast for the solver, and more than six times as fast when it comes to
2311 initialization costs. As the problem size is made a factor 8 larger, we note
2312 that the times usually go up by a factor eight, too (as the solver iterations
2313 are constant at six). The main deviation is in the sparse matrix between 5k
2314 and 36k degrees of freedom, where the time increases by a factor 12. This is
2315 the threshold where the (L3) cache in the processor can no longer hold all
2316 data necessary for the matrix-vector products and all matrix elements must be
2317 fetched from main memory.
2319 Of course, this picture does not necessarily translate to all cases, as there
2320 are problems where knowledge of matrix entries enables much better solvers (as
2321 happens when the coefficient is varying more strongly than in the above
2322 example). Moreover, it also depends on the computer system. The present system
2323 has good memory performance, so sparse matrices perform comparably
2324 well. Nonetheless, the matrix-free implementation gives a nice speedup already
2325 for the <i>Q</i><sub>2</sub> elements used in this example. This becomes
2326 particularly apparent for time-dependent or nonlinear problems where sparse
2327 matrices would need to be reassembled over and over again, which becomes much
2328 easier with this class. And of course, thanks to the better complexity of the
2329 products, the method gains increasingly larger advantages when the order of the
2330 elements increases (the matrix-free implementation has costs
2331 4<i>d</i><sup>2</sup><i>p</i> per degree of freedom, compared to
2332 2<i>p<sup>d</sup></i> for the sparse matrix, so it will win anyway for order 4
2335 <a name="ResultsforlargescaleparallelcomputationsonSuperMUC"></a><h3> Results for large-scale parallel computations on SuperMUC</h3>
2338 As explained in the introduction and the in-code comments, this program can be
2339 run in parallel with MPI. It turns out that geometric multigrid schemes work
2340 really well and can scale to very large machines. To the authors' knowledge,
2341 the geometric multigrid results shown here are the largest computations done
2342 with deal.II as of late 2016,
run on up to 147,456 cores of the <a
2343 href=
"https://www.lrz.de/services/compute/supermuc/systemdescription/">complete
2344 SuperMUC Phase 1</a>. The ingredients
for scalability beyond 1000 cores are
2345 that no data structure that depends on the global problem size is held in its
2346 entirety on a single processor and that the communication is not too frequent
2347 in order not to
run into latency issues of the network. For PDEs solved with
2348 iterative solvers, the communication latency is often the limiting factor,
2349 rather than the throughput of the network. For the example of the SuperMUC
2350 system, the
point-to-
point latency between two processors is between 1
e-6 and
2351 1
e-5 seconds, depending on the proximity in the MPI network. The
matrix-vector
2352 products with @p LaplaceOperator from
this class involves several
2353 point-to-
point communication steps, interleaved with computations on each
2354 core. The resulting latency of a
matrix-vector product is around 1
e-4
2355 seconds. Global communication, for example an @p MPI_Allreduce operation that
2356 accumulates the
sum of a single number per rank over all ranks in the MPI
2357 network, has a latency of 1
e-4 seconds. The multigrid
V-cycle used in this
2358 program is also a form of global communication. Think about the coarse grid
2359 solve that happens on a single processor: It accumulates the contributions
2360 from all processors before it starts. When completed, the coarse grid solution
2361 is transferred to finer levels, where more and more processors help in
2362 smoothing until the fine grid. Essentially, this is a tree-like pattern over
2363 the processors in the network and controlled by the mesh. As opposed to the
2364 @p MPI_Allreduce operations where the tree in the
reduction is optimized to the
2365 actual links in the MPI network, the multigrid
V-cycle does
this according to
2366 the partitioning of the mesh. Thus, we cannot expect the same
2367 optimality. Furthermore, the multigrid cycle is not simply a walk up and down
2368 the refinement tree, but also communication on each
level when doing the
2369 smoothing. In other words, the global communication in multigrid is more
2370 challenging and related to the mesh that provides less optimization
2371 opportunities. The measured latency of the
V-cycle is between 6
e-3 and 2
e-2
2372 seconds, i.e., the same as 60 to 200 MPI_Allreduce operations.
2374 The following figure shows a scaling experiments on @f$\mathcal Q_3@f$
2375 elements. Along the lines, the problem size is held constant as the number of
2376 cores is increasing. When doubling the number of cores,
one expects a halving
2377 of the computational time, indicated by the dotted gray lines. The results
2378 show that the implementation shows almost ideal behavior until an absolute
2379 time of around 0.1 seconds is reached. The solver tolerances have been
set
2380 such that the solver performs five iterations. This way of plotting data is
2381 the <b>strong scaling</
b> of the algorithm. As we go to very large core
2382 counts, the curves flatten out a bit earlier, which is because of the
2383 communication network in SuperMUC where communication between processors
2384 farther away is slightly slower.
2386 <img src=
"https://www.dealii.org/images/steps/developer/step-37.scaling_strong.png" alt=
"">
2388 In addition, the plot also contains results for <b>weak scaling</
b> that lists
2389 how the algorithm behaves as both the number of processor cores and elements
2390 is increased at the same pace. In
this situation, we expect that the compute
2391 time remains constant. Algorithmically, the number of CG iterations is
2392 constant at 5, so we are good from that
end. The lines in the plot are
2393 arranged such that the top left
point in each data series represents the same
2394 size per processor, namely 131,072 elements (or approximately 3.5 million
2395 degrees of freedom per core). The gray lines indicating ideal strong scaling
2396 are by the same factor of 8 apart. The results show again that the scaling is
2397 almost ideal. The
parallel efficiency when going from 288 cores to 147,456
2398 cores is at around 75%
for a local problem size of 750,000 degrees of freedom
2399 per core which takes 1.0s on 288 cores, 1.03s on 2304 cores, 1.19s on 18k
2400 cores, and 1.35s on 147k cores. The algorithms also reach a very high
2401 utilization of the processor. The largest computation on 147k cores reaches
2402 around 1.7 PFLOPs/s on SuperMUC out of an arithmetic peak of 3.2 PFLOPs/s. For
2403 an iterative PDE solver,
this is a very high number and significantly more is
2404 often only reached
for dense linear algebra. Sparse linear algebra is limited
2405 to a tenth of
this value.
2407 As mentioned in the introduction, the
matrix-
free method reduces the memory
2408 consumption of the data structures. Besides the higher performance due to less
2409 memory transfer, the algorithms also allow
for very large problems to fit into
2410 memory. The figure below shows the computational time as we increase the
2411 problem size until an upper limit where the computation exhausts memory. We
do
2412 this for 1k cores, 8k cores, and 65k cores and see that the problem size can
2413 be varied over almost two orders of magnitude with ideal scaling. The largest
2414 computation shown in
this picture involves 292 billion (@f$2.92 \cdot 10^{11}@f$)
2415 degrees of freedom. On a DG computation of 147k cores, the above algorithms
2416 were also
run involving up to 549 billion (2^39) DoFs.
2418 <img src=
"https://www.dealii.org/images/steps/developer/step-37.scaling_size.png" alt=
"">
2420 Finally, we note that
while performing the tests on the large-
scale system
2421 shown above, improvements of the multigrid algorithms in deal.II have been
2422 developed. The original version contained the sub-optimal code based on
2424 all vector entries are
zero) were done on each smoothing
2425 operation on each
level, which only became apparent on 65k cores and
2426 more. However, the following picture shows that the improvement already pay
2427 off on a smaller
scale, here shown on computations on up to 14,336 cores
for
2428 @f$\mathcal Q_5@f$ elements:
2430 <img src=
"https://www.dealii.org/images/steps/developer/step-37.scaling_oldnew.png" alt=
"">
2433 <a name=
"Adaptivity"></a><h3> Adaptivity</h3>
2436 As explained in the code, the algorithm presented here is prepared to
run on
2437 adaptively refined meshes. If only part of the mesh is refined, the multigrid
2438 cycle will
run with local smoothing and impose Dirichlet conditions along the
2439 interfaces which differ in refinement
level for smoothing through the
2441 distributed over levels, relating the owner of the
level cells to the owner of
2442 the
first descendant active cell, there can be an imbalance between different
2443 processors in MPI, which limits scalability by a factor of around two to five.
2445 <a name=
"Possibilitiesforextensions"></a><h3> Possibilities
for extensions</h3>
2448 <a name=
"Kellyerrorestimator"></a><h4> Kelly error estimator </h4>
2451 As mentioned above the code is ready
for locally adaptive h-refinement.
2452 For the Poisson equation
one can employ the Kelly error indicator,
2454 with the ghost indices of
parallel vectors.
2455 In order to evaluate the jump terms in the error indicator, each MPI process
2456 needs to know locally relevant DoFs.
2458 some locally relevant DoFs.
2459 The ghost indices made available in the vector are a tight
set of only those indices
2460 that are touched in the cell integrals (including constraint resolution).
2461 This choice has performance reasons, because sending all locally relevant degrees
2462 of freedom would be too expensive compared to the
matrix-vector product.
2463 Consequently the solution vector as-is is
2465 The trick is to change the ghost part of the
partition, for example using a
2473 solution.
reinit(dof_handler.locally_owned_dofs(),
2474 locally_relevant_dofs,
2476 solution.copy_locally_owned_data_from(copy_vec);
2477 constraints.distribute(solution);
2478 solution.update_ghost_values();
2481 <a name="Sharedmemoryparallelization"></a><h4> Shared-memory parallelization</h4>
2484 This program is parallelized with MPI only. As an alternative, the
MatrixFree
2485 loop can also be issued in hybrid mode, for example by using MPI parallelizing
2486 over the nodes of a cluster and with threads through Intel TBB within the
2487 shared memory region of
one node. To use this,
one would need to both
set the
2488 number of threads in the MPI_InitFinalize data structure in the main function,
2489 and
set the
MatrixFree::AdditionalData::tasks_parallel_scheme to
2490 partition_color to actually do the
loop in
parallel. This use case is
2491 discussed in @ref step_48 "step-48".
2493 <a name="InhomogeneousDirichletboundaryconditions"></a><h4> Inhomogeneous Dirichlet boundary conditions </h4>
2496 The presented program assumes homogeneous Dirichlet boundary conditions. When
2497 going to non-homogeneous conditions, the situation is a bit more intricate. To
2498 understand how to implement such a setting, let us
first recall how these
2499 arise in the mathematical formulation and how they are implemented in a
2500 matrix-based variant. In essence, an inhomogeneous Dirichlet condition sets
2501 some of the nodal
values in the solution to given
values rather than
2502 determining them through the variational principles,
2504 u_h(\mathbf{x}) = \sum_{i\in \mathcal
N} \varphi_i(\mathbf{x}) u_i =
2505 \sum_{i\in \mathcal N \setminus \mathcal N_D} \varphi_i(\mathbf{x}) u_i +
2506 \sum_{i\in \mathcal N_D} \varphi_i(\mathbf{x}) g_i,
2508 where @f$u_i@f$ denotes the nodal
values of the solution and @f$\mathcal
N@f$ denotes
2509 the
set of all nodes. The
set @f$\mathcal N_D\subset \mathcal
N@f$ is the subset
2510 of the nodes that are subject to Dirichlet boundary conditions where the
2511 solution is forced to
equal @f$u_i = g_i = g(\mathbf{x}_i)@f$ as the interpolation
2512 of boundary
values on the Dirichlet-constrained node points @f$i\in \mathcal
2513 N_D@f$. We then
insert this solution
2514 representation into the weak form,
e.g. the Laplacian shown above, and move
2515 the known quantities to the right hand side:
2517 (\nabla \varphi_i, \nabla u_h)_\Omega &=& (\varphi_i, f)_\Omega \quad \Rightarrow \\
2518 \sum_{j\in \mathcal N \setminus \mathcal N_D}(\nabla \varphi_i,\nabla \varphi_j)_\Omega \, u_j &=&
2519 (\varphi_i, f)_\Omega
2520 -\sum_{j\in \mathcal N_D} (\nabla \varphi_i,\nabla\varphi_j)_\Omega\, g_j.
2522 In
this formula, the equations are tested
for all basis
functions @f$\varphi_i@f$
2523 with @f$i\in N \setminus \mathcal N_D@f$ that are not related to the nodes
2524 constrained by Dirichlet conditions.
2526 In the implementation in deal.II, the integrals @f$(\nabla \varphi_i,\nabla \varphi_j)_\Omega@f$
2527 on the right hand side are already contained in the local
matrix contributions
2528 we
assemble on each cell. When
using
2530 @ref step_6
"step-6" and @ref step_7
"step-7" tutorial programs, we can account
for the contribution of
2531 inhomogeneous constraints <i>j</i> by multiplying the columns <i>j</i> and
2532 rows <i>i</i> of the local
matrix according to the integrals @f$(\varphi_i,
2533 \varphi_j)_\Omega@f$ by the inhomogeneities and subtracting the resulting from
2534 the position <i>i</i> in the global right-hand-side vector, see also the @ref
2535 constraints module. In essence, we use some of the integrals that get
2536 eliminated from the left hand side of the equation to finalize the right hand
2537 side contribution. Similar mathematics are also involved when
first writing
2538 all entries into a left hand side
matrix and then eliminating
matrix rows and
2541 In principle, the components that belong to the constrained degrees of freedom
2542 could be eliminated from the linear system because they
do not carry any
2543 information. In practice, in deal.II we
always keep the size of the linear
2544 system the same to avoid handling two different numbering systems and avoid
2545 confusion about the two different index sets. In order to ensure that the
2546 linear system does not get singular when not adding anything to constrained
2547 rows, we then add dummy entries to the
matrix diagonal that are otherwise
2548 unrelated to the real entries.
2550 In a
matrix-
free method, we need to take a different approach, since the @p
2551 LaplaceOperator
class represents the
matrix-vector product of a
2552 <
b>homogeneous</
b> operator (the left-hand side of the last formula). It does
2556 constraints as long as it represents a <b>linear</b> operator.
2558 In our
matrix-
free code, the right hand side computation where the
2559 contribution of inhomogeneous conditions ends up is completely decoupled from
2560 the
matrix operator and handled by a different function above. Thus, we need
2561 to explicitly generate the data that enters the right hand side rather than
2562 using a byproduct of the
matrix assembly. Since we already know how to apply
2563 the operator on a vector, we could try to use those facilities for a vector
2567 std::map<types::global_dof_index, double> boundary_values;
2571 BoundaryValueFunction<dim>(),
2573 for (
const std::pair<const types::global_dof_index, double> &pair : boundary_values)
2574 if (solution.locally_owned_elements().is_element(pair.first))
2575 solution(pair.first) = pair.second;
2577 or, equivalently,
if we already had filled the inhomogeneous constraints into
2584 We could then pass the vector @p solution to the @p
2586 system_rhs vector that gets filled in the @p LaplaceProblem::assemble_rhs()
2587 function. However, this idea does not work because the
2589 homogeneous
values on all constraints (otherwise the operator would not be a
2590 linear operator but an
affine one). To also retrieve the
values of the
2591 inhomogeneities, we could select
one of two following strategies.
2593 <a name="UseFEEvaluationread_dof_values_plaintoavoidresolvingconstraints"></a><h5> Use
FEEvaluation::read_dof_values_plain() to avoid resolving constraints </h5>
2596 The class
FEEvaluation has a facility that addresses precisely this
2597 requirement: For non-homogeneous Dirichlet
values, we do want to skip the
2598 implicit imposition of homogeneous (Dirichlet) constraints upon reading the
2599 data from the vector @p solution. For example, we could extend the @p
2600 LaplaceProblem::assemble_rhs() function to deal with inhomogeneous Dirichlet
2601 values as follows, assuming the Dirichlet
values have been interpolated into
2602 the
object @p constraints:
2605 void LaplaceProblem<dim>::assemble_rhs()
2608 constraints.distribute(solution);
2609 solution.update_ghost_values();
2614 for (
unsigned int cell = 0;
2615 cell < system_matrix.get_matrix_free()->n_cell_batches();
2619 phi.read_dof_values_plain(solution);
2621 for (
unsigned int q = 0; q < phi.n_q_points; ++q)
2623 phi.submit_gradient(-coefficient(cell, q) * phi.get_gradient(q), q);
2624 phi.submit_value(make_vectorized_array<double>(1.0), q);
2627 phi.distribute_local_to_global(system_rhs);
2634 tentative solution vector by
FEEvaluation::read_dof_values_plain() that
2635 ignores all constraints. Due to this setup, we must make sure that other
2636 constraints,
e.g. by hanging nodes, are correctly distributed to the input
2637 vector already as they are not resolved as in
2638 FEEvaluation::read_dof_values_plain(). Inside the
loop, we then evaluate the
2639 Laplacian and repeat the
second derivative
call with
2640 FEEvaluation::submit_gradient() from the @p LaplaceOperator class, but with the
2641 sign switched since we wanted to subtract the contribution of Dirichlet
2642 conditions on the right hand side vector according to the formula above. When
2644 regarding the value slot and
first derivative slot to true to account for both
2645 terms added in the
loop over quadrature points. Once the right hand side is
2646 assembled, we then go on to solving the linear system for the homogeneous
2647 problem, say involving a variable @p solution_update. After solving, we can
2648 add @p solution_update to the @p solution vector that contains the final
2649 (inhomogeneous) solution.
2651 Note that the negative
sign for the Laplacian alongside with a positive
sign
2652 for the forcing that we needed to build the right hand side is a more
general
2653 concept: We have implemented
nothing else than Newton's method for nonlinear
2654 equations, but applied to a linear system. We have used an
initial guess for
2655 the variable @p solution in terms of the Dirichlet boundary conditions and
2656 computed a residual @f$r = f - Au_0@f$. The linear system was then solved as
2657 @f$\Delta u =
A^{-1} (f-Au)@f$ and we
finally computed @f$u = u_0 + \Delta u@f$. For a
2658 linear system, we obviously reach the exact solution after a single
2659 iteration. If we wanted to extend the code to a nonlinear problem, we would
2660 rename the @p assemble_rhs()
function into a more descriptive name like @p
2661 assemble_residual() that computes the (weak) form of the residual, whereas the
2662 @p LaplaceOperator::apply_add() function would get the linearization of the
2663 residual with respect to the solution variable.
2665 <a name="UseLaplaceOperatorwithasecondAffineConstraintsobjectwithoutDirichletconditions"></a><h5> Use LaplaceOperator with a
second AffineConstraints object without Dirichlet conditions </h5>
2668 A second alternative to get the right hand side that re-uses the @p
2669 LaplaceOperator::apply_add() function is to instead add a
second LaplaceOperator
2670 that skips Dirichlet constraints. To do this, we initialize a
second MatrixFree
2671 object which does not have any boundary value constraints. This @p matrix_free
2672 object is then passed to a @p LaplaceOperator class instance @p
2673 inhomogeneous_operator that is only used to create the right hand side:
2676 void LaplaceProblem<dim>::assemble_rhs()
2680 no_constraints.
close();
2681 LaplaceOperator<dim, degree_finite_element, double> inhomogeneous_operator;
2686 std::shared_ptr<MatrixFree<dim, double>> matrix_free(
2688 matrix_free->reinit(dof_handler,
2692 inhomogeneous_operator.initialize(matrix_free);
2695 constraints.distribute(solution);
2696 inhomogeneous_operator.evaluate_coefficient(Coefficient<dim>());
2697 inhomogeneous_operator.vmult(system_rhs, solution);
2701 *inhomogeneous_operator.get_matrix_free());
2702 for (
unsigned int cell = 0;
2703 cell < inhomogeneous_operator.get_matrix_free()->n_cell_batches();
2707 for (
unsigned int q = 0; q < phi.n_q_points; ++q)
2708 phi.submit_value(make_vectorized_array<double>(1.0), q);
2710 phi.distribute_local_to_global(system_rhs);
2716 A more sophisticated implementation of
this technique could reuse the original
2719 object. Doing
this would require making substantial modifications to the
2721 comes with the library can do this. See the discussion on blocks in
2725 <a name="PlainProg"></a>
2726 <h1> The plain program</h1>
2727 @include
"step-37.cc"
void reinit(const IndexSet &local_constraints=IndexSet())
void distribute_local_to_global(const InVector &local_vector, const std::vector< size_type > &local_dof_indices, OutVector &global_vector) const
void distribute(VectorType &vec) const
void add_lines(const std::vector< bool > &lines)
void attach_dof_handler(const DoFHandlerType &)
void add_data_vector(const VectorType &data, const std::vector< std::string > &names, const DataVectorType type=type_automatic, const std::vector< DataComponentInterpretation::DataComponentInterpretation > &data_component_interpretation=std::vector< DataComponentInterpretation::DataComponentInterpretation >())
virtual void build_patches(const unsigned int n_subdivisions=0)
value_type get_dof_value(const unsigned int dof) const
void read_dof_values(const VectorType &src, const unsigned int first_index=0)
void evaluate(const EvaluationFlags::EvaluationFlags evaluation_flag)
void initialize(const MGSmootherBase< VectorType > &coarse_smooth)
void resize(const unsigned int new_minlevel, const unsigned int new_maxlevel, Args &&... args)
void set_constrained_entries_to_one(VectorType &dst) const
void vmult_add(LinearAlgebra::distributed::Vector< double > &dst, const LinearAlgebra::distributed::Vector< double > &src) const
void vmult_interface_down(VectorType &dst, const VectorType &src) const
std::shared_ptr< DiagonalMatrix< VectorType > > inverse_diagonal_entries
void vmult(VectorType &dst, const VectorType &src) const
void initialize(const OperatorType &operator_in)
void Tvmult(VectorType &dst, const VectorType &src) const
unsigned int n_cell_batches() const
void initialize_dof_vector(VectorType &vec, const unsigned int dof_handler_index=0) const
void cell_loop(const std::function< void(const MatrixFree< dim, Number, VectorizedArrayType > &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &)> &cell_operation, OutVector &dst, const InVector &src, const bool zero_dst_vector=false) const
constexpr SymmetricTensor< 2, dim, Number > invert(const SymmetricTensor< 2, dim, Number > &t)
constexpr Number determinant(const SymmetricTensor< 2, dim, Number > &t)
#define DEAL_II_WITH_P4EST
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
__global__ void reduction(Number *result, const Number *v, const size_type N)
__global__ void set(Number *val, const Number s, const size_type N)
std::string write_vtu_with_pvtu_record(const std::string &directory, const std::string &filename_without_extension, const unsigned int counter, const MPI_Comm &mpi_communicator, const unsigned int n_digits_for_counter=numbers::invalid_unsigned_int, const unsigned int n_groups=0) const
ZlibCompressionLevel compression_level
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
void set_flags(const FlagType &flags)
static ::ExceptionBase & ExcMessage(std::string arg1)
void loop(ITERATOR begin, typename identity< ITERATOR >::type end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
Number local_element(const size_type local_index) const
size_type locally_owned_size() const
void reinit(const size_type size, const bool omit_zeroing_entries=false)
void make_hanging_node_constraints(const DoFHandler< dim, spacedim > &dof_handler, AffineConstraints< number > &constraints)
std::vector< value_type > split(const typename ::Triangulation< dim, spacedim >::cell_iterator &parent, const value_type parent_value)
std::vector< value_type > l2_norm(const typename ::Triangulation< dim, spacedim >::cell_iterator &parent, const value_type parent_value)
Expression sign(const Expression &x)
static const types::blas_int zero
@ matrix
Contents is actually a matrix.
@ eigenvalues
Eigenvalue vector is filled.
@ diagonal
Matrix is diagonal.
@ general
No special properties.
static const types::blas_int one
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > b(const Tensor< 2, dim, Number > &F)
void call(const std::function< RT()> &function, internal::return_value< RT > &ret_val)
VectorType::value_type * begin(VectorType &V)
VectorType::value_type * end(VectorType &V)
unsigned int this_mpi_process(const MPI_Comm &mpi_communicator)
T sum(const T &t, const MPI_Comm &mpi_communicator)
std::string compress(const std::string &input)
void run(const Iterator &begin, const typename identity< Iterator >::type &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
unsigned int n_cells(const internal::TriangulationImplementation::NumberCache< 1 > &c)
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
void assemble(const MeshWorker::DoFInfoBox< dim, DOFINFO > &dinfo, A *assembler)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
static const unsigned int invalid_unsigned_int
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation
TasksParallelScheme tasks_parallel_scheme
UpdateFlags mapping_update_flags