Reference documentation for deal.II version 9.3.2
\(\newcommand{\dealvcentcolon}{\mathrel{\mathop{:}}}\) \(\newcommand{\dealcoloneq}{\dealvcentcolon\mathrel{\mkern-1.2mu}=}\) \(\newcommand{\jump}[1]{\left[\!\left[ #1 \right]\!\right]}\) \(\newcommand{\average}[1]{\left\{\!\left\{ #1 \right\}\!\right\}}\)
step-76.h
Go to the documentation of this file.
1 true);
282  FEFaceEvaluation<dim, degree, degree + 1, 1, Number> phi_p(data, /*is_interior_face=*/false);
283 
284  for (unsigned int face = range.first; face < range.second; ++face)
285  {
286  phi_m.reinit(face);
287  phi_m.gather_evaluate(src, face_evaluation_flags);
288  phi_p.reinit(face);
289  phi_p.gather_evaluate(src, face_evaluation_flags);
290 
291  // some operations on the face quadrature points
292 
293  phi_m.integrate_scatter(face_evaluation_flags, dst);
294  phi_p.integrate_scatter(face_evaluation_flags, dst);
295  }
296  },
297  [&](const auto &data, auto &dst, const auto &src, const auto range) {
298  // operation performed boundary faces
299 
300  FEFaceEvaluation<dim, degree, degree + 1, 1, Number> phi_m(data, /*is_interior_face=*/true);
301 
302  for (unsigned int face = range.first; face < range.second; ++face)
303  {
304  phi_m.reinit(face);
305  phi_m.gather_evaluate(src, face_evaluation_flags);
306 
307  // some operations on the face quadrature points
308 
309  phi_m.integrate_scatter(face_evaluation_flags, dst);
310  }
311  },
312  dst,
313  src);
314 @endcode
315 
316 in the following way:
317 
318 @code
319 matrix_free.template loop_cell_centric<VectorType, VectorType>(
320  [&](const auto &data, auto &dst, const auto &src, const auto range) {
322  FEFaceEvaluation<dim, degree, degree + 1, 1, Number> phi_m(data, /*is_interior_face=*/true);
323  FEFaceEvaluation<dim, degree, degree + 1, 1, Number> phi_p(data, /*is_interior_face=*/false);
324 
325  for (unsigned int cell = range.first; cell < range.second; ++cell)
326  {
327  phi.reinit(cell);
328  phi.gather_evaluate(src, cell_evaluation_flags);
329 
330  // some operations on the cell quadrature points
331 
332  phi.integrate_scatter(cell_evaluation_flags, dst);
333 
334  // loop over all faces of cell
335  for (unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
336  {
337  if (data.get_faces_by_cells_boundary_id(cell, face)[0] ==
339  {
340  // internal face
341  phi_m.reinit(cell, face);
342  phi_m.gather_evaluate(src, face_evaluation_flags);
343  phi_p.reinit(cell, face);
344  phi_p.gather_evaluate(src, face_evaluation_flags);
345 
346  // some operations on the face quadrature points
347 
348  phi_m.integrate_scatter(face_evaluation_flags, dst);
349  }
350  else
351  {
352  // boundary face
353  phi_m.reinit(cell, face);
354  phi_m.gather_evaluate(src, face_evaluation_flags);
355 
356  // some operations on the face quadrature points
357 
358  phi_m.integrate_scatter(face_evaluation_flags, dst);
359  }
360  }
361  }
362  },
363  dst,
364  src);
365 @endcode
366 
367 It should be noted that FEFaceEvaluation is initialized now with two numbers,
368 the cell number and the local face number. The given example only
369 highlights how to transform face-centric loops into cell-centric loops and
370 is by no means efficient, since data is read and written multiple times
371 from and to the global vector as well as computations are performed
372 redundantly. Below, we will discuss advanced techniques that target these issues.
373 
374 To be able to use MatrixFree::loop_cell_centric(), following flags of MatrixFree::AdditionalData
375 have to be enabled:
376 
377 @code
378 typename MatrixFree<dim, Number>::AdditionalData additional_data;
379 
380 // set flags as usual (not shown)
381 
382 additional_data.hold_all_faces_to_owned_cells = true;
383 additional_data.mapping_update_flags_faces_by_cells =
384  additional_data.mapping_update_flags_inner_faces |
385  additional_data.mapping_update_flags_boundary_faces;
386 
387 data.reinit(mapping, dof_handler, constraint, quadrature, additional_data);
388 @endcode
389 
390 In particular, these flags enable that the internal data structures are set up
391 for all faces of the cells.
392 
393 Currently, cell-centric loops in deal.II only work for uniformly refined meshes
394 and if no constraints are applied (which is the standard case DG is normally
395 used).
396 
397 
398 <a name="ProvidinglambdastoMatrixFreeloops"></a><h3>Providing lambdas to MatrixFree loops</h3>
399 
400 
401 The examples given above have already used lambdas, which have been provided to
402 matrix-free loops. The following short examples present how to transform functions between
403 a version where a class and a pointer to one of its methods are used and a
404 variant where lambdas are utilized.
405 
406 In the following code, a class and a pointer to one of its methods, which should
407 be interpreted as cell integral, are passed to MatrixFree::loop():
408 
409 @code
410 void
411 local_apply_cell(const MatrixFree<dim, Number> & data,
412  VectorType & dst,
413  const VectorType & src,
414  const std::pair<unsigned int, unsigned int> &range) const
415 {
417  for (unsigned int cell = range.first; cell < range.second; ++cell)
418  {
419  phi.reinit(cell);
420  phi.gather_evaluate(src, cell_evaluation_flags);
421 
422  // some operations on the quadrature points
423 
424  phi.integrate_scatter(cell_evaluation_flags, dst);
425  }
426 }
427 @endcode
428 
429 @code
430 matrix_free.cell_loop(&Operator::local_apply_cell, this, dst, src);
431 @endcode
432 
433 However, it is also possible to pass an anonymous function via a lambda function
434 with the same result:
435 
436 @code
437 matrix_free.template cell_loop<VectorType, VectorType>(
438  [&](const auto &data, auto &dst, const auto &src, const auto range) {
440  for (unsigned int cell = range.first; cell < range.second; ++cell)
441  {
442  phi.reinit(cell);
443  phi.gather_evaluate(src, cell_evaluation_flags);
444 
445  // some operations on the quadrature points
446 
447  phi.integrate_scatter(cell_evaluation_flags, dst);
448  }
449  },
450  dst,
451  src);
452 @endcode
453 
454 <a name="VectorizedArrayType"></a><h3>VectorizedArrayType</h3>
455 
456 
457 The class VectorizedArray<Number> is a key component to achieve the high
458 node-level performance of the matrix-free algorithms in deal.II.
459 It is a wrapper class around a short vector of @f$n@f$ entries of type Number and
460 maps arithmetic operations to appropriate single-instruction/multiple-data
461 (SIMD) concepts by intrinsic functions. The length of the vector can be
462 queried by VectorizedArray::size() and its underlying number type by
464 
465 In the default case (<code>VectorizedArray<Number></code>), the vector length is
466 set at compile time of the library to
467 match the highest value supported by the given processor architecture.
468 However, also a second optional template argument can be
469 specified as <code>VectorizedArray<Number, size></code>, where <code>size</code> explicitly
470 controls the vector length within the capabilities of a particular instruction
471 set. A full list of supported vector lengths is presented in the following table:
472 
473 <table align="center" class="doxtable">
474  <tr>
475  <th>double</th>
476  <th>float</th>
477  <th>ISA</th>
478  </tr>
479  <tr>
480  <td><code>VectorizedArray<double, 1></code></td>
481  <td><code>VectorizedArray<float, 1></code></td>
482  <td>(auto-vectorization)</td>
483  </tr>
484  <tr>
485  <td><code>VectorizedArray<double, 2></code></td>
486  <td><code>VectorizedArray<float, 4></code></td>
487  <td>SSE2/AltiVec</td>
488  </tr>
489  <tr>
490  <td><code>VectorizedArray<double, 4></code></td>
491  <td><code>VectorizedArray<float, 8></code></td>
492  <td>AVX/AVX2</td>
493  </tr>
494  <tr>
495  <td><code>VectorizedArray<double, 8></code></td>
496  <td><code>VectorizedArray<float, 16></code></td>
497  <td>AVX-512</td>
498  </tr>
499 </table>
500 
501 This allows users to select the vector length/ISA and, as a consequence, the
502 number of cells to be processed at once in matrix-free operator evaluations,
503 possibly reducing the pressure on the caches, an severe issue for very high
504 degrees (and dimensions).
505 
506 A possible further reason to reduce the number of filled lanes
507 is to simplify debugging: instead of having to look at, e.g., 8
508 cells, one can concentrate on a single cell.
509 
510 The interface of VectorizedArray also enables the replacement by any type with
511 a matching interface. Specifically, this prepares deal.II for the <code>std::simd</code>
512 class that is planned to become part of the C++23 standard. The following table
513 compares the deal.II-specific SIMD classes and the equivalent C++23 classes:
514 
515 
516 <table align="center" class="doxtable">
517  <tr>
518  <th>VectorizedArray (deal.II)</th>
519  <th>std::simd (C++23)</th>
520  </tr>
521  <tr>
522  <td><code>VectorizedArray<Number></code></td>
523  <td><code>std::experimental::native_simd<Number></code></td>
524  </tr>
525  <tr>
526  <td><code>VectorizedArray<Number, size></code></td>
527  <td><code>std::experimental::fixed_size_simd<Number, size></code></td>
528  </tr>
529 </table>
530  *
531  *
532  * <a name="CommProg"></a>
533  * <h1> The commented program</h1>
534  *
535  *
536  * <a name="Parametersandutilityfunctions"></a>
537  * <h3>Parameters and utility functions</h3>
538  *
539 
540  *
541  * The same includes as in @ref step_67 "step-67":
542  *
543  * @code
544  * #include <deal.II/base/conditional_ostream.h>
545  * #include <deal.II/base/function.h>
546  * #include <deal.II/base/logstream.h>
547  * #include <deal.II/base/time_stepping.h>
548  * #include <deal.II/base/timer.h>
549  * #include <deal.II/base/utilities.h>
550  * #include <deal.II/base/vectorization.h>
551  *
552  * #include <deal.II/distributed/tria.h>
553  *
554  * #include <deal.II/dofs/dof_handler.h>
555  *
556  * #include <deal.II/fe/fe_dgq.h>
557  * #include <deal.II/fe/fe_system.h>
558  *
559  * #include <deal.II/grid/grid_generator.h>
560  * #include <deal.II/grid/tria.h>
561  * #include <deal.II/grid/tria_accessor.h>
562  * #include <deal.II/grid/tria_iterator.h>
563  *
564  * #include <deal.II/lac/affine_constraints.h>
565  * #include <deal.II/lac/la_parallel_vector.h>
566  *
567  * #include <deal.II/matrix_free/fe_evaluation.h>
568  * #include <deal.II/matrix_free/matrix_free.h>
569  * #include <deal.II/matrix_free/operators.h>
570  *
571  * #include <deal.II/numerics/data_out.h>
572  *
573  * #include <fstream>
574  * #include <iomanip>
575  * #include <iostream>
576  *
577  * @endcode
578  *
579  * A new include for categorizing of cells according to their boundary IDs:
580  *
581  * @code
582  * #include <deal.II/matrix_free/tools.h>
583  *
584  *
585  *
586  * namespace Euler_DG
587  * {
588  * using namespace dealii;
589  *
590  * @endcode
591  *
592  * The same input parameters as in @ref step_67 "step-67":
593  *
594  * @code
595  * constexpr unsigned int testcase = 1;
596  * constexpr unsigned int dimension = 2;
597  * constexpr unsigned int n_global_refinements = 2;
598  * constexpr unsigned int fe_degree = 5;
599  * constexpr unsigned int n_q_points_1d = fe_degree + 2;
600  *
601  * @endcode
602  *
603  * This parameter specifies the size of the shared-memory group. Currently,
604  * only the values 1 and numbers::invalid_unsigned_int is possible, leading
605  * to the options that the memory features can be turned off or all processes
606  * having access to the same shared-memory domain are grouped together.
607  *
608  * @code
609  * constexpr unsigned int group_size = numbers::invalid_unsigned_int;
610  *
611  * using Number = double;
612  *
613  * @endcode
614  *
615  * Here, the type of the data structure is chosen for vectorization. In the
616  * default case, VectorizedArray<Number> is used, i.e., the highest
617  * instruction-set-architecture extension available on the given hardware with
618  * the maximum number of vector lanes is used. However, one might reduce
619  * the number of filled lanes, e.g., by writing
620  * <code>using VectorizedArrayType = VectorizedArray<Number, 4></code> to only
621  * process 4 cells.
622  *
623  * @code
624  * using VectorizedArrayType = VectorizedArray<Number>;
625  *
626  * @endcode
627  *
628  * The following parameters have not changed:
629  *
630  * @code
631  * constexpr double gamma = 1.4;
632  * constexpr double final_time = testcase == 0 ? 10 : 2.0;
633  * constexpr double output_tick = testcase == 0 ? 1 : 0.05;
634  *
635  * const double courant_number = 0.15 / std::pow(fe_degree, 1.5);
636  *
637  * @endcode
638  *
639  * Specify max number of time steps useful for performance studies.
640  *
641  * @code
642  * constexpr unsigned int max_time_steps = numbers::invalid_unsigned_int;
643  *
644  * @endcode
645  *
646  * Runge-Kutta-related functions copied from @ref step_67 "step-67" and slightly modified
647  * with the purpose to minimize global vector access:
648  *
649  * @code
650  * enum LowStorageRungeKuttaScheme
651  * {
652  * stage_3_order_3,
653  * stage_5_order_4,
654  * stage_7_order_4,
655  * stage_9_order_5,
656  * };
657  * constexpr LowStorageRungeKuttaScheme lsrk_scheme = stage_5_order_4;
658  *
659  *
660  *
661  * class LowStorageRungeKuttaIntegrator
662  * {
663  * public:
664  * LowStorageRungeKuttaIntegrator(const LowStorageRungeKuttaScheme scheme)
665  * {
667  * switch (scheme)
668  * {
669  * case stage_3_order_3:
671  * break;
672  * case stage_5_order_4:
674  * break;
675  * case stage_7_order_4:
677  * break;
678  * case stage_9_order_5:
680  * break;
681  *
682  * default:
683  * AssertThrow(false, ExcNotImplemented());
684  * }
687  * rk_integrator(lsrk);
688  * std::vector<double> ci; // not used
689  * rk_integrator.get_coefficients(ai, bi, ci);
690  * }
691  *
692  * unsigned int n_stages() const
693  * {
694  * return bi.size();
695  * }
696  *
697  * template <typename VectorType, typename Operator>
698  * void perform_time_step(const Operator &pde_operator,
699  * const double current_time,
700  * const double time_step,
701  * VectorType & solution,
702  * VectorType & vec_ri,
703  * VectorType & vec_ki) const
704  * {
705  * AssertDimension(ai.size() + 1, bi.size());
706  *
707  * vec_ki.swap(solution);
708  *
709  * double sum_previous_bi = 0;
710  * for (unsigned int stage = 0; stage < bi.size(); ++stage)
711  * {
712  * const double c_i = stage == 0 ? 0 : sum_previous_bi + ai[stage - 1];
713  *
714  * pde_operator.perform_stage(stage,
715  * current_time + c_i * time_step,
716  * bi[stage] * time_step,
717  * (stage == bi.size() - 1 ?
718  * 0 :
719  * ai[stage] * time_step),
720  * (stage % 2 == 0 ? vec_ki : vec_ri),
721  * (stage % 2 == 0 ? vec_ri : vec_ki),
722  * solution);
723  *
724  * if (stage > 0)
725  * sum_previous_bi += bi[stage - 1];
726  * }
727  * }
728  *
729  * private:
730  * std::vector<double> bi;
731  * std::vector<double> ai;
732  * };
733  *
734  *
735  * @endcode
736  *
737  * Euler-specific utility functions from @ref step_67 "step-67":
738  *
739  * @code
740  * enum EulerNumericalFlux
741  * {
742  * lax_friedrichs_modified,
743  * harten_lax_vanleer,
744  * };
745  * constexpr EulerNumericalFlux numerical_flux_type = lax_friedrichs_modified;
746  *
747  *
748  *
749  * template <int dim>
750  * class ExactSolution : public Function<dim>
751  * {
752  * public:
753  * ExactSolution(const double time)
754  * : Function<dim>(dim + 2, time)
755  * {}
756  *
757  * virtual double value(const Point<dim> & p,
758  * const unsigned int component = 0) const override;
759  * };
760  *
761  *
762  *
763  * template <int dim>
764  * double ExactSolution<dim>::value(const Point<dim> & x,
765  * const unsigned int component) const
766  * {
767  * const double t = this->get_time();
768  *
769  * switch (testcase)
770  * {
771  * case 0:
772  * {
773  * Assert(dim == 2, ExcNotImplemented());
774  * const double beta = 5;
775  *
776  * Point<dim> x0;
777  * x0[0] = 5.;
778  * const double radius_sqr =
779  * (x - x0).norm_square() - 2. * (x[0] - x0[0]) * t + t * t;
780  * const double factor =
781  * beta / (numbers::PI * 2) * std::exp(1. - radius_sqr);
782  * const double density_log = std::log2(
783  * std::abs(1. - (gamma - 1.) / gamma * 0.25 * factor * factor));
784  * const double density = std::exp2(density_log * (1. / (gamma - 1.)));
785  * const double u = 1. - factor * (x[1] - x0[1]);
786  * const double v = factor * (x[0] - t - x0[0]);
787  *
788  * if (component == 0)
789  * return density;
790  * else if (component == 1)
791  * return density * u;
792  * else if (component == 2)
793  * return density * v;
794  * else
795  * {
796  * const double pressure =
797  * std::exp2(density_log * (gamma / (gamma - 1.)));
798  * return pressure / (gamma - 1.) +
799  * 0.5 * (density * u * u + density * v * v);
800  * }
801  * }
802  *
803  * case 1:
804  * {
805  * if (component == 0)
806  * return 1.;
807  * else if (component == 1)
808  * return 0.4;
809  * else if (component == dim + 1)
810  * return 3.097857142857143;
811  * else
812  * return 0.;
813  * }
814  *
815  * default:
816  * Assert(false, ExcNotImplemented());
817  * return 0.;
818  * }
819  * }
820  *
821  *
822  *
823  * template <int dim, typename Number>
824  * inline DEAL_II_ALWAYS_INLINE
826  * euler_velocity(const Tensor<1, dim + 2, Number> &conserved_variables)
827  * {
828  * const Number inverse_density = Number(1.) / conserved_variables[0];
829  *
830  * Tensor<1, dim, Number> velocity;
831  * for (unsigned int d = 0; d < dim; ++d)
832  * velocity[d] = conserved_variables[1 + d] * inverse_density;
833  *
834  * return velocity;
835  * }
836  *
837  * template <int dim, typename Number>
838  * inline DEAL_II_ALWAYS_INLINE
839  * Number
840  * euler_pressure(const Tensor<1, dim + 2, Number> &conserved_variables)
841  * {
842  * const Tensor<1, dim, Number> velocity =
843  * euler_velocity<dim>(conserved_variables);
844  *
845  * Number rho_u_dot_u = conserved_variables[1] * velocity[0];
846  * for (unsigned int d = 1; d < dim; ++d)
847  * rho_u_dot_u += conserved_variables[1 + d] * velocity[d];
848  *
849  * return (gamma - 1.) * (conserved_variables[dim + 1] - 0.5 * rho_u_dot_u);
850  * }
851  *
852  * template <int dim, typename Number>
853  * inline DEAL_II_ALWAYS_INLINE
855  * euler_flux(const Tensor<1, dim + 2, Number> &conserved_variables)
856  * {
857  * const Tensor<1, dim, Number> velocity =
858  * euler_velocity<dim>(conserved_variables);
859  * const Number pressure = euler_pressure<dim>(conserved_variables);
860  *
862  * for (unsigned int d = 0; d < dim; ++d)
863  * {
864  * flux[0][d] = conserved_variables[1 + d];
865  * for (unsigned int e = 0; e < dim; ++e)
866  * flux[e + 1][d] = conserved_variables[e + 1] * velocity[d];
867  * flux[d + 1][d] += pressure;
868  * flux[dim + 1][d] =
869  * velocity[d] * (conserved_variables[dim + 1] + pressure);
870  * }
871  *
872  * return flux;
873  * }
874  *
875  * template <int n_components, int dim, typename Number>
876  * inline DEAL_II_ALWAYS_INLINE
878  * operator*(const Tensor<1, n_components, Tensor<1, dim, Number>> &matrix,
879  * const Tensor<1, dim, Number> & vector)
880  * {
882  * for (unsigned int d = 0; d < n_components; ++d)
883  * result[d] = matrix[d] * vector;
884  * return result;
885  * }
886  *
887  * template <int dim, typename Number>
888  * inline DEAL_II_ALWAYS_INLINE
890  * euler_numerical_flux(const Tensor<1, dim + 2, Number> &u_m,
891  * const Tensor<1, dim + 2, Number> &u_p,
892  * const Tensor<1, dim, Number> & normal)
893  * {
894  * const auto velocity_m = euler_velocity<dim>(u_m);
895  * const auto velocity_p = euler_velocity<dim>(u_p);
896  *
897  * const auto pressure_m = euler_pressure<dim>(u_m);
898  * const auto pressure_p = euler_pressure<dim>(u_p);
899  *
900  * const auto flux_m = euler_flux<dim>(u_m);
901  * const auto flux_p = euler_flux<dim>(u_p);
902  *
903  * switch (numerical_flux_type)
904  * {
905  * case lax_friedrichs_modified:
906  * {
907  * const auto lambda =
908  * 0.5 * std::sqrt(std::max(velocity_p.norm_square() +
909  * gamma * pressure_p * (1. / u_p[0]),
910  * velocity_m.norm_square() +
911  * gamma * pressure_m * (1. / u_m[0])));
912  *
913  * return 0.5 * (flux_m * normal + flux_p * normal) +
914  * 0.5 * lambda * (u_m - u_p);
915  * }
916  *
917  * case harten_lax_vanleer:
918  * {
919  * const auto avg_velocity_normal =
920  * 0.5 * ((velocity_m + velocity_p) * normal);
921  * const auto avg_c = std::sqrt(std::abs(
922  * 0.5 * gamma *
923  * (pressure_p * (1. / u_p[0]) + pressure_m * (1. / u_m[0]))));
924  * const Number s_pos =
925  * std::max(Number(), avg_velocity_normal + avg_c);
926  * const Number s_neg =
927  * std::min(Number(), avg_velocity_normal - avg_c);
928  * const Number inverse_s = Number(1.) / (s_pos - s_neg);
929  *
930  * return inverse_s *
931  * ((s_pos * (flux_m * normal) - s_neg * (flux_p * normal)) -
932  * s_pos * s_neg * (u_m - u_p));
933  * }
934  *
935  * default:
936  * {
937  * Assert(false, ExcNotImplemented());
938  * return {};
939  * }
940  * }
941  * }
942  *
943  *
944  *
945  * @endcode
946  *
947  * General-purpose utility functions from @ref step_67 "step-67":
948  *
949  * @code
950  * template <int dim, typename VectorizedArrayType>
951  * VectorizedArrayType
952  * evaluate_function(const Function<dim> & function,
953  * const Point<dim, VectorizedArrayType> &p_vectorized,
954  * const unsigned int component)
955  * {
956  * VectorizedArrayType result;
957  * for (unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
958  * {
959  * Point<dim> p;
960  * for (unsigned int d = 0; d < dim; ++d)
961  * p[d] = p_vectorized[d][v];
962  * result[v] = function.value(p, component);
963  * }
964  * return result;
965  * }
966  *
967  *
968  * template <int dim, typename VectorizedArrayType, int n_components = dim + 2>
970  * evaluate_function(const Function<dim> & function,
971  * const Point<dim, VectorizedArrayType> &p_vectorized)
972  * {
973  * AssertDimension(function.n_components, n_components);
975  * for (unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
976  * {
977  * Point<dim> p;
978  * for (unsigned int d = 0; d < dim; ++d)
979  * p[d] = p_vectorized[d][v];
980  * for (unsigned int d = 0; d < n_components; ++d)
981  * result[d][v] = function.value(p, d);
982  * }
983  * return result;
984  * }
985  *
986  *
987  * @endcode
988  *
989  *
990  * <a name="EuleroperatorusingacellcentricloopandMPI30sharedmemory"></a>
991  * <h3>Euler operator using a cell-centric loop and MPI-3.0 shared memory</h3>
992  *
993 
994  *
995  * Euler operator from @ref step_67 "step-67" with some changes as detailed below:
996  *
997  * @code
998  * template <int dim, int degree, int n_points_1d>
999  * class EulerOperator
1000  * {
1001  * public:
1002  * static constexpr unsigned int n_quadrature_points_1d = n_points_1d;
1003  *
1004  * EulerOperator(TimerOutput &timer_output);
1005  *
1006  * ~EulerOperator();
1007  *
1008  * void reinit(const Mapping<dim> & mapping,
1009  * const DoFHandler<dim> &dof_handler);
1010  *
1011  * void set_inflow_boundary(const types::boundary_id boundary_id,
1012  * std::unique_ptr<Function<dim>> inflow_function);
1013  *
1014  * void set_subsonic_outflow_boundary(
1016  * std::unique_ptr<Function<dim>> outflow_energy);
1017  *
1018  * void set_wall_boundary(const types::boundary_id boundary_id);
1019  *
1020  * void set_body_force(std::unique_ptr<Function<dim>> body_force);
1021  *
1022  * void
1023  * perform_stage(const unsigned int stage,
1024  * const Number cur_time,
1025  * const Number bi,
1026  * const Number ai,
1027  * const LinearAlgebra::distributed::Vector<Number> &current_ri,
1029  * LinearAlgebra::distributed::Vector<Number> &solution) const;
1030  *
1031  * void project(const Function<dim> & function,
1032  * LinearAlgebra::distributed::Vector<Number> &solution) const;
1033  *
1034  * std::array<double, 3> compute_errors(
1035  * const Function<dim> & function,
1036  * const LinearAlgebra::distributed::Vector<Number> &solution) const;
1037  *
1038  * double compute_cell_transport_speed(
1039  * const LinearAlgebra::distributed::Vector<Number> &solution) const;
1040  *
1041  * void
1042  * initialize_vector(LinearAlgebra::distributed::Vector<Number> &vector) const;
1043  *
1044  * private:
1045  * @endcode
1046  *
1047  * Instance of SubCommunicatorWrapper containing the sub-communicator, which
1048  * we need to pass to MatrixFree::reinit() to be able to exploit MPI-3.0
1049  * shared-memory capabilities:
1050  *
1051  * @code
1052  * MPI_Comm subcommunicator;
1053  *
1054  * MatrixFree<dim, Number, VectorizedArrayType> data;
1055  *
1056  * TimerOutput &timer;
1057  *
1058  * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1059  * inflow_boundaries;
1060  * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1061  * subsonic_outflow_boundaries;
1062  * std::set<types::boundary_id> wall_boundaries;
1063  * std::unique_ptr<Function<dim>> body_force;
1064  * };
1065  *
1066  *
1067  *
1068  * @endcode
1069  *
1070  * New constructor, which creates a sub-communicator. The user can specify
1071  * the size of the sub-communicator via the global parameter group_size. If
1072  * the size is set to -1, all MPI processes of a
1073  * shared-memory domain are combined to a group. The specified size is
1074  * decisive for the benefit of the shared-memory capabilities of MatrixFree
1075  * and, therefore, setting the <code>size</code> to <code>-1</code> is a
1076  * reasonable choice. By setting, the size to <code>1</code> users explicitly
1077  * disable the MPI-3.0 shared-memory features of MatrixFree and rely
1078  * completely on MPI-2.0 features, like <code>MPI_Isend</code> and
1079  * <code>MPI_Irecv</code>.
1080  *
1081  * @code
1082  * template <int dim, int degree, int n_points_1d>
1083  * EulerOperator<dim, degree, n_points_1d>::EulerOperator(TimerOutput &timer)
1084  * : timer(timer)
1085  * {
1086  * #if DEAL_II_MPI_VERSION_GTE(3, 0)
1087  * if (group_size == 1)
1088  * {
1089  * this->subcommunicator = MPI_COMM_SELF;
1090  * }
1091  * else if (group_size == numbers::invalid_unsigned_int)
1092  * {
1093  * const auto rank = Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
1094  *
1095  * MPI_Comm_split_type(MPI_COMM_WORLD,
1096  * MPI_COMM_TYPE_SHARED,
1097  * rank,
1098  * MPI_INFO_NULL,
1099  * &subcommunicator);
1100  * }
1101  * else
1102  * {
1103  * Assert(false, ExcNotImplemented());
1104  * }
1105  * #else
1106  * (void)subcommunicator;
1107  * (void)group_size;
1108  * this->subcommunicator = MPI_COMM_SELF;
1109  * #endif
1110  * }
1111  *
1112  *
1113  * @endcode
1114  *
1115  * New destructor responsible for freeing of the sub-communicator.
1116  *
1117  * @code
1118  * template <int dim, int degree, int n_points_1d>
1119  * EulerOperator<dim, degree, n_points_1d>::~EulerOperator()
1120  * {
1121  * #ifdef DEAL_II_WITH_MPI
1122  * if (this->subcommunicator != MPI_COMM_SELF)
1123  * MPI_Comm_free(&subcommunicator);
1124  * #endif
1125  * }
1126  *
1127  *
1128  * @endcode
1129  *
1130  * Modified reinit() function to setup the internal data structures in
1131  * MatrixFree in a way that it is usable by the cell-centric loops and
1132  * the MPI-3.0 shared-memory capabilities are used:
1133  *
1134  * @code
1135  * template <int dim, int degree, int n_points_1d>
1136  * void EulerOperator<dim, degree, n_points_1d>::reinit(
1137  * const Mapping<dim> & mapping,
1138  * const DoFHandler<dim> &dof_handler)
1139  * {
1140  * const std::vector<const DoFHandler<dim> *> dof_handlers = {&dof_handler};
1141  * const AffineConstraints<double> dummy;
1142  * const std::vector<const AffineConstraints<double> *> constraints = {&dummy};
1143  * const std::vector<Quadrature<1>> quadratures = {QGauss<1>(n_q_points_1d),
1144  * QGauss<1>(fe_degree + 1)};
1145  *
1147  * additional_data;
1148  * additional_data.mapping_update_flags =
1150  * update_values);
1151  * additional_data.mapping_update_flags_inner_faces =
1153  * update_values);
1154  * additional_data.mapping_update_flags_boundary_faces =
1156  * update_values);
1157  * additional_data.tasks_parallel_scheme =
1159  *
1160  * @endcode
1161  *
1162  * Categorize cells so that all lanes have the same boundary IDs for each
1163  * face. This is strictly not necessary, however, allows to write simpler
1164  * code in EulerOperator::perform_stage() without masking, since it is
1165  * guaranteed that all cells grouped together (in a VectorizedArray)
1166  * have to perform exactly the same operation also on the faces.
1167  *
1168  * @code
1169  * MatrixFreeTools::categorize_by_boundary_ids(dof_handler.get_triangulation(),
1170  * additional_data);
1171  *
1172  * @endcode
1173  *
1174  * Enable MPI-3.0 shared-memory capabilities within MatrixFree by providing
1175  * the sub-communicator:
1176  *
1177  * @code
1178  * additional_data.communicator_sm = subcommunicator;
1179  *
1180  * data.reinit(
1181  * mapping, dof_handlers, constraints, quadratures, additional_data);
1182  * }
1183  *
1184  *
1185  * @endcode
1186  *
1187  * The following function does an entire stage of a Runge--Kutta update and is
1188  * - alongside the slightly modified setup - the heart of this tutorial
1189  * compared to @ref step_67 "step-67".
1190  *
1191 
1192  *
1193  * In contrast to @ref step_67 "step-67", we are not executing the advection step
1194  * (using MatrixFree::loop()) and the inverse mass-matrix step
1195  * (using MatrixFree::cell_loop()) in sequence, but evaluate everything in
1196  * one go inside of MatrixFree::loop_cell_centric(). This function expects
1197  * a single function that is executed on each locally-owned (macro) cell as
1198  * parameter so that we need to loop over all faces of that cell and perform
1199  * needed integration steps on our own.
1200  *
1201 
1202  *
1203  * The following function contains to a large extent copies of the following
1204  * functions from @ref step_67 "step-67" so that comments related the evaluation of the weak
1205  * form are skipped here:
1206  * - <code>EulerDG::EulerOperator::local_apply_cell</code>
1207  * - <code>EulerDG::EulerOperator::local_apply_face</code>
1208  * - <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1209  * - <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix</code>
1210  *
1211  * @code
1212  * template <int dim, int degree, int n_points_1d>
1213  * void EulerOperator<dim, degree, n_points_1d>::perform_stage(
1214  * const unsigned int stage,
1215  * const Number current_time,
1216  * const Number bi,
1217  * const Number ai,
1218  * const LinearAlgebra::distributed::Vector<Number> &current_ri,
1219  * LinearAlgebra::distributed::Vector<Number> & vec_ki,
1220  * LinearAlgebra::distributed::Vector<Number> & solution) const
1221  * {
1222  * for (auto &i : inflow_boundaries)
1223  * i.second->set_time(current_time);
1224  * for (auto &i : subsonic_outflow_boundaries)
1225  * i.second->set_time(current_time);
1226  *
1227  * @endcode
1228  *
1229  * Run a cell-centric loop by calling MatrixFree::loop_cell_centric() and
1230  * providing a lambda containing the effects of the cell, face and
1231  * boundary-face integrals:
1232  *
1233  * @code
1234  * data.template loop_cell_centric<LinearAlgebra::distributed::Vector<Number>,
1235  * LinearAlgebra::distributed::Vector<Number>>(
1236  * [&](const auto &data, auto &dst, const auto &src, const auto cell_range) {
1237  * using FECellIntegral = FEEvaluation<dim,
1238  * degree,
1239  * n_points_1d,
1240  * dim + 2,
1241  * Number,
1242  * VectorizedArrayType>;
1243  * using FEFaceIntegral = FEFaceEvaluation<dim,
1244  * degree,
1245  * n_points_1d,
1246  * dim + 2,
1247  * Number,
1248  * VectorizedArrayType>;
1249  *
1250  * FECellIntegral phi(data);
1251  * FECellIntegral phi_temp(data);
1252  * FEFaceIntegral phi_m(data, true);
1253  * FEFaceIntegral phi_p(data, false);
1254  *
1255  * Tensor<1, dim, VectorizedArrayType> constant_body_force;
1256  * const Functions::ConstantFunction<dim> *constant_function =
1257  * dynamic_cast<Functions::ConstantFunction<dim> *>(body_force.get());
1258  *
1259  * if (constant_function)
1260  * constant_body_force =
1261  * evaluate_function<dim, VectorizedArrayType, dim>(
1262  * *constant_function, Point<dim, VectorizedArrayType>());
1263  *
1264  * const ::internal::EvaluatorTensorProduct<
1266  * dim,
1267  * n_points_1d,
1268  * n_points_1d,
1269  * VectorizedArrayType>
1271  * data.get_shape_info().data[0].shape_gradients_collocation_eo,
1273  *
1274  * AlignedVector<VectorizedArrayType> buffer(phi.static_n_q_points *
1275  * phi.n_components);
1276  *
1277  * @endcode
1278  *
1279  * Loop over all cell batches:
1280  *
1281  * @code
1282  * for (unsigned int cell = cell_range.first; cell < cell_range.second;
1283  * ++cell)
1284  * {
1285  * phi.reinit(cell);
1286  *
1287  * if (ai != Number())
1288  * phi_temp.reinit(cell);
1289  *
1290  * @endcode
1291  *
1292  * Read values from global vector and compute the values at the
1293  * quadrature points:
1294  *
1295  * @code
1296  * if (ai != Number() && stage == 0)
1297  * {
1298  * phi.read_dof_values(src);
1299  *
1300  * for (unsigned int i = 0;
1301  * i < phi.static_dofs_per_component * (dim + 2);
1302  * ++i)
1303  * phi_temp.begin_dof_values()[i] = phi.begin_dof_values()[i];
1304  *
1305  * phi.evaluate(EvaluationFlags::values);
1306  * }
1307  * else
1308  * {
1309  * phi.gather_evaluate(src, EvaluationFlags::values);
1310  * }
1311  *
1312  * @endcode
1313  *
1314  * Buffer the computed values at the quadrature points, since
1315  * these are overridden by FEEvaluation::submit_value() in the next
1316  * step, however, are needed later on for the face integrals:
1317  *
1318  * @code
1319  * for (unsigned int i = 0; i < phi.static_n_q_points * (dim + 2); ++i)
1320  * buffer[i] = phi.begin_values()[i];
1321  *
1322  * @endcode
1323  *
1324  * Apply the cell integral at the cell quadrature points. See also
1325  * the function <code>EulerOperator::local_apply_cell()</code> from
1326  * @ref step_67 "step-67":
1327  *
1328  * @code
1329  * for (unsigned int q = 0; q < phi.n_q_points; ++q)
1330  * {
1331  * const auto w_q = phi.get_value(q);
1332  * phi.submit_gradient(euler_flux<dim>(w_q), q);
1333  * if (body_force.get() != nullptr)
1334  * {
1335  * const Tensor<1, dim, VectorizedArrayType> force =
1336  * constant_function ?
1337  * constant_body_force :
1338  * evaluate_function<dim, VectorizedArrayType, dim>(
1339  * *body_force, phi.quadrature_point(q));
1340  *
1342  * for (unsigned int d = 0; d < dim; ++d)
1343  * forcing[d + 1] = w_q[0] * force[d];
1344  * for (unsigned int d = 0; d < dim; ++d)
1345  * forcing[dim + 1] += force[d] * w_q[d + 1];
1346  *
1347  * phi.submit_value(forcing, q);
1348  * }
1349  * }
1350  *
1351  * @endcode
1352  *
1353  * Test with the gradient of the test functions in the quadrature
1354  * points. We skip the interpolation back to the support points
1355  * of the element, since we first collect all contributions in the
1356  * cell quadrature points and only perform the interpolation back
1357  * as the final step.
1358  *
1359  * @code
1360  * {
1361  * auto *values_ptr = phi.begin_values();
1362  * auto *gradient_ptr = phi.begin_gradients();
1363  *
1364  * for (unsigned int c = 0; c < dim + 2; ++c)
1365  * {
1366  * if (dim >= 1 && body_force.get() == nullptr)
1367  * eval.template gradients<0, false, false>(
1368  * gradient_ptr + phi.static_n_q_points * 0, values_ptr);
1369  * else if (dim >= 1)
1370  * eval.template gradients<0, false, true>(
1371  * gradient_ptr + phi.static_n_q_points * 0, values_ptr);
1372  * if (dim >= 2)
1373  * eval.template gradients<1, false, true>(
1374  * gradient_ptr + phi.static_n_q_points * 1, values_ptr);
1375  * if (dim >= 3)
1376  * eval.template gradients<2, false, true>(
1377  * gradient_ptr + phi.static_n_q_points * 2, values_ptr);
1378  *
1379  * values_ptr += phi.static_n_q_points;
1380  * gradient_ptr += phi.static_n_q_points * dim;
1381  * }
1382  * }
1383  *
1384  * @endcode
1385  *
1386  * Loop over all faces of the current cell:
1387  *
1388  * @code
1389  * for (unsigned int face = 0;
1390  * face < GeometryInfo<dim>::faces_per_cell;
1391  * ++face)
1392  * {
1393  * @endcode
1394  *
1395  * Determine the boundary ID of the current face. Since we have
1396  * set up MatrixFree in a way that all filled lanes have
1397  * guaranteed the same boundary ID, we can select the
1398  * boundary ID of the first lane.
1399  *
1400  * @code
1401  * const auto boundary_ids =
1402  * data.get_faces_by_cells_boundary_id(cell, face);
1403  *
1404  * Assert(std::equal(boundary_ids.begin(),
1405  * boundary_ids.begin() +
1406  * data.n_active_entries_per_cell_batch(cell),
1407  * boundary_ids.begin()),
1408  * ExcMessage("Boundary IDs of lanes differ."));
1409  *
1410  * const auto boundary_id = boundary_ids[0];
1411  *
1412  * phi_m.reinit(cell, face);
1413  *
1414  * @endcode
1415  *
1416  * Interpolate the values from the cell quadrature points to the
1417  * quadrature points of the current face via a simple 1D
1418  * interpolation:
1419  *
1420  * @code
1422  * n_points_1d - 1,
1423  * VectorizedArrayType>::
1424  * template interpolate_quadrature<true, false>(
1425  * dim + 2,
1426  * data.get_shape_info(),
1427  * buffer.data(),
1428  * phi_m.begin_values(),
1429  * false,
1430  * face);
1431  *
1432  * @endcode
1433  *
1434  * Check if the face is an internal or a boundary face and
1435  * select a different code path based on this information:
1436  *
1437  * @code
1439  * {
1440  * @endcode
1441  *
1442  * Process and internal face. The following lines of code
1443  * are a copy of the function
1444  * <code>EulerDG::EulerOperator::local_apply_face</code>
1445  * from @ref step_67 "step-67":
1446  *
1447  * @code
1448  * phi_p.reinit(cell, face);
1449  * phi_p.gather_evaluate(src, EvaluationFlags::values);
1450  *
1451  * for (unsigned int q = 0; q < phi_m.n_q_points; ++q)
1452  * {
1453  * const auto numerical_flux =
1454  * euler_numerical_flux<dim>(phi_m.get_value(q),
1455  * phi_p.get_value(q),
1456  * phi_m.get_normal_vector(q));
1457  * phi_m.submit_value(-numerical_flux, q);
1458  * }
1459  * }
1460  * else
1461  * {
1462  * @endcode
1463  *
1464  * Process a boundary face. These following lines of code
1465  * are a copy of the function
1466  * <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1467  * from @ref step_67 "step-67":
1468  *
1469  * @code
1470  * for (unsigned int q = 0; q < phi_m.n_q_points; ++q)
1471  * {
1472  * const auto w_m = phi_m.get_value(q);
1473  * const auto normal = phi_m.get_normal_vector(q);
1474  *
1475  * auto rho_u_dot_n = w_m[1] * normal[0];
1476  * for (unsigned int d = 1; d < dim; ++d)
1477  * rho_u_dot_n += w_m[1 + d] * normal[d];
1478  *
1479  * bool at_outflow = false;
1480  *
1482  *
1483  * if (wall_boundaries.find(boundary_id) !=
1484  * wall_boundaries.end())
1485  * {
1486  * w_p[0] = w_m[0];
1487  * for (unsigned int d = 0; d < dim; ++d)
1488  * w_p[d + 1] =
1489  * w_m[d + 1] - 2. * rho_u_dot_n * normal[d];
1490  * w_p[dim + 1] = w_m[dim + 1];
1491  * }
1492  * else if (inflow_boundaries.find(boundary_id) !=
1493  * inflow_boundaries.end())
1494  * w_p = evaluate_function(
1495  * *inflow_boundaries.find(boundary_id)->second,
1496  * phi_m.quadrature_point(q));
1497  * else if (subsonic_outflow_boundaries.find(
1498  * boundary_id) !=
1499  * subsonic_outflow_boundaries.end())
1500  * {
1501  * w_p = w_m;
1502  * w_p[dim + 1] =
1503  * evaluate_function(*subsonic_outflow_boundaries
1504  * .find(boundary_id)
1505  * ->second,
1506  * phi_m.quadrature_point(q),
1507  * dim + 1);
1508  * at_outflow = true;
1509  * }
1510  * else
1511  * AssertThrow(false,
1512  * ExcMessage(
1513  * "Unknown boundary id, did "
1514  * "you set a boundary condition for "
1515  * "this part of the domain boundary?"));
1516  *
1517  * auto flux = euler_numerical_flux<dim>(w_m, w_p, normal);
1518  *
1519  * if (at_outflow)
1520  * for (unsigned int v = 0;
1521  * v < VectorizedArrayType::size();
1522  * ++v)
1523  * {
1524  * if (rho_u_dot_n[v] < -1e-12)
1525  * for (unsigned int d = 0; d < dim; ++d)
1526  * flux[d + 1][v] = 0.;
1527  * }
1528  *
1529  * phi_m.submit_value(-flux, q);
1530  * }
1531  * }
1532  *
1533  * @endcode
1534  *
1535  * Evaluate local integrals related to cell by quadrature and
1536  * add into cell contribution via a simple 1D interpolation:
1537  *
1538  * @code
1540  * n_points_1d - 1,
1541  * VectorizedArrayType>::
1542  * template interpolate_quadrature<false, true>(
1543  * dim + 2,
1544  * data.get_shape_info(),
1545  * phi_m.begin_values(),
1546  * phi.begin_values(),
1547  * false,
1548  * face);
1549  * }
1550  *
1551  * @endcode
1552  *
1553  * Apply inverse mass matrix in the cell quadrature points. See
1554  * also the function
1555  * <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix()</code>
1556  * from @ref step_67 "step-67":
1557  *
1558  * @code
1559  * for (unsigned int q = 0; q < phi.static_n_q_points; ++q)
1560  * {
1561  * const auto factor = VectorizedArrayType(1.0) / phi.JxW(q);
1562  * for (unsigned int c = 0; c < dim + 2; ++c)
1563  * phi.begin_values()[c * phi.static_n_q_points + q] =
1564  * phi.begin_values()[c * phi.static_n_q_points + q] * factor;
1565  * }
1566  *
1567  * @endcode
1568  *
1569  * Transform values from collocation space to the original
1570  * Gauss-Lobatto space:
1571  *
1572  * @code
1576  * dim,
1577  * degree + 1,
1578  * n_points_1d,
1579  * VectorizedArrayType,
1580  * VectorizedArrayType>::do_backward(dim + 2,
1581  * data.get_shape_info()
1582  * .data[0]
1583  * .inverse_shape_values_eo,
1584  * false,
1585  * phi.begin_values(),
1586  * phi.begin_dof_values());
1587  *
1588  * @endcode
1589  *
1590  * Perform Runge-Kutta update and write results back to global
1591  * vectors:
1592  *
1593  * @code
1594  * if (ai == Number())
1595  * {
1596  * for (unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1597  * phi.begin_dof_values()[q] = bi * phi.begin_dof_values()[q];
1598  * phi.distribute_local_to_global(solution);
1599  * }
1600  * else
1601  * {
1602  * if (stage != 0)
1603  * phi_temp.read_dof_values(solution);
1604  *
1605  * for (unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1606  * {
1607  * const auto K_i = phi.begin_dof_values()[q];
1608  *
1609  * phi.begin_dof_values()[q] =
1610  * phi_temp.begin_dof_values()[q] + (ai * K_i);
1611  *
1612  * phi_temp.begin_dof_values()[q] += bi * K_i;
1613  * }
1614  * phi.set_dof_values(dst);
1615  * phi_temp.set_dof_values(solution);
1616  * }
1617  * }
1618  * },
1619  * vec_ki,
1620  * current_ri,
1621  * true,
1623  * }
1624  *
1625  *
1626  *
1627  * @endcode
1628  *
1629  * From here, the code of @ref step_67 "step-67" has not changed.
1630  *
1631  * @code
1632  * template <int dim, int degree, int n_points_1d>
1633  * void EulerOperator<dim, degree, n_points_1d>::initialize_vector(
1635  * {
1636  * data.initialize_dof_vector(vector);
1637  * }
1638  *
1639  *
1640  *
1641  * template <int dim, int degree, int n_points_1d>
1642  * void EulerOperator<dim, degree, n_points_1d>::set_inflow_boundary(
1644  * std::unique_ptr<Function<dim>> inflow_function)
1645  * {
1646  * AssertThrow(subsonic_outflow_boundaries.find(boundary_id) ==
1647  * subsonic_outflow_boundaries.end() &&
1648  * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1649  * ExcMessage("You already set the boundary with id " +
1650  * std::to_string(static_cast<int>(boundary_id)) +
1651  * " to another type of boundary before now setting " +
1652  * "it as inflow"));
1653  * AssertThrow(inflow_function->n_components == dim + 2,
1654  * ExcMessage("Expected function with dim+2 components"));
1655  *
1656  * inflow_boundaries[boundary_id] = std::move(inflow_function);
1657  * }
1658  *
1659  *
1660  *
1661  * template <int dim, int degree, int n_points_1d>
1662  * void EulerOperator<dim, degree, n_points_1d>::set_subsonic_outflow_boundary(
1664  * std::unique_ptr<Function<dim>> outflow_function)
1665  * {
1666  * AssertThrow(inflow_boundaries.find(boundary_id) ==
1667  * inflow_boundaries.end() &&
1668  * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1669  * ExcMessage("You already set the boundary with id " +
1670  * std::to_string(static_cast<int>(boundary_id)) +
1671  * " to another type of boundary before now setting " +
1672  * "it as subsonic outflow"));
1673  * AssertThrow(outflow_function->n_components == dim + 2,
1674  * ExcMessage("Expected function with dim+2 components"));
1675  *
1676  * subsonic_outflow_boundaries[boundary_id] = std::move(outflow_function);
1677  * }
1678  *
1679  *
1680  *
1681  * template <int dim, int degree, int n_points_1d>
1682  * void EulerOperator<dim, degree, n_points_1d>::set_wall_boundary(
1684  * {
1685  * AssertThrow(inflow_boundaries.find(boundary_id) ==
1686  * inflow_boundaries.end() &&
1687  * subsonic_outflow_boundaries.find(boundary_id) ==
1688  * subsonic_outflow_boundaries.end(),
1689  * ExcMessage("You already set the boundary with id " +
1690  * std::to_string(static_cast<int>(boundary_id)) +
1691  * " to another type of boundary before now setting " +
1692  * "it as wall boundary"));
1693  *
1694  * wall_boundaries.insert(boundary_id);
1695  * }
1696  *
1697  *
1698  *
1699  * template <int dim, int degree, int n_points_1d>
1700  * void EulerOperator<dim, degree, n_points_1d>::set_body_force(
1701  * std::unique_ptr<Function<dim>> body_force)
1702  * {
1703  * AssertDimension(body_force->n_components, dim);
1704  *
1705  * this->body_force = std::move(body_force);
1706  * }
1707  *
1708  *
1709  *
1710  * template <int dim, int degree, int n_points_1d>
1712  * const Function<dim> & function,
1714  * {
1716  * phi(data, 0, 1);
1718  * degree,
1719  * dim + 2,
1720  * Number,
1721  * VectorizedArrayType>
1722  * inverse(phi);
1723  * solution.zero_out_ghost_values();
1724  * for (unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1725  * {
1726  * phi.reinit(cell);
1727  * for (unsigned int q = 0; q < phi.n_q_points; ++q)
1728  * phi.submit_dof_value(evaluate_function(function,
1729  * phi.quadrature_point(q)),
1730  * q);
1731  * inverse.transform_from_q_points_to_basis(dim + 2,
1732  * phi.begin_dof_values(),
1733  * phi.begin_dof_values());
1734  * phi.set_dof_values(solution);
1735  * }
1736  * }
1737  *
1738  *
1739  *
1740  * template <int dim, int degree, int n_points_1d>
1741  * std::array<double, 3> EulerOperator<dim, degree, n_points_1d>::compute_errors(
1742  * const Function<dim> & function,
1743  * const LinearAlgebra::distributed::Vector<Number> &solution) const
1744  * {
1745  * TimerOutput::Scope t(timer, "compute errors");
1746  * double errors_squared[3] = {};
1748  * phi(data, 0, 0);
1749  *
1750  * for (unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1751  * {
1752  * phi.reinit(cell);
1753  * phi.gather_evaluate(solution, EvaluationFlags::values);
1754  * VectorizedArrayType local_errors_squared[3] = {};
1755  * for (unsigned int q = 0; q < phi.n_q_points; ++q)
1756  * {
1757  * const auto error =
1758  * evaluate_function(function, phi.quadrature_point(q)) -
1759  * phi.get_value(q);
1760  * const auto JxW = phi.JxW(q);
1761  *
1762  * local_errors_squared[0] += error[0] * error[0] * JxW;
1763  * for (unsigned int d = 0; d < dim; ++d)
1764  * local_errors_squared[1] += (error[d + 1] * error[d + 1]) * JxW;
1765  * local_errors_squared[2] += (error[dim + 1] * error[dim + 1]) * JxW;
1766  * }
1767  * for (unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1768  * ++v)
1769  * for (unsigned int d = 0; d < 3; ++d)
1770  * errors_squared[d] += local_errors_squared[d][v];
1771  * }
1772  *
1773  * Utilities::MPI::sum(errors_squared, MPI_COMM_WORLD, errors_squared);
1774  *
1775  * std::array<double, 3> errors;
1776  * for (unsigned int d = 0; d < 3; ++d)
1777  * errors[d] = std::sqrt(errors_squared[d]);
1778  *
1779  * return errors;
1780  * }
1781  *
1782  *
1783  *
1784  * template <int dim, int degree, int n_points_1d>
1785  * double EulerOperator<dim, degree, n_points_1d>::compute_cell_transport_speed(
1786  * const LinearAlgebra::distributed::Vector<Number> &solution) const
1787  * {
1788  * TimerOutput::Scope t(timer, "compute transport speed");
1789  * Number max_transport = 0;
1791  * phi(data, 0, 1);
1792  *
1793  * for (unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1794  * {
1795  * phi.reinit(cell);
1796  * phi.gather_evaluate(solution, EvaluationFlags::values);
1797  * VectorizedArrayType local_max = 0.;
1798  * for (unsigned int q = 0; q < phi.n_q_points; ++q)
1799  * {
1800  * const auto solution = phi.get_value(q);
1801  * const auto velocity = euler_velocity<dim>(solution);
1802  * const auto pressure = euler_pressure<dim>(solution);
1803  *
1804  * const auto inverse_jacobian = phi.inverse_jacobian(q);
1805  * const auto convective_speed = inverse_jacobian * velocity;
1806  * VectorizedArrayType convective_limit = 0.;
1807  * for (unsigned int d = 0; d < dim; ++d)
1808  * convective_limit =
1809  * std::max(convective_limit, std::abs(convective_speed[d]));
1810  *
1811  * const auto speed_of_sound =
1812  * std::sqrt(gamma * pressure * (1. / solution[0]));
1813  *
1815  * for (unsigned int d = 0; d < dim; ++d)
1816  * eigenvector[d] = 1.;
1817  * for (unsigned int i = 0; i < 5; ++i)
1818  * {
1819  * eigenvector = transpose(inverse_jacobian) *
1820  * (inverse_jacobian * eigenvector);
1821  * VectorizedArrayType eigenvector_norm = 0.;
1822  * for (unsigned int d = 0; d < dim; ++d)
1823  * eigenvector_norm =
1824  * std::max(eigenvector_norm, std::abs(eigenvector[d]));
1825  * eigenvector /= eigenvector_norm;
1826  * }
1827  * const auto jac_times_ev = inverse_jacobian * eigenvector;
1828  * const auto max_eigenvalue = std::sqrt(
1829  * (jac_times_ev * jac_times_ev) / (eigenvector * eigenvector));
1830  * local_max =
1831  * std::max(local_max,
1832  * max_eigenvalue * speed_of_sound + convective_limit);
1833  * }
1834  *
1835  * for (unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1836  * ++v)
1837  * for (unsigned int d = 0; d < 3; ++d)
1838  * max_transport = std::max(max_transport, local_max[v]);
1839  * }
1840  *
1841  * max_transport = Utilities::MPI::max(max_transport, MPI_COMM_WORLD);
1842  *
1843  * return max_transport;
1844  * }
1845  *
1846  *
1847  *
1848  * template <int dim>
1849  * class EulerProblem
1850  * {
1851  * public:
1852  * EulerProblem();
1853  *
1854  * void run();
1855  *
1856  * private:
1857  * void make_grid_and_dofs();
1858  *
1859  * void output_results(const unsigned int result_number);
1860  *
1862  *
1863  * ConditionalOStream pcout;
1864  *
1865  * #ifdef DEAL_II_WITH_P4EST
1867  * #else
1869  * #endif
1870  *
1871  * FESystem<dim> fe;
1872  * MappingQGeneric<dim> mapping;
1873  * DoFHandler<dim> dof_handler;
1874  *
1875  * TimerOutput timer;
1876  *
1877  * EulerOperator<dim, fe_degree, n_q_points_1d> euler_operator;
1878  *
1879  * double time, time_step;
1880  *
1881  * class Postprocessor : public DataPostprocessor<dim>
1882  * {
1883  * public:
1884  * Postprocessor();
1885  *
1886  * virtual void evaluate_vector_field(
1887  * const DataPostprocessorInputs::Vector<dim> &inputs,
1888  * std::vector<Vector<double>> &computed_quantities) const override;
1889  *
1890  * virtual std::vector<std::string> get_names() const override;
1891  *
1892  * virtual std::vector<
1894  * get_data_component_interpretation() const override;
1895  *
1896  * virtual UpdateFlags get_needed_update_flags() const override;
1897  *
1898  * private:
1899  * const bool do_schlieren_plot;
1900  * };
1901  * };
1902  *
1903  *
1904  *
1905  * template <int dim>
1906  * EulerProblem<dim>::Postprocessor::Postprocessor()
1907  * : do_schlieren_plot(dim == 2)
1908  * {}
1909  *
1910  *
1911  *
1912  * template <int dim>
1913  * void EulerProblem<dim>::Postprocessor::evaluate_vector_field(
1914  * const DataPostprocessorInputs::Vector<dim> &inputs,
1915  * std::vector<Vector<double>> & computed_quantities) const
1916  * {
1917  * const unsigned int n_evaluation_points = inputs.solution_values.size();
1918  *
1919  * if (do_schlieren_plot == true)
1920  * Assert(inputs.solution_gradients.size() == n_evaluation_points,
1921  * ExcInternalError());
1922  *
1923  * Assert(computed_quantities.size() == n_evaluation_points,
1924  * ExcInternalError());
1925  * Assert(inputs.solution_values[0].size() == dim + 2, ExcInternalError());
1926  * Assert(computed_quantities[0].size() ==
1927  * dim + 2 + (do_schlieren_plot == true ? 1 : 0),
1928  * ExcInternalError());
1929  *
1930  * for (unsigned int q = 0; q < n_evaluation_points; ++q)
1931  * {
1932  * Tensor<1, dim + 2> solution;
1933  * for (unsigned int d = 0; d < dim + 2; ++d)
1934  * solution[d] = inputs.solution_values[q](d);
1935  *
1936  * const double density = solution[0];
1937  * const Tensor<1, dim> velocity = euler_velocity<dim>(solution);
1938  * const double pressure = euler_pressure<dim>(solution);
1939  *
1940  * for (unsigned int d = 0; d < dim; ++d)
1941  * computed_quantities[q](d) = velocity[d];
1942  * computed_quantities[q](dim) = pressure;
1943  * computed_quantities[q](dim + 1) = std::sqrt(gamma * pressure / density);
1944  *
1945  * if (do_schlieren_plot == true)
1946  * computed_quantities[q](dim + 2) =
1947  * inputs.solution_gradients[q][0] * inputs.solution_gradients[q][0];
1948  * }
1949  * }
1950  *
1951  *
1952  *
1953  * template <int dim>
1954  * std::vector<std::string> EulerProblem<dim>::Postprocessor::get_names() const
1955  * {
1956  * std::vector<std::string> names;
1957  * for (unsigned int d = 0; d < dim; ++d)
1958  * names.emplace_back("velocity");
1959  * names.emplace_back("pressure");
1960  * names.emplace_back("speed_of_sound");
1961  *
1962  * if (do_schlieren_plot == true)
1963  * names.emplace_back("schlieren_plot");
1964  *
1965  * return names;
1966  * }
1967  *
1968  *
1969  *
1970  * template <int dim>
1971  * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1972  * EulerProblem<dim>::Postprocessor::get_data_component_interpretation() const
1973  * {
1974  * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1975  * interpretation;
1976  * for (unsigned int d = 0; d < dim; ++d)
1977  * interpretation.push_back(
1979  * interpretation.push_back(DataComponentInterpretation::component_is_scalar);
1980  * interpretation.push_back(DataComponentInterpretation::component_is_scalar);
1981  *
1982  * if (do_schlieren_plot == true)
1983  * interpretation.push_back(
1985  *
1986  * return interpretation;
1987  * }
1988  *
1989  *
1990  *
1991  * template <int dim>
1992  * UpdateFlags EulerProblem<dim>::Postprocessor::get_needed_update_flags() const
1993  * {
1994  * if (do_schlieren_plot == true)
1995  * return update_values | update_gradients;
1996  * else
1997  * return update_values;
1998  * }
1999  *
2000  *
2001  *
2002  * template <int dim>
2003  * EulerProblem<dim>::EulerProblem()
2004  * : pcout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
2005  * #ifdef DEAL_II_WITH_P4EST
2006  * , triangulation(MPI_COMM_WORLD)
2007  * #endif
2008  * , fe(FE_DGQ<dim>(fe_degree), dim + 2)
2009  * , mapping(fe_degree)
2010  * , dof_handler(triangulation)
2011  * , timer(pcout, TimerOutput::never, TimerOutput::wall_times)
2012  * , euler_operator(timer)
2013  * , time(0)
2014  * , time_step(0)
2015  * {}
2016  *
2017  *
2018  *
2019  * template <int dim>
2020  * void EulerProblem<dim>::make_grid_and_dofs()
2021  * {
2022  * switch (testcase)
2023  * {
2024  * case 0:
2025  * {
2026  * Point<dim> lower_left;
2027  * for (unsigned int d = 1; d < dim; ++d)
2028  * lower_left[d] = -5;
2029  *
2030  * Point<dim> upper_right;
2031  * upper_right[0] = 10;
2032  * for (unsigned int d = 1; d < dim; ++d)
2033  * upper_right[d] = 5;
2034  *
2036  * lower_left,
2037  * upper_right);
2038  * triangulation.refine_global(2);
2039  *
2040  * euler_operator.set_inflow_boundary(
2041  * 0, std::make_unique<ExactSolution<dim>>(0));
2042  *
2043  * break;
2044  * }
2045  *
2046  * case 1:
2047  * {
2049  * triangulation, 0.03, 1, 0, true);
2050  *
2051  * euler_operator.set_inflow_boundary(
2052  * 0, std::make_unique<ExactSolution<dim>>(0));
2053  * euler_operator.set_subsonic_outflow_boundary(
2054  * 1, std::make_unique<ExactSolution<dim>>(0));
2055  *
2056  * euler_operator.set_wall_boundary(2);
2057  * euler_operator.set_wall_boundary(3);
2058  *
2059  * if (dim == 3)
2060  * euler_operator.set_body_force(
2061  * std::make_unique<Functions::ConstantFunction<dim>>(
2062  * std::vector<double>({0., 0., -0.2})));
2063  *
2064  * break;
2065  * }
2066  *
2067  * default:
2068  * Assert(false, ExcNotImplemented());
2069  * }
2070  *
2071  * triangulation.refine_global(n_global_refinements);
2072  *
2073  * dof_handler.distribute_dofs(fe);
2074  *
2075  * euler_operator.reinit(mapping, dof_handler);
2076  * euler_operator.initialize_vector(solution);
2077  *
2078  * std::locale s = pcout.get_stream().getloc();
2079  * pcout.get_stream().imbue(std::locale(""));
2080  * pcout << "Number of degrees of freedom: " << dof_handler.n_dofs()
2081  * << " ( = " << (dim + 2) << " [vars] x "
2082  * << triangulation.n_global_active_cells() << " [cells] x "
2083  * << Utilities::pow(fe_degree + 1, dim) << " [dofs/cell/var] )"
2084  * << std::endl;
2085  * pcout.get_stream().imbue(s);
2086  * }
2087  *
2088  *
2089  *
2090  * template <int dim>
2091  * void EulerProblem<dim>::output_results(const unsigned int result_number)
2092  * {
2093  * const std::array<double, 3> errors =
2094  * euler_operator.compute_errors(ExactSolution<dim>(time), solution);
2095  * const std::string quantity_name = testcase == 0 ? "error" : "norm";
2096  *
2097  * pcout << "Time:" << std::setw(8) << std::setprecision(3) << time
2098  * << ", dt: " << std::setw(8) << std::setprecision(2) << time_step
2099  * << ", " << quantity_name << " rho: " << std::setprecision(4)
2100  * << std::setw(10) << errors[0] << ", rho * u: " << std::setprecision(4)
2101  * << std::setw(10) << errors[1] << ", energy:" << std::setprecision(4)
2102  * << std::setw(10) << errors[2] << std::endl;
2103  *
2104  * {
2105  * TimerOutput::Scope t(timer, "output");
2106  *
2107  * Postprocessor postprocessor;
2108  * DataOut<dim> data_out;
2109  *
2110  * DataOutBase::VtkFlags flags;
2111  * flags.write_higher_order_cells = true;
2112  * data_out.set_flags(flags);
2113  *
2114  * data_out.attach_dof_handler(dof_handler);
2115  * {
2116  * std::vector<std::string> names;
2117  * names.emplace_back("density");
2118  * for (unsigned int d = 0; d < dim; ++d)
2119  * names.emplace_back("momentum");
2120  * names.emplace_back("energy");
2121  *
2122  * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2123  * interpretation;
2124  * interpretation.push_back(
2126  * for (unsigned int d = 0; d < dim; ++d)
2127  * interpretation.push_back(
2129  * interpretation.push_back(
2131  *
2132  * data_out.add_data_vector(dof_handler, solution, names, interpretation);
2133  * }
2134  * data_out.add_data_vector(solution, postprocessor);
2135  *
2137  * if (testcase == 0 && dim == 2)
2138  * {
2139  * reference.reinit(solution);
2140  * euler_operator.project(ExactSolution<dim>(time), reference);
2141  * reference.sadd(-1., 1, solution);
2142  * std::vector<std::string> names;
2143  * names.emplace_back("error_density");
2144  * for (unsigned int d = 0; d < dim; ++d)
2145  * names.emplace_back("error_momentum");
2146  * names.emplace_back("error_energy");
2147  *
2148  * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2149  * interpretation;
2150  * interpretation.push_back(
2152  * for (unsigned int d = 0; d < dim; ++d)
2153  * interpretation.push_back(
2155  * interpretation.push_back(
2157  *
2158  * data_out.add_data_vector(dof_handler,
2159  * reference,
2160  * names,
2161  * interpretation);
2162  * }
2163  *
2164  * Vector<double> mpi_owner(triangulation.n_active_cells());
2165  * mpi_owner = Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
2166  * data_out.add_data_vector(mpi_owner, "owner");
2167  *
2168  * data_out.build_patches(mapping,
2169  * fe.degree,
2171  *
2172  * const std::string filename =
2173  * "solution_" + Utilities::int_to_string(result_number, 3) + ".vtu";
2174  * data_out.write_vtu_in_parallel(filename, MPI_COMM_WORLD);
2175  * }
2176  * }
2177  *
2178  *
2179  *
2180  * template <int dim>
2181  * void EulerProblem<dim>::run()
2182  * {
2183  * {
2184  * const unsigned int n_vect_number = VectorizedArrayType::size();
2185  * const unsigned int n_vect_bits = 8 * sizeof(Number) * n_vect_number;
2186  *
2187  * pcout << "Running with "
2188  * << Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD)
2189  * << " MPI processes" << std::endl;
2190  * pcout << "Vectorization over " << n_vect_number << " "
2191  * << (std::is_same<Number, double>::value ? "doubles" : "floats")
2192  * << " = " << n_vect_bits << " bits ("
2194  * << std::endl;
2195  * }
2196  *
2197  * make_grid_and_dofs();
2198  *
2199  * const LowStorageRungeKuttaIntegrator integrator(lsrk_scheme);
2200  *
2203  * rk_register_1.reinit(solution);
2204  * rk_register_2.reinit(solution);
2205  *
2206  * euler_operator.project(ExactSolution<dim>(time), solution);
2207  *
2208  *
2209  * double min_vertex_distance = std::numeric_limits<double>::max();
2210  * for (const auto &cell : triangulation.active_cell_iterators())
2211  * if (cell->is_locally_owned())
2212  * min_vertex_distance =
2213  * std::min(min_vertex_distance, cell->minimum_vertex_distance());
2214  * min_vertex_distance =
2215  * Utilities::MPI::min(min_vertex_distance, MPI_COMM_WORLD);
2216  *
2217  * time_step = courant_number * integrator.n_stages() /
2218  * euler_operator.compute_cell_transport_speed(solution);
2219  * pcout << "Time step size: " << time_step
2220  * << ", minimal h: " << min_vertex_distance
2221  * << ", initial transport scaling: "
2222  * << 1. / euler_operator.compute_cell_transport_speed(solution)
2223  * << std::endl
2224  * << std::endl;
2225  *
2226  * output_results(0);
2227  *
2228  * unsigned int timestep_number = 0;
2229  *
2230  * while (time < final_time - 1e-12 && timestep_number < max_time_steps)
2231  * {
2232  * ++timestep_number;
2233  * if (timestep_number % 5 == 0)
2234  * time_step =
2235  * courant_number * integrator.n_stages() /
2237  * euler_operator.compute_cell_transport_speed(solution), 3);
2238  *
2239  * {
2240  * TimerOutput::Scope t(timer, "rk time stepping total");
2241  * integrator.perform_time_step(euler_operator,
2242  * time,
2243  * time_step,
2244  * solution,
2245  * rk_register_1,
2246  * rk_register_2);
2247  * }
2248  *
2249  * time += time_step;
2250  *
2251  * if (static_cast<int>(time / output_tick) !=
2252  * static_cast<int>((time - time_step) / output_tick) ||
2253  * time >= final_time - 1e-12)
2254  * output_results(
2255  * static_cast<unsigned int>(std::round(time / output_tick)));
2256  * }
2257  *
2258  * timer.print_wall_time_statistics(MPI_COMM_WORLD);
2259  * pcout << std::endl;
2260  * }
2261  *
2262  * } // namespace Euler_DG
2263  *
2264  *
2265  * int main(int argc, char **argv)
2266  * {
2267  * using namespace Euler_DG;
2268  * using namespace dealii;
2269  *
2270  * Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
2271  *
2272  * try
2273  * {
2274  * deallog.depth_console(0);
2275  *
2276  * EulerProblem<dimension> euler_problem;
2277  * euler_problem.run();
2278  * }
2279  * catch (std::exception &exc)
2280  * {
2281  * std::cerr << std::endl
2282  * << std::endl
2283  * << "----------------------------------------------------"
2284  * << std::endl;
2285  * std::cerr << "Exception on processing: " << std::endl
2286  * << exc.what() << std::endl
2287  * << "Aborting!" << std::endl
2288  * << "----------------------------------------------------"
2289  * << std::endl;
2290  *
2291  * return 1;
2292  * }
2293  * catch (...)
2294  * {
2295  * std::cerr << std::endl
2296  * << std::endl
2297  * << "----------------------------------------------------"
2298  * << std::endl;
2299  * std::cerr << "Unknown exception!" << std::endl
2300  * << "Aborting!" << std::endl
2301  * << "----------------------------------------------------"
2302  * << std::endl;
2303  * return 1;
2304  * }
2305  *
2306  * return 0;
2307  * }
2308  * @endcode
2309 <a name="Results"></a><h1>Results</h1>
2310 
2311 
2312 Running the program with the default settings on a machine with 40 processes
2313 produces the following output:
2314 
2315 @code
2316 Running with 40 MPI processes
2317 Vectorization over 8 doubles = 512 bits (AVX512)
2318 Number of degrees of freedom: 27.648.000 ( = 5 [vars] x 25.600 [cells] x 216 [dofs/cell/var] )
2319 Time step size: 0.000295952, minimal h: 0.0075, initial transport scaling: 0.00441179
2320 Time: 0, dt: 0.0003, norm rho: 5.385e-16, rho * u: 1.916e-16, energy: 1.547e-15
2321 +--------------------------------------+------------------+------------+------------------+
2322 | Total wallclock time elapsed | 17.52s 10 | 17.52s | 17.52s 11 |
2323 | | | |
2324 | Section | no. calls | min time rank | avg time | max time rank |
2325 +--------------------------------------+------------------+------------+------------------+
2326 | compute errors | 1 | 0.009594s 16 | 0.009705s | 0.009819s 8 |
2327 | compute transport speed | 22 | 0.1366s 0 | 0.1367s | 0.1368s 18 |
2328 | output | 1 | 1.233s 0 | 1.233s | 1.233s 32 |
2329 | rk time stepping total | 100 | 8.746s 35 | 8.746s | 8.746s 0 |
2330 | rk_stage - integrals L_h | 500 | 8.742s 36 | 8.742s | 8.743s 2 |
2331 +--------------------------------------+------------------+------------+------------------+
2332 @endcode
2333 
2334 and the following visual output:
2335 
2336 <table align="center" class="doxtable" style="width:85%">
2337  <tr>
2338  <td>
2339  <img src="https://www.dealii.org/images/steps/developer/step-67.pressure_010.png" alt="" width="100%">
2340  </td>
2341  <td>
2342  <img src="https://www.dealii.org/images/steps/developer/step-67.pressure_025.png" alt="" width="100%">
2343  </td>
2344  </tr>
2345  <tr>
2346  <td>
2347  <img src="https://www.dealii.org/images/steps/developer/step-67.pressure_050.png" alt="" width="100%">
2348  </td>
2349  <td>
2350  <img src="https://www.dealii.org/images/steps/developer/step-67.pressure_100.png" alt="" width="100%">
2351  </td>
2352  </tr>
2353 </table>
2354 
2355 As a reference, the results of @ref step_67 "step-67" using FCL are:
2356 
2357 @code
2358 Running with 40 MPI processes
2359 Vectorization over 8 doubles = 512 bits (AVX512)
2360 Number of degrees of freedom: 27.648.000 ( = 5 [vars] x 25.600 [cells] x 216 [dofs/cell/var] )
2361 Time step size: 0.000295952, minimal h: 0.0075, initial transport scaling: 0.00441179
2362 Time: 0, dt: 0.0003, norm rho: 5.385e-16, rho * u: 1.916e-16, energy: 1.547e-15
2363 +-------------------------------------------+------------------+------------+------------------+
2364 | Total wallclock time elapsed | 13.33s 0 | 13.34s | 13.35s 34 |
2365 | | | |
2366 | Section | no. calls | min time rank | avg time | max time rank |
2367 +-------------------------------------------+------------------+------------+------------------+
2368 | compute errors | 1 | 0.007977s 10 | 0.008053s | 0.008161s 30 |
2369 | compute transport speed | 22 | 0.1228s 34 | 0.2227s | 0.3845s 0 |
2370 | output | 1 | 1.255s 3 | 1.257s | 1.259s 27 |
2371 | rk time stepping total | 100 | 11.15s 0 | 11.32s | 11.42s 34 |
2372 | rk_stage - integrals L_h | 500 | 8.719s 10 | 8.932s | 9.196s 0 |
2373 | rk_stage - inv mass + vec upd | 500 | 1.944s 0 | 2.377s | 2.55s 10 |
2374 +-------------------------------------------+------------------+------------+------------------+
2375 @endcode
2376 
2377 By the modifications shown in this tutorial, we were able to achieve a speedup of
2378 27% for the Runge-Kutta stages.
2379 
2380 <a name="Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
2381 
2382 
2383 The algorithms are easily extendable to higher dimensions: a high-dimensional
2384 <a href="https://github.com/hyperdeal/hyperdeal/blob/a9e67b4e625ff1dde2fed93ad91cdfacfaa3acdf/include/hyper.deal/operators/advection/advection_operation.h#L219-L569">advection operator based on cell-centric loops</a>
2385 is part of the hyper.deal library. An extension of cell-centric loops
2386 to locally-refined meshes is more involved.
2387 
2388 <a name="ExtensiontothecompressibleNavierStokesequations"></a><h4>Extension to the compressible Navier-Stokes equations</h4>
2389 
2390 
2391 The solver presented in this tutorial program can also be extended to the
2392 compressible Navier–Stokes equations by adding viscous terms, as also
2393 suggested in @ref step_67 "step-67". To keep as much of the performance obtained here despite
2394 the additional cost of elliptic terms, e.g. via an interior penalty method, that
2395 tutorial has proposed to switch the basis from FE_DGQ to FE_DGQHermite like in
2396 the @ref step_59 "step-59" tutorial program. The reasoning behind this switch is that in the
2397 case of FE_DGQ all values of neighboring cells (i.e., @f$k+1@f$ layers) are needed,
2398 whilst in the case of FE_DGQHermite only 2 layers, making the latter
2399 significantly more suitable for higher degrees. The additional layers have to be,
2400 on the one hand, loaded from main memory during flux computation and, one the
2401 other hand, have to be communicated. Using the shared-memory capabilities
2402 introduced in this tutorial, the second point can be eliminated on a single
2403 compute node or its influence can be reduced in a hybrid context.
2404 
2405 <a name="BlockGaussSeidellikepreconditioners"></a><h4>Block Gauss-Seidel-like preconditioners</h4>
2406 
2407 
2408 Cell-centric loops could be used to create block Gauss-Seidel preconditioners
2409 that are multiplicative within one process and additive over processes. These
2410 type of preconditioners use during flux computation, in contrast to Jacobi-type
2411 preconditioners, already updated values from neighboring cells. The following
2412 pseudo-code visualizes how this could in principal be achieved:
2413 
2414 @code
2415 // vector monitor if cells have been updated or not
2416 Vector<Number> visit_flags(data.n_cell_batches () + data.n_ghost_cell_batches ());
2417 
2418 // element centric loop with a modified kernel
2419 data.template loop_cell_centric<VectorType, VectorType>(
2420  [&](const auto &data, auto &dst, const auto &src, const auto cell_range) {
2421 
2422  for (unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
2423  {
2424  // cell integral as usual (not shown)
2425 
2426  for (unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
2427  {
2428  const auto boundary_id = data.get_faces_by_cells_boundary_id(cell, face)[0];
2429 
2430  if (boundary_id == numbers::internal_face_boundary_id)
2431  {
2432  phi_p.reinit(cell, face);
2433 
2434  const auto flags = phi_p.read_cell_data(visit_flags);
2435  const auto all_neighbors_have_been_updated =
2436  std::min(flags.begin(),
2437  flags().begin() + data.n_active_entries_per_cell_batch(cell) == 1;
2438 
2439  if(all_neighbors_have_been_updated)
2440  phi_p.gather_evaluate(dst, EvaluationFlags::values);
2441  else
2442  phi_p.gather_evaluate(src, EvaluationFlags::values);
2443 
2444  // continue as usual (not shown)
2445  }
2446  else
2447  {
2448  // boundary integral as usual (not shown)
2449  }
2450  }
2451 
2452  // continue as above and apply your favorite algorithm to invert
2453  // the cell-local operator (not shown)
2454 
2455  // make cells as updated
2456  phi.set_cell_data(visit_flags, VectorizedArrayType(1.0));
2457  }
2458  },
2459  dst,
2460  src,
2461  true,
2463 @endcode
2464 
2465 For this purpose, one can exploit the cell-data vector capabilities of
2466 MatrixFree and the range-based iteration capabilities of VectorizedArray.
2467 
2468 Please note that in the given example we process <code>VectorizedArrayType::size()</code>
2469 number of blocks, since each lane corresponds to one block. We consider blocks
2470 as updated if all blocks processed by a vector register have been updated. In
2471 the case of Cartesian meshes this is a reasonable approach, however, for
2472 general unstructured meshes this conservative approach might lead to a decrease in the
2473 efficiency of the preconditioner. A reduction of cells processed in parallel
2474 by explicitly reducing the number of lanes used by <code>VectorizedArrayType</code>
2475 might increase the quality of the preconditioner, but with the cost that each
2476 iteration might be more expensive. This dilemma leads us to a further
2477 "possibility for extension": vectorization within an element.
2478  *
2479  *
2480 <a name="PlainProg"></a>
2481 <h1> The plain program</h1>
2482 @include "step-76.cc"
2483 */
pointer data()
void attach_dof_handler(const DoFHandlerType &)
void add_data_vector(const VectorType &data, const std::vector< std::string > &names, const DataVectorType type=type_automatic, const std::vector< DataComponentInterpretation::DataComponentInterpretation > &data_component_interpretation=std::vector< DataComponentInterpretation::DataComponentInterpretation >())
virtual void build_patches(const unsigned int n_subdivisions=0)
Definition: data_out.cc:1085
VectorizedArrayType JxW(const unsigned int q_point) const
void submit_value(const value_type val_in, const unsigned int q_point)
Definition: fe_dgq.h:111
unsigned int depth_console(const unsigned int n)
Definition: logstream.cc:350
Abstract base class for mapping classes.
Definition: mapping.h:304
void loop_cell_centric(void(CLASS::*cell_operation)(const MatrixFree &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &) const, const CLASS *owning_class, OutVector &dst, const InVector &src, const bool zero_dst_vector=false, const DataAccessOnFaces src_vector_face_access=DataAccessOnFaces::unspecified) const
void reinit(const MappingType &mapping, const DoFHandler< dim > &dof_handler, const AffineConstraints< number2 > &constraint, const QuadratureType &quad, const AdditionalData &additional_data=AdditionalData())
Definition: point.h:111
Definition: tensor.h:472
@ wall_times
Definition: timer.h:653
void print_wall_time_statistics(const MPI_Comm &mpi_comm, const double print_quantile=0.) const
Definition: timer.cc:844
Definition: vector.h:110
std::enable_if< std::is_floating_point< T >::value &&std::is_floating_point< U >::value, typename ProductType< std::complex< T >, std::complex< U > >::type >::type operator*(const std::complex< T > &left, const std::complex< U > &right)
#define DEAL_II_ALWAYS_INLINE
Definition: config.h:97
#define DEAL_II_MPI_VERSION_GTE(major, minor)
Definition: config.h:380
#define DEAL_II_WITH_P4EST
Definition: config.h:56
#define DEAL_II_WITH_MPI
Definition: config.h:53
DerivativeForm< 1, spacedim, dim, Number > transpose(const DerivativeForm< 1, dim, spacedim, Number > &DF)
UpdateFlags
@ update_values
Shape function values.
@ update_normal_vectors
Normal vectors.
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
Point< 2 > second
Definition: grid_out.cc:4588
Point< 2 > first
Definition: grid_out.cc:4587
unsigned int level
Definition: grid_out.cc:4590
__global__ void reduction(Number *result, const Number *v, const size_type N)
__global__ void set(Number *val, const Number s, const size_type N)
static ::ExceptionBase & ExcInternalError()
void write_vtu_in_parallel(const std::string &filename, const MPI_Comm &comm) const
#define Assert(cond, exc)
Definition: exceptions.h:1465
std::string to_string(const T &t)
Definition: patterns.h:2329
static ::ExceptionBase & ExcNotImplemented()
#define AssertDimension(dim1, dim2)
Definition: exceptions.h:1622
void set_flags(const FlagType &flags)
static ::ExceptionBase & ExcMessage(std::string arg1)
#define AssertThrow(cond, exc)
Definition: exceptions.h:1575
void loop(ITERATOR begin, typename identity< ITERATOR >::type end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
Definition: loop.h:439
virtual void sadd(const Number s, const Number a, const VectorSpaceVector< Number > &V) override
void reinit(const size_type size, const bool omit_zeroing_entries=false)
LogStream deallog
Definition: logstream.cc:37
const Event initial
Definition: event.cc:65
void hyper_rectangle(Triangulation< dim, spacedim > &tria, const Point< dim > &p1, const Point< dim > &p2, const bool colorize=false)
void channel_with_cylinder(Triangulation< dim > &tria, const double shell_region_width=0.03, const unsigned int n_shells=2, const double skewness=2.0, const bool colorize=false)
@ matrix
Contents is actually a matrix.
static const char A
@ general
No special properties.
static const types::blas_int one
double norm(const FEValuesBase< dim > &fe, const ArrayView< const std::vector< Tensor< 1, dim >>> &Du)
Definition: divergence.h:472
void categorize_by_boundary_ids(const Triangulation< dim > &tria, AdditionalData &additional_data)
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
Definition: utilities.cc:188
SymmetricTensor< 2, dim, Number > C(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
@ LOW_STORAGE_RK_STAGE9_ORDER5
@ LOW_STORAGE_RK_STAGE3_ORDER3
Definition: time_stepping.h:86
@ LOW_STORAGE_RK_STAGE7_ORDER4
Definition: time_stepping.h:96
@ LOW_STORAGE_RK_STAGE5_ORDER4
Definition: time_stepping.h:91
void free(T *&pointer)
Definition: cuda.h:97
unsigned int this_mpi_process(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:128
T min(const T &t, const MPI_Comm &mpi_communicator)
T sum(const T &t, const MPI_Comm &mpi_communicator)
unsigned int n_mpi_processes(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:117
T max(const T &t, const MPI_Comm &mpi_communicator)
std::string get_time()
Definition: utilities.cc:1019
const std::string get_current_vectorization_level()
Definition: utilities.cc:944
constexpr T pow(const T base, const int iexp)
Definition: utilities.h:461
Number truncate_to_n_digits(const Number number, const unsigned int n_digits)
Definition: utilities.cc:581
std::string int_to_string(const unsigned int value, const unsigned int digits=numbers::invalid_unsigned_int)
Definition: utilities.cc:473
void project(const Mapping< dim, spacedim > &mapping, const DoFHandler< dim, spacedim > &dof, const AffineConstraints< typename VectorType::value_type > &constraints, const Quadrature< dim > &quadrature, const Function< spacedim, typename VectorType::value_type > &function, VectorType &vec, const bool enforce_zero_boundary=false, const Quadrature< dim - 1 > &q_boundary=(dim > 1 ? QGauss< dim - 1 >(2) :Quadrature< dim - 1 >(0)), const bool project_to_boundary_first=false)
void run(const Iterator &begin, const typename identity< Iterator >::type &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
Definition: work_stream.h:691
long double gamma(const unsigned int n)
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
Definition: matrix_block.h:618
static constexpr double PI
Definition: numbers.h:231
const types::boundary_id internal_face_boundary_id
Definition: types.h:255
static const unsigned int invalid_unsigned_int
Definition: types.h:196
void transform(const InputIterator &begin_in, const InputIterator &end_in, OutputIterator out, const Predicate &predicate, const unsigned int grainsize)
Definition: parallel.h:213
Definition: types.h:32
unsigned int boundary_id
Definition: types.h:129
::VectorizedArray< Number, width > sqrt(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > pow(const ::VectorizedArray< Number, width > &, const Number p)
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation