Reference documentation for deal.II version 9.3.2
\(\newcommand{\dealvcentcolon}{\mathrel{\mathop{:}}}\) \(\newcommand{\dealcoloneq}{\dealvcentcolon\mathrel{\mkern-1.2mu}=}\) \(\newcommand{\jump}[1]{\left[\!\left[ #1 \right]\!\right]}\) \(\newcommand{\average}[1]{\left\{\!\left\{ #1 \right\}\!\right\}}\)
step-48.h
Go to the documentation of this file.
1 ) const override
550  * {
551  * double t = this->get_time();
552  *
553  * const double m = 0.5;
554  * const double c1 = 0.;
555  * const double c2 = 0.;
556  * const double factor =
557  * (m / std::sqrt(1. - m * m) * std::sin(std::sqrt(1. - m * m) * t + c2));
558  * double result = 1.;
559  * for (unsigned int d = 0; d < dim; ++d)
560  * result *= -4. * std::atan(factor / std::cosh(m * p[d] + c1));
561  * return result;
562  * }
563  * };
564  *
565  *
566  *
567  * @endcode
568  *
569  *
570  * <a name="SineGordonProblemclass"></a>
571  * <h3>SineGordonProblem class</h3>
572  *
573 
574  *
575  * This is the main class that builds on the class in @ref step_25 "step-25". However, we
576  * replaced the SparseMatrix<double> class by the MatrixFree class to store
577  * the geometry data. Also, we use a distributed triangulation in this
578  * example.
579  *
580  * @code
581  * template <int dim>
582  * class SineGordonProblem
583  * {
584  * public:
585  * SineGordonProblem();
586  * void run();
587  *
588  * private:
589  * ConditionalOStream pcout;
590  *
591  * void make_grid_and_dofs();
592  * void output_results(const unsigned int timestep_number);
593  *
594  * #ifdef DEAL_II_WITH_P4EST
596  * #else
598  * #endif
599  * FE_Q<dim> fe;
600  * DoFHandler<dim> dof_handler;
601  *
602  * MappingQ1<dim> mapping;
603  *
604  * AffineConstraints<double> constraints;
605  * IndexSet locally_relevant_dofs;
606  *
607  * MatrixFree<dim, double> matrix_free_data;
608  *
609  * LinearAlgebra::distributed::Vector<double> solution, old_solution,
610  * old_old_solution;
611  *
612  * const unsigned int n_global_refinements;
613  * double time, time_step;
614  * const double final_time;
615  * const double cfl_number;
616  * const unsigned int output_timestep_skip;
617  * };
618  *
619  *
620  * @endcode
621  *
622  *
623  * <a name="SineGordonProblemSineGordonProblem"></a>
624  * <h4>SineGordonProblem::SineGordonProblem</h4>
625  *
626 
627  *
628  * This is the constructor of the SineGordonProblem class. The time interval
629  * and time step size are defined here. Moreover, we use the degree of the
630  * finite element that we defined at the top of the program to initialize a
631  * FE_Q finite element based on Gauss-Lobatto support points. These points
632  * are convenient because in conjunction with a QGaussLobatto quadrature
633  * rule of the same order they give a diagonal mass matrix without
634  * compromising accuracy too much (note that the integration is inexact,
635  * though), see also the discussion in the introduction. Note that FE_Q
636  * selects the Gauss-Lobatto nodal points by default due to their improved
637  * conditioning versus equidistant points. To make things more explicit, we
638  * state the selection of the nodal points nonetheless.
639  *
640  * @code
641  * template <int dim>
642  * SineGordonProblem<dim>::SineGordonProblem()
643  * : pcout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
644  * ,
645  * #ifdef DEAL_II_WITH_P4EST
646  * triangulation(MPI_COMM_WORLD)
647  * ,
648  * #endif
649  * fe(QGaussLobatto<1>(fe_degree + 1))
650  * , dof_handler(triangulation)
651  * , n_global_refinements(10 - 2 * dim)
652  * , time(-10)
653  * , time_step(10.)
654  * , final_time(10.)
655  * , cfl_number(.1 / fe_degree)
656  * , output_timestep_skip(200)
657  * {}
658  *
659  * @endcode
660  *
661  *
662  * <a name="SineGordonProblemmake_grid_and_dofs"></a>
663  * <h4>SineGordonProblem::make_grid_and_dofs</h4>
664  *
665 
666  *
 667  * As in @ref step_25 "step-25" this function sets up a cube grid in <code>dim</code>
668  * dimensions of extent @f$[-15,15]@f$. We refine the mesh more in the center of
669  * the domain since the solution is concentrated there. We first refine all
670  * cells whose center is within a radius of 11, and then refine once more
671  * for a radius 6. This simple ad hoc refinement could be done better by
672  * adapting the mesh to the solution using error estimators during the time
673  * stepping as done in other example programs, and using
674  * parallel::distributed::SolutionTransfer to transfer the solution to the
675  * new mesh.
676  *
677  * @code
678  * template <int dim>
679  * void SineGordonProblem<dim>::make_grid_and_dofs()
680  * {
682  * triangulation.refine_global(n_global_refinements);
683  * {
685  * cell = triangulation.begin_active(),
686  * end_cell = triangulation.end();
687  * for (; cell != end_cell; ++cell)
688  * if (cell->is_locally_owned())
689  * if (cell->center().norm() < 11)
690  * cell->set_refine_flag();
691  * triangulation.execute_coarsening_and_refinement();
692  *
693  * cell = triangulation.begin_active();
694  * end_cell = triangulation.end();
695  * for (; cell != end_cell; ++cell)
696  * if (cell->is_locally_owned())
697  * if (cell->center().norm() < 6)
698  * cell->set_refine_flag();
699  * triangulation.execute_coarsening_and_refinement();
700  * }
701  *
702  * pcout << " Number of global active cells: "
703  * #ifdef DEAL_II_WITH_P4EST
704  * << triangulation.n_global_active_cells()
705  * #else
706  * << triangulation.n_active_cells()
707  * #endif
708  * << std::endl;
709  *
710  * dof_handler.distribute_dofs(fe);
711  *
712  * pcout << " Number of degrees of freedom: " << dof_handler.n_dofs()
713  * << std::endl;
714  *
715  *
716  * @endcode
717  *
718  * We generate hanging node constraints for ensuring continuity of the
719  * solution. As in @ref step_40 "step-40", we need to equip the constraint matrix with
 720  * the IndexSet of locally relevant degrees of freedom to prevent it from
 721  * consuming too much memory for big problems. Next, the <code> MatrixFree
722  * </code> object for the problem is set up. Note that we specify a
723  * particular scheme for shared-memory parallelization (hence one would
724  * use multithreading for intra-node parallelism and not MPI; we here
725  * choose the standard option &mdash; if we wanted to disable shared
726  * memory parallelization even in case where there is more than one TBB
727  * thread available in the program, we would choose
729  * instead of using the default QGauss quadrature argument, we supply a
730  * QGaussLobatto quadrature formula to enable the desired
731  * behavior. Finally, three solution vectors are initialized. MatrixFree
732  * expects a particular layout of ghost indices (as it handles index
733  * access in MPI-local numbers that need to match between the vector and
734  * MatrixFree), so we just ask it to initialize the vectors to be sure the
735  * ghost exchange is properly handled.
736  *
737  * @code
738  * DoFTools::extract_locally_relevant_dofs(dof_handler, locally_relevant_dofs);
739  * constraints.clear();
740  * constraints.reinit(locally_relevant_dofs);
741  * DoFTools::make_hanging_node_constraints(dof_handler, constraints);
742  * constraints.close();
743  *
744  * typename MatrixFree<dim>::AdditionalData additional_data;
745  * additional_data.tasks_parallel_scheme =
747  *
748  * matrix_free_data.reinit(mapping,
749  * dof_handler,
750  * constraints,
751  * QGaussLobatto<1>(fe_degree + 1),
752  * additional_data);
753  *
754  * matrix_free_data.initialize_dof_vector(solution);
755  * old_solution.reinit(solution);
756  * old_old_solution.reinit(solution);
757  * }
758  *
759  *
760  *
761  * @endcode
762  *
763  *
764  * <a name="SineGordonProblemoutput_results"></a>
765  * <h4>SineGordonProblem::output_results</h4>
766  *
767 
768  *
769  * This function prints the norm of the solution and writes the solution
770  * vector to a file. The norm is standard (except for the fact that we need
771  * to accumulate the norms over all processors for the parallel grid which
772  * we do via the VectorTools::compute_global_error() function), and the
773  * second is similar to what we did in @ref step_40 "step-40" or @ref step_37 "step-37". Note that we can
774  * use the same vector for output as the one used during computations: The
775  * vectors in the matrix-free framework always provide full information on
776  * all locally owned cells (this is what is needed in the local evaluations,
777  * too), including ghost vector entries on these cells. This is the only
778  * data that is needed in the VectorTools::integrate_difference() function
779  * as well as in DataOut. The only action to take at this point is to make
780  * sure that the vector updates its ghost values before we read from
781  * them, and to reset ghost values once done. This is a feature present only
782  * in the LinearAlgebra::distributed::Vector class. Distributed vectors with
783  * PETSc and Trilinos, on the other hand, need to be copied to special
784  * vectors including ghost values (see the relevant section in @ref step_40 "step-40"). If
785  * we also wanted to access all degrees of freedom on ghost cells (e.g. when
786  * computing error estimators that use the jump of solution over cell
787  * boundaries), we would need more information and create a vector
788  * initialized with locally relevant dofs just as in @ref step_40 "step-40". Observe also
789  * that we need to distribute constraints for output - they are not filled
790  * during computations (rather, they are interpolated on the fly in the
791  * matrix-free method FEEvaluation::read_dof_values()).
792  *
793  * @code
794  * template <int dim>
795  * void
796  * SineGordonProblem<dim>::output_results(const unsigned int timestep_number)
797  * {
798  * constraints.distribute(solution);
799  *
800  * Vector<float> norm_per_cell(triangulation.n_active_cells());
801  * solution.update_ghost_values();
803  * dof_handler,
804  * solution,
806  * norm_per_cell,
807  * QGauss<dim>(fe_degree + 1),
809  * const double solution_norm =
811  * norm_per_cell,
813  *
814  * pcout << " Time:" << std::setw(8) << std::setprecision(3) << time
815  * << ", solution norm: " << std::setprecision(5) << std::setw(7)
816  * << solution_norm << std::endl;
817  *
818  * DataOut<dim> data_out;
819  *
820  * data_out.attach_dof_handler(dof_handler);
821  * data_out.add_data_vector(solution, "solution");
822  * data_out.build_patches(mapping);
823  *
824  * data_out.write_vtu_with_pvtu_record(
825  * "./", "solution", timestep_number, MPI_COMM_WORLD, 3);
826  *
827  * solution.zero_out_ghost_values();
828  * }
829  *
830  *
831  * @endcode
832  *
833  *
834  * <a name="SineGordonProblemrun"></a>
835  * <h4>SineGordonProblem::run</h4>
836  *
837 
838  *
839  * This function is called by the main function and steps into the
840  * subroutines of the class.
841  *
842 
843  *
844  * After printing some information about the parallel setup, the first
845  * action is to set up the grid and the cell operator. Then, the time step
846  * is computed from the CFL number given in the constructor and the finest
847  * mesh size. The finest mesh size is computed as the diameter of the last
848  * cell in the triangulation, which is the last cell on the finest level of
849  * the mesh. This is only possible for meshes where all elements on a level
 850  * have the same size; otherwise, one needs to loop over all cells. Note
851  * that we need to query all the processors for their finest cell since
852  * not all processors might hold a region where the mesh is at the finest
853  * level. Then, we readjust the time step a little to hit the final time
854  * exactly.
855  *
856  * @code
857  * template <int dim>
859  * {
860  * {
861  * pcout << "Number of MPI ranks: "
862  * << Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD) << std::endl;
863  * pcout << "Number of threads on each rank: "
864  * << MultithreadInfo::n_threads() << std::endl;
865  * const unsigned int n_vect_doubles = VectorizedArray<double>::size();
866  * const unsigned int n_vect_bits = 8 * sizeof(double) * n_vect_doubles;
867  * pcout << "Vectorization over " << n_vect_doubles
868  * << " doubles = " << n_vect_bits << " bits ("
870  * << std::endl
871  * << std::endl;
872  * }
873  * make_grid_and_dofs();
874  *
875  * const double local_min_cell_diameter =
876  * triangulation.last()->diameter() / std::sqrt(dim);
877  * const double global_min_cell_diameter =
878  * -Utilities::MPI::max(-local_min_cell_diameter, MPI_COMM_WORLD);
879  * time_step = cfl_number * global_min_cell_diameter;
880  * time_step = (final_time - time) / (int((final_time - time) / time_step));
881  * pcout << " Time step size: " << time_step
882  * << ", finest cell: " << global_min_cell_diameter << std::endl
883  * << std::endl;
884  *
885  * @endcode
886  *
887  * Next the initial value is set. Since we have a two-step time stepping
888  * method, we also need a value of the solution at time-time_step. For
889  * accurate results, one would need to compute this from the time
890  * derivative of the solution at initial time, but here we ignore this
891  * difficulty and just set it to the initial value function at that
892  * artificial time.
893  *
894 
895  *
896  * We then go on by writing the initial state to file and collecting
897  * the two starting solutions in a <tt>std::vector</tt> of pointers that
898  * get later consumed by the SineGordonOperation::apply() function. Next,
899  * an instance of the <code> SineGordonOperation class </code> based on
900  * the finite element degree specified at the top of this file is set up.
901  *
902  * @code
903  * VectorTools::interpolate(mapping,
904  * dof_handler,
905  * InitialCondition<dim>(1, time),
906  * solution);
907  * VectorTools::interpolate(mapping,
908  * dof_handler,
909  * InitialCondition<dim>(1, time - time_step),
910  * old_solution);
911  * output_results(0);
912  *
913  * std::vector<LinearAlgebra::distributed::Vector<double> *>
914  * previous_solutions({&old_solution, &old_old_solution});
915  *
916  * SineGordonOperation<dim, fe_degree> sine_gordon_op(matrix_free_data,
917  * time_step);
918  *
919  * @endcode
920  *
921  * Now loop over the time steps. In each iteration, we shift the solution
922  * vectors by one and call the `apply` function of the
923  * `SineGordonOperator` class. Then, we write the solution to a file. We
 924  * clock the wall times for the computational time needed as well as the
925  * time needed to create the output and report the numbers when the time
926  * stepping is finished.
927  *
928 
929  *
930  * Note how this shift is implemented: We simply call the swap method on
931  * the two vectors which swaps only some pointers without the need to copy
932  * data around, a relatively expensive operation within an explicit time
933  * stepping method. Let us see what happens in more detail: First, we
934  * exchange <code>old_solution</code> with <code>old_old_solution</code>,
935  * which means that <code>old_old_solution</code> gets
936  * <code>old_solution</code>, which is what we expect. Similarly,
937  * <code>old_solution</code> gets the content from <code>solution</code>
938  * in the next step. After this, <code>solution</code> holds
939  * <code>old_old_solution</code>, but that will be overwritten during this
940  * step.
941  *
942  * @code
943  * unsigned int timestep_number = 1;
944  *
945  * Timer timer;
946  * double wtime = 0;
947  * double output_time = 0;
948  * for (time += time_step; time <= final_time;
949  * time += time_step, ++timestep_number)
950  * {
951  * timer.restart();
952  * old_old_solution.swap(old_solution);
953  * old_solution.swap(solution);
954  * sine_gordon_op.apply(solution, previous_solutions);
955  * wtime += timer.wall_time();
956  *
957  * timer.restart();
958  * if (timestep_number % output_timestep_skip == 0)
959  * output_results(timestep_number / output_timestep_skip);
960  *
961  * output_time += timer.wall_time();
962  * }
963  * timer.restart();
964  * output_results(timestep_number / output_timestep_skip + 1);
965  * output_time += timer.wall_time();
966  *
967  * pcout << std::endl
968  * << " Performed " << timestep_number << " time steps." << std::endl;
969  *
970  * pcout << " Average wallclock time per time step: "
971  * << wtime / timestep_number << "s" << std::endl;
972  *
973  * pcout << " Spent " << output_time << "s on output and " << wtime
974  * << "s on computations." << std::endl;
975  * }
976  * } // namespace Step48
977  *
978  *
979  *
980  * @endcode
981  *
982  *
983  * <a name="Thecodemaincodefunction"></a>
984  * <h3>The <code>main</code> function</h3>
985  *
986 
987  *
988  * As in @ref step_40 "step-40", we initialize MPI at the start of the program. Since we will
989  * in general mix MPI parallelization with threads, we also set the third
990  * argument in MPI_InitFinalize that controls the number of threads to an
991  * invalid number, which means that the TBB library chooses the number of
992  * threads automatically, typically to the number of available cores in the
993  * system. As an alternative, you can also set this number manually if you
994  * want to set a specific number of threads (e.g. when MPI-only is required).
995  *
996  * @code
997  * int main(int argc, char **argv)
998  * {
999  * using namespace Step48;
1000  * using namespace dealii;
1001  *
1002  * Utilities::MPI::MPI_InitFinalize mpi_initialization(
1003  * argc, argv, numbers::invalid_unsigned_int);
1004  *
1005  * try
1006  * {
1007  * SineGordonProblem<dimension> sg_problem;
1008  * sg_problem.run();
1009  * }
1010  * catch (std::exception &exc)
1011  * {
1012  * std::cerr << std::endl
1013  * << std::endl
1014  * << "----------------------------------------------------"
1015  * << std::endl;
1016  * std::cerr << "Exception on processing: " << std::endl
1017  * << exc.what() << std::endl
1018  * << "Aborting!" << std::endl
1019  * << "----------------------------------------------------"
1020  * << std::endl;
1021  *
1022  * return 1;
1023  * }
1024  * catch (...)
1025  * {
1026  * std::cerr << std::endl
1027  * << std::endl
1028  * << "----------------------------------------------------"
1029  * << std::endl;
1030  * std::cerr << "Unknown exception!" << std::endl
1031  * << "Aborting!" << std::endl
1032  * << "----------------------------------------------------"
1033  * << std::endl;
1034  * return 1;
1035  * }
1036  *
1037  * return 0;
1038  * }
1039  * @endcode
1040 <a name="Results"></a><h1>Results</h1>
1041 
1042 
1043 <a name="Comparisonwithasparsematrix"></a><h3>Comparison with a sparse matrix</h3>
1044 
1045 
1046 In order to demonstrate the gain in using the MatrixFree class instead of
1047 the standard <code>deal.II</code> assembly routines for evaluating the
1048 information from old time steps, we study a simple serial run of the code on a
1049 nonadaptive mesh. Since much time is spent on evaluating the sine function, we
1050 do not only show the numbers of the full sine-Gordon equation but also for the
1051 wave equation (the sine-term skipped from the sine-Gordon equation). We use
1052 both second and fourth order elements. The results are summarized in the
1053 following table.
1054 
1055 <table align="center" class="doxtable">
1056  <tr>
1057  <th>&nbsp;</th>
1058  <th colspan="3">wave equation</th>
1059  <th colspan="2">sine-Gordon</th>
1060  </tr>
1061  <tr>
1062  <th>&nbsp;</th>
1063  <th>MF</th>
1064  <th>SpMV</th>
1065  <th>dealii</th>
1066  <th>MF</th>
1067  <th>dealii</th>
1068  </tr>
1069  <tr>
1070  <td>2D, @f$\mathcal{Q}_2@f$</td>
1071  <td align="right"> 0.0106</td>
1072  <td align="right"> 0.00971</td>
1073  <td align="right"> 0.109</td>
1074  <td align="right"> 0.0243</td>
1075  <td align="right"> 0.124</td>
1076  </tr>
1077  <tr>
1078  <td>2D, @f$\mathcal{Q}_4@f$</td>
1079  <td align="right"> 0.0328</td>
1080  <td align="right"> 0.0706</td>
1081  <td align="right"> 0.528</td>
1082  <td align="right"> 0.0714</td>
1083  <td align="right"> 0.502</td>
1084  </tr>
1085  <tr>
1086  <td>3D, @f$\mathcal{Q}_2@f$</td>
1087  <td align="right"> 0.0151</td>
1088  <td align="right"> 0.0320</td>
1089  <td align="right"> 0.331</td>
1090  <td align="right"> 0.0376</td>
1091  <td align="right"> 0.364</td>
1092  </tr>
1093  <tr>
1094  <td>3D, @f$\mathcal{Q}_4@f$</td>
1095  <td align="right"> 0.0918</td>
1096  <td align="right"> 0.844</td>
1097  <td align="right"> 6.83</td>
1098  <td align="right"> 0.194</td>
1099  <td align="right"> 6.95</td>
1100  </tr>
1101 </table>
1102 
1103 It is apparent that the matrix-free code outperforms the standard assembly
1104 routines in deal.II by far. In 3D and for fourth order elements, one operator
1105 evaluation is also almost ten times as fast as a sparse matrix-vector
1106 product.
1107 
1108 <a name="Parallelrunin2Dand3D"></a><h3>Parallel run in 2D and 3D</h3>
1109 
1110 
1111 We start with the program output obtained on a workstation with 12 cores / 24
1112 threads (one Intel Xeon E5-2687W v4 CPU running at 3.2 GHz, hyperthreading
1113 enabled), running the program in release mode:
1114 @code
1115 $ make run
1116 Number of MPI ranks: 1
1117 Number of threads on each rank: 24
1118 Vectorization over 4 doubles = 256 bits (AVX)
1119 
1120  Number of global active cells: 15412
1121  Number of degrees of freedom: 249065
1122  Time step size: 0.00292997, finest cell: 0.117188
1123 
1124  Time: -10, solution norm: 9.5599
1125  Time: -9.41, solution norm: 17.678
1126  Time: -8.83, solution norm: 23.504
1127  Time: -8.24, solution norm: 27.5
1128  Time: -7.66, solution norm: 29.513
1129  Time: -7.07, solution norm: 29.364
1130  Time: -6.48, solution norm: 27.23
1131  Time: -5.9, solution norm: 23.527
1132  Time: -5.31, solution norm: 18.439
1133  Time: -4.73, solution norm: 11.935
1134  Time: -4.14, solution norm: 5.5284
1135  Time: -3.55, solution norm: 8.0354
1136  Time: -2.97, solution norm: 14.707
1137  Time: -2.38, solution norm: 20
1138  Time: -1.8, solution norm: 22.834
1139  Time: -1.21, solution norm: 22.771
1140  Time: -0.624, solution norm: 20.488
1141  Time: -0.0381, solution norm: 16.697
1142  Time: 0.548, solution norm: 11.221
1143  Time: 1.13, solution norm: 5.3912
1144  Time: 1.72, solution norm: 8.4528
1145  Time: 2.31, solution norm: 14.335
1146  Time: 2.89, solution norm: 18.555
1147  Time: 3.48, solution norm: 20.894
1148  Time: 4.06, solution norm: 21.305
1149  Time: 4.65, solution norm: 19.903
1150  Time: 5.24, solution norm: 16.864
1151  Time: 5.82, solution norm: 12.223
1152  Time: 6.41, solution norm: 6.758
1153  Time: 6.99, solution norm: 7.2423
1154  Time: 7.58, solution norm: 12.888
1155  Time: 8.17, solution norm: 17.273
1156  Time: 8.75, solution norm: 19.654
1157  Time: 9.34, solution norm: 19.838
1158  Time: 9.92, solution norm: 17.964
1159  Time: 10, solution norm: 17.595
1160 
1161  Performed 6826 time steps.
1162  Average wallclock time per time step: 0.0013453s
1163  Spent 14.976s on output and 9.1831s on computations.
1164 @endcode
1165 
1166 In 3D, the respective output looks like
1167 @code
1168 $ make run
1169 Number of MPI ranks: 1
1170 Number of threads on each rank: 24
1171 Vectorization over 4 doubles = 256 bits (AVX)
1172 
1173  Number of global active cells: 17592
1174  Number of degrees of freedom: 1193881
1175  Time step size: 0.0117233, finest cell: 0.46875
1176 
1177  Time: -10, solution norm: 29.558
1178  Time: -7.66, solution norm: 129.13
1179  Time: -5.31, solution norm: 67.753
1180  Time: -2.97, solution norm: 79.245
1181  Time: -0.621, solution norm: 123.52
1182  Time: 1.72, solution norm: 43.525
1183  Time: 4.07, solution norm: 93.285
1184  Time: 6.41, solution norm: 97.722
1185  Time: 8.76, solution norm: 36.734
1186  Time: 10, solution norm: 94.115
1187 
1188  Performed 1706 time steps.
1189  Average wallclock time per time step: 0.0084542s
1190  Spent 16.766s on output and 14.423s on computations.
1191 @endcode
1192 
1193 It takes 0.008 seconds for one time step with more than a million
1194 degrees of freedom (note that we would need many processors to reach such
1195 numbers when solving linear systems).
1196 
1197 If we replace the thread-parallelization by a pure MPI parallelization, the
1198 timings change into:
1199 @code
1200 $ mpirun -n 24 ./step-48
1201 Number of MPI ranks: 24
1202 Number of threads on each rank: 1
1203 Vectorization over 4 doubles = 256 bits (AVX)
1204 ...
1205  Performed 1706 time steps.
1206  Average wallclock time per time step: 0.0051747s
1207  Spent 2.0535s on output and 8.828s on computations.
1208 @endcode
1209 
1210 We observe a dramatic speedup for the output (which makes sense, given that
1211 most code of the output is not parallelized via threads, whereas it is for
1212 MPI), but less than the theoretical factor of 12 we would expect from the
1213 parallelism. More interestingly, the computations also get faster when
1214 switching from the threads-only variant to the MPI-only variant. This is a
1215 general observation for the MatrixFree framework (as of updating this data in
1216 2019). The main reason is that the decisions regarding work on conflicting
1217 cell batches made to enable execution in parallel are overly pessimistic:
1218 While they ensure that no work on neighboring cells is done on different
1219 threads at the same time, this conservative setting implies that data from
1220 neighboring cells is also evicted from caches by the time neighbors get
1221 touched. Furthermore, the current scheme is not able to provide a constant
1222 load for all 24 threads for the given mesh with 17,592 cells.
1223 
 1224 The current program allows one to also mix MPI parallelization with thread
1225 parallelization. This is most beneficial when running programs on clusters
1226 with multiple nodes, using MPI for the inter-node parallelization and threads
1227 for the intra-node parallelization. On the workstation used above, we can run
1228 threads in the hyperthreading region (i.e., using 2 threads for each of the 12
1229 MPI ranks). An important setting for mixing MPI with threads is to ensure
 1230 proper pinning of tasks to CPUs. On many clusters the placement is either
 1231 done automatically via the `mpirun/mpiexec` environment, or there can be manual
 1232 settings. Here, we simply report the run times of the plain version of the
1233 program (noting that things could be improved towards the timings of the
1234 MPI-only program when proper pinning is done):
1235 @code
1236 $ mpirun -n 12 ./step-48
1237 Number of MPI ranks: 12
1238 Number of threads on each rank: 2
1239 Vectorization over 4 doubles = 256 bits (AVX)
1240 ...
1241  Performed 1706 time steps.
1242  Average wallclock time per time step: 0.0056651s
1243  Spent 2.5175s on output and 9.6646s on computations.
1244 @endcode
1245 
1246 
1247 
1248 <a name="Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
1249 
1250 
1251 There are several things in this program that could be improved to make it
1252 even more efficient (besides improved boundary conditions and physical
1253 stuff as discussed in @ref step_25 "step-25"):
1254 
1255 <ul> <li> <b>Faster evaluation of sine terms:</b> As becomes obvious
1256  from the comparison of the plain wave equation and the sine-Gordon
1257  equation above, the evaluation of the sine terms dominates the total
1258  time for the finite element operator application. There are a few
1259  reasons for this: Firstly, the deal.II sine computation of a
1260  VectorizedArray field is not vectorized (as opposed to the rest of
1261  the operator application). This could be cured by handing the sine
1262  computation to a library with vectorized sine computations like
1263  Intel's math kernel library (MKL). By using the function
1264  <code>vdSin</code> in MKL, the program uses half the computing time
1265  in 2D and 40 percent less time in 3D. On the other hand, the sine
1266  computation is structurally much more complicated than the simple
1267  arithmetic operations like additions and multiplications in the rest
1268  of the local operation.
1269 
1270  <li> <b>Higher order time stepping:</b> While the implementation allows for
1271  arbitrary order in the spatial part (by adjusting the degree of the finite
1272  element), the time stepping scheme is a standard second-order leap-frog
1273  scheme. Since solutions in wave propagation problems are usually very
1274  smooth, the error is likely dominated by the time stepping part. Of course,
1275  this could be cured by using smaller time steps (at a fixed spatial
1276  resolution), but it would be more efficient to use higher order time
1277  stepping as well. While it would be straight-forward to do so for a
1278  first-order system (use some Runge&ndash;Kutta scheme of higher order,
1279  probably combined with adaptive time step selection like the <a
1280  href="http://en.wikipedia.org/wiki/Dormand%E2%80%93Prince_method">Dormand&ndash;Prince
1281  method</a>), it is more challenging for the second-order formulation. At
1282  least in the finite difference community, people usually use the PDE to find
1283  spatial correction terms that improve the temporal error.
1284 
1285 </ul>
1286  *
1287  *
1288 <a name="PlainProg"></a>
1289 <h1> The plain program</h1>
1290 @include "step-48.cc"
1291 */
void swap(BlockIndices &u, BlockIndices &v)
void attach_dof_handler(const DoFHandlerType &)
void add_data_vector(const VectorType &data, const std::vector< std::string > &names, const DataVectorType type=type_automatic, const std::vector< DataComponentInterpretation::DataComponentInterpretation > &data_component_interpretation=std::vector< DataComponentInterpretation::DataComponentInterpretation >())
virtual void build_patches(const unsigned int n_subdivisions=0)
Definition: data_out.cc:1085
Definition: fe_q.h:549
void reinit(const MappingType &mapping, const DoFHandler< dim > &dof_handler, const AffineConstraints< number2 > &constraint, const QuadratureType &quad, const AdditionalData &additional_data=AdditionalData())
static unsigned int n_threads()
Definition: timer.h:119
double wall_time() const
Definition: timer.cc:264
void restart()
Definition: timer.h:914
Definition: vector.h:110
#define DEAL_II_WITH_P4EST
Definition: config.h:56
Point< 3 > center
Point< 2 > second
Definition: grid_out.cc:4588
Point< 2 > first
Definition: grid_out.cc:4587
unsigned int level
Definition: grid_out.cc:4590
__global__ void set(Number *val, const Number s, const size_type N)
std::string write_vtu_with_pvtu_record(const std::string &directory, const std::string &filename_without_extension, const unsigned int counter, const MPI_Comm &mpi_communicator, const unsigned int n_digits_for_counter=numbers::invalid_unsigned_int, const unsigned int n_groups=0) const
void loop(ITERATOR begin, typename identity< ITERATOR >::type end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
Definition: loop.h:439
void make_hanging_node_constraints(const DoFHandler< dim, spacedim > &dof_handler, AffineConstraints< number > &constraints)
const Event initial
Definition: event.cc:65
Expression cosh(const Expression &x)
Expression atan(const Expression &x)
void extract_locally_relevant_dofs(const DoFHandler< dim, spacedim > &dof_handler, IndexSet &dof_set)
Definition: dof_tools.cc:1210
void interpolate(const DoFHandler< dim, spacedim > &dof1, const InVector &u1, const DoFHandler< dim, spacedim > &dof2, OutVector &u2)
void hyper_cube(Triangulation< dim, spacedim > &tria, const double left=0., const double right=1., const bool colorize=false)
void refine(Triangulation< dim, spacedim > &tria, const Vector< Number > &criteria, const double threshold, const unsigned int max_to_mark=numbers::invalid_unsigned_int)
void shift(const Tensor< 1, spacedim > &shift_vector, Triangulation< dim, spacedim > &triangulation)
Definition: grid_tools.cc:2022
double diameter(const Triangulation< dim, spacedim > &tria)
Definition: grid_tools.cc:81
@ matrix
Contents is actually a matrix.
@ diagonal
Matrix is diagonal.
@ general
No special properties.
static const types::blas_int one
double norm(const FEValuesBase< dim > &fe, const ArrayView< const std::vector< Tensor< 1, dim >>> &Du)
Definition: divergence.h:472
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
Definition: utilities.cc:188
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > b(const Tensor< 2, dim, Number > &F)
void call(const std::function< RT()> &function, internal::return_value< RT > &ret_val)
void free(T *&pointer)
Definition: cuda.h:97
unsigned int this_mpi_process(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:128
unsigned int n_mpi_processes(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:117
T max(const T &t, const MPI_Comm &mpi_communicator)
std::string get_time()
Definition: utilities.cc:1019
const std::string get_current_vectorization_level()
Definition: utilities.cc:944
double compute_global_error(const Triangulation< dim, spacedim > &tria, const InVector &cellwise_error, const NormType &norm, const double exponent=2.)
void integrate_difference(const Mapping< dim, spacedim > &mapping, const DoFHandler< dim, spacedim > &dof, const InVector &fe_function, const Function< spacedim, typename InVector::value_type > &exact_solution, OutVector &difference, const Quadrature< dim > &q, const NormType &norm, const Function< spacedim, double > *weight=nullptr, const double exponent=2.)
void run(const Iterator &begin, const typename identity< Iterator >::type &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
Definition: work_stream.h:691
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
static const unsigned int invalid_unsigned_int
Definition: types.h:196
::VectorizedArray< Number, width > sin(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > sqrt(const ::VectorizedArray< Number, width > &)
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation
TasksParallelScheme tasks_parallel_scheme
Definition: matrix_free.h:344