% \iffalse meta-comment
%
%% File: l3benchmark.dtx
% 
% Copyright (C) 2011-2024 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    http://www.latex-project.org/lppl.txt
%
% This file is part of the "l3experimental bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver|package>
\RequirePackage{expl3}
%</driver|package>
%<*driver>
\documentclass[full]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3benchmark} package\\ Experimental benchmarking^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2024-03-14}
%
% \maketitle
%
% \begin{documentation}
%
% \section{Benchmark}
%
% \begin{variable}{\g_benchmark_duration_target_fp}
%   This variable (default value: $1$) controls roughly for how long
%   \cs{benchmark:n} will repeat code to more accurately benchmark it.
%   The actual duration of one call to \cs{benchmark:n} typically lasts
%   between half and twice \cs{g_benchmark_duration_target_fp} seconds,
%   unless of course running the code only once already lasts longer
%   than this.
% \end{variable}
%
% \begin{variable}{\g_benchmark_time_fp, \g_benchmark_ops_fp}
%   These variables store the results of the most recently run benchmark.
%   \cs{g_benchmark_time_fp} stores the time \TeX{} took in seconds, and
%   \cs{g_benchmark_ops_fp} stores the estimated number of elementary
%   operations. The latter is not set by
%   \cs{benchmark_tic:}/\cs{benchmark_toc:}.
% \end{variable}
%
% \begin{function}{\benchmark_once:n, \benchmark_once_silent:n}
%   \begin{syntax}
%     \cs{benchmark_once_silent:n} \Arg{code}
%     \cs{benchmark_once:n} \Arg{code}
%   \end{syntax}
%   Determines the time \cs{g_benchmark_time_fp} (in seconds) taken by
%   \TeX{} to run the \meta{code}, and an estimated number
%   \cs{g_benchmark_ops_fp} of elementary operations.  In addition,
%   \cs{benchmark_once:n} prints these values to the terminal.  The
%   \meta{code} is run only once so the time may be quite inaccurate for
%   fast code.
% \end{function}
%
% \begin{function}{\benchmark:n, \benchmark_silent:n}
%   \begin{syntax}
%     \cs{benchmark:n} \Arg{code}
%   \end{syntax}
%   Determines the time \cs{g_benchmark_time_fp} (in seconds) taken by
%   \TeX{} to run the \meta{code}, and an estimated number
%   \cs{g_benchmark_ops_fp} of elementary operations.  In addition,
%   \cs{benchmark:n} prints these values to the terminal.  The
%   \meta{code} may be run many times and not within a group, thus code
%   with side-effects may cause problems.
% \end{function}
%
% \begin{function}{\benchmark_tic:, \benchmark_toc:}
%   \begin{syntax}
%     \cs{benchmark_tic:} \meta{slow code} \cs{benchmark_toc:}
%   \end{syntax}
%   When it is not possible to run \cs{benchmark:n} (e.g., the code is
%   part of the execution of a package which cannot be looped) the
%   tic/toc commands can be used instead to time between two points in
%   the code.  When executed, \cs{benchmark_tic:} will print a line to the
%   terminal, and \cs{benchmark_toc:} will print a matching line with a
%   time to indicate the duration between them in seconds.
%   These commands can be nested.
% \end{function}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3benchmark} implementation}
%
% Our working unit is the scaled second, namely $2^{-16}$ seconds.
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
\ProvidesExplPackage{l3benchmark}{2024-03-14}{}
  {L3 Experimental benchmarking}
%    \end{macrocode}
%
% \subsection{Benchmarking code}
%
%    \begin{macrocode}
%<@@=benchmark>
%    \end{macrocode}
%
% \begin{variable}{\g_benchmark_duration_target_fp}
%   The benchmark is constrained to take roughly (from half to twice)
%   \cs{g_benchmark_duration_target_fp} seconds, unless one iteration of
%   the code takes longer.
%    \begin{macrocode}
\fp_new:N \g_benchmark_duration_target_fp
\fp_gset:Nn \g_benchmark_duration_target_fp { 1 }
%    \end{macrocode}
% \end{variable}
%
% Having access to the system time is essential.
%    \begin{macrocode}
\sys_if_timer_exist:F
  {
    \fp_gset:Nn \g_benchmark_duration_target_fp { nan }
    \cs_new_protected:Npn \benchmark_once:n #1
      { \msg_error:nn { benchmark } { no-time } }
    \cs_new_eq:NN \benchmark:n \benchmark_once:n
    \cs_new_eq:NN \benchmark_once_silent:n \benchmark_once:n
    \cs_new_eq:NN \benchmark_silent:n \benchmark_once:n
    \cs_new_protected:Npn \benchmark_tic:
      { \msg_error:nn { benchmark } { no-time } }
    \cs_new_eq:NN \benchmark_toc: \benchmark_tic:
    \msg_new:nnnn { benchmark } { no-time }
      { The~l3benchmark~package~failed~to~access~a~clock. }
      { The~current~engine~provides~no~way~to~access~the~system~time. }
    \msg_critical:nn { benchmark } { no-time }
  }
%    \end{macrocode}
%
% \subsubsection{Raw measurement}
%
% \begin{variable}{\g_@@_nesting_int}
% \begin{macro}{\@@_raw:nN, \@@_raw_aux:N, \@@_raw_end:N}
%   Store in the given integer variable the time it took to perform a given
%   piece of code, in scaled seconds.  We call \cs{sys_timer:} as
%   close before and after the code as possible.  We store the
%   intermediate result in a new integer when \cs{@@_raw:nN} is
%   nested.
%    \begin{macrocode}
\int_new:N \g_@@_nesting_int
\cs_new_protected:Npn \@@_raw:nN #1
  {
    \int_gincr:N \g_@@_nesting_int
    \exp_args:Nc \@@_raw_aux:N
      { g_@@_ \int_use:N \g_@@_nesting_int _int }
    \@@_raw_aux:
    #1
    \@@_raw_end:N
  }
\cs_new_protected:Npn \@@_raw_aux:N #1
  {
    \int_gzero_new:N #1
    \cs_gset_protected:Npn \@@_raw_aux: { \int_gset:Nn #1 { \sys_timer: } }
  }
\cs_new_protected:Npn \@@_raw_end:N #1
  {
    \int_gset:Nn #1
      {
        \sys_timer: -
        \int_use:c { g_@@_ \int_use:N \g_@@_nesting_int _int }
      }
    \int_gdecr:N \g_@@_nesting_int
  }
%    \end{macrocode}
% \end{macro}
% \end{variable}
%
% \begin{macro}{\@@_raw_replicate:nnN, \@@_tmp:w}
%   Here, we wish to measure the time it takes for the piece of code
%   |#2| to be run |#1| times, and store the result in the
%   integer~|#3|.
%
%   If the number of copies required is large, defining \cs{@@_tmp:w}
%   would exhaust \TeX{}'s main memory. In that case, we replicate
%   $|#1|/5000$ times the given code before passing it to the main call
%   to \cs{@@_tmp:w}.  Of course the division rounds to an integer, so
%   that step introduces a relative error of order at most
%   $5000/500000$, less than many other sources of variability.
%
%   We subtract the time for another call to \cs{@@_tmp:w}, with the
%   same arguments (to capture the time it takes to read the argument)
%   but empty expansion.
%    \begin{macrocode}
\cs_new_eq:NN \@@_tmp:w ?
\cs_new_protected:Npn \@@_raw_replicate:nnN #1
  {
    \int_compare:nNnTF {#1} > { 500000 }
      { \@@_raw_replicate_large:nnN {#1} }
      { \@@_raw_replicate_small:nnN {#1} }
  }
\cs_new_protected:Npn \@@_raw_replicate_large:nnN #1#2
  {
    \cs_set:Npe \@@_tmp:w ##1 { \prg_replicate:nn { 5000 } {##1} }
    \exp_args:Nno \@@_raw_replicate:nnN { #1 / 5000 }
      { \@@_tmp:w {#2} }
  }
\cs_new_protected:Npn \@@_raw_replicate_small:nnN #1#2
  {
    \cs_set:Npe \@@_tmp:w ##1##2 { \prg_replicate:nn {#1} {##1} }
    \@@_raw:nN { \@@_tmp:w {#2} { } } \g_@@_time_int
    \exp_args:No \@@_raw_replicate_aux:nnN
      { \int_use:N \g_@@_time_int } {#2}
  }
\cs_new_protected:Npn \@@_raw_replicate_aux:nnN #1#2#3
  {
    \@@_raw:nN { \@@_tmp:w { } {#2} } \g_@@_time_int
    \int_gset:Nn #3 { #1 - \g_@@_time_int }
    \cs_set_eq:NN \@@_tmp:w \prg_do_nothing:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Main benchmarking}
%
% \begin{variable}{\g_benchmark_time_fp, \g_benchmark_ops_fp}
%   Functions such as \cs{benchmark:n} store the measured time in
%   \cs{g_benchmark_time_fp} (in seconds) and the estimated number of
%   operations in \cs{g_benchmark_ops_fp}.
%    \begin{macrocode}
\fp_new:N \g_benchmark_time_fp
\fp_new:N \g_benchmark_ops_fp
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_duration_int}
%   A conversion of \cs{g_benchmark_duration_target_fp} seconds into scaled seconds.
%    \begin{macrocode}
\int_new:N \g_@@_duration_int
%    \end{macrocode}
% \end{variable}
% 
% \begin{variable}{\g_@@_time_int, \g_@@_time_a_int, \g_@@_time_b_int, \g_@@_time_c_int, \g_@@_time_d_int}
%   These variables hold the time for running a piece of code, as an
%   integer in scaled seconds.
%    \begin{macrocode}
\int_new:N \g_@@_time_int
\int_new:N \g_@@_time_a_int
\int_new:N \g_@@_time_b_int
\int_new:N \g_@@_time_c_int
\int_new:N \g_@@_time_d_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_repeat_int}
%   Holds the number of times that the piece of code was
%   repeated when timing.
%    \begin{macrocode}
\int_new:N \g_@@_repeat_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_code_tl}
%   Holds the piece of code to repeat.
%    \begin{macrocode}
\tl_new:N \g_@@_code_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\benchmark_once:n, \benchmark_once_silent:n}
%   Convert the raw time from scaled seconds to seconds, and convert to
%   a number of operations.  It is important to measure the elementary
%   operation before running the user code because both measurements use
%   the same temporary variables.
%    \begin{macrocode}
\cs_new_protected:Npn \benchmark_once:n #1
  {
    \benchmark_once_silent:n {#1}
    \@@_display:
  }
\cs_new_protected:Npn \benchmark_once_silent:n #1
  {
    \@@_measure_op:
    \@@_raw:nN {#1} \g_@@_time_int
    \fp_gset:Nn \g_benchmark_time_fp { \g_@@_time_int / 65536 }
    \fp_gset:Nn \g_benchmark_ops_fp { \g_benchmark_time_fp / \g_@@_one_op_fp }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\benchmark:n}
%   After setting up some variables the work is done by \cs{@@_aux:}.
%    \begin{macrocode}
\cs_new_protected:Npn \benchmark:n #1
  {
    \benchmark_silent:n {#1}
    \@@_display:
  }
\cs_new_protected:Npn \benchmark_silent:n #1
  {
    \@@_measure_op:
    \tl_gset:Nn \g_@@_code_tl {#1}
    \@@_aux:
    \fp_gset:Nn \g_benchmark_ops_fp { \g_benchmark_time_fp / \g_@@_one_op_fp }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_aux:}
%   The main timing function. First time the user code once.  If that
%   took more than half the allotted time (\cs{g_@@_duration_int}) we're
%   done.  If that took much less, repeatedly quadruple the number of
%   copies until it takes a reasonable amount of time.  Once we reach a
%   reasonable time (or we risk an overflow), compute a number of times
%   that can fit in one quarter of the allotted time and measure that
%   four times.  To save time we reuse the result of the first pass if
%   \cs{g_@@_repeat_int} is one.  Once we have four results, find the
%   smallest, divided by $65536$ and by the number of repetitions, and
%   display that.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_aux:
  {
    \int_gset:Nn \g_@@_repeat_int { 1 }
    \@@_raw:nN { \g_@@_code_tl } \g_@@_time_int
    \int_compare:nNnF \g_@@_time_int < { \g_@@_duration_int / 2 }
      { \prg_break: }
    \bool_until_do:nn
      {
        \int_compare_p:nNn \g_@@_time_int > { \g_@@_duration_int / 32 }
        || \int_compare_p:nNn \g_@@_repeat_int > { \c_max_int / 4 }
      }
      {
        \int_gset:Nn \g_@@_repeat_int { 4 * \g_@@_repeat_int }
        \@@_run:N \g_@@_time_int
      }
    \int_gset:Nn \g_@@_repeat_int
      {
        \fp_to_int:n
          {
            max ( 1 , min ( \c_max_int ,
            \g_@@_duration_int * \g_@@_repeat_int /
            \int_eval:n { 4 * \g_@@_time_int } ) )
          }
      }
    \int_compare:nNnTF \g_@@_repeat_int = 1
      { \int_gset_eq:NN \g_@@_time_a_int \g_@@_time_int }
      { \@@_run:N \g_@@_time_a_int }
    \@@_run:N \g_@@_time_b_int
    \@@_run:N \g_@@_time_c_int
    \@@_run:N \g_@@_time_d_int
    \int_gset:Nn \g_@@_time_int
      {
        \int_min:nn
          { \int_min:nn \g_@@_time_a_int \g_@@_time_b_int }
          { \int_min:nn \g_@@_time_c_int \g_@@_time_d_int }
      }
    \prg_break_point:
    \int_compare:nNnT \g_@@_time_int < 3 { \int_gzero:N \g_@@_time_int }
    \fp_gset:Nn \g_benchmark_time_fp
      { \g_@@_time_int / \g_@@_repeat_int / 65536 }
  }
\cs_new_protected:Npn \@@_run:N
  { \exp_args:NNo \@@_raw_replicate:nnN \g_@@_repeat_int { \g_@@_code_tl } }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Display}
%
% \begin{variable}{\g_@@_one_op_fp}
%   Time for one operation.
%    \begin{macrocode}
\fp_new:N \g_@@_one_op_fp
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_measure_op:}
%   Measure one arbitrary single operation (which we put in \cs{g_@@_code_tl}).
%   This uses a common auxiliary \cs{@@_aux:} with the main benchmark function.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_measure_op:
  {
    \int_gset:Nn \g_@@_duration_int
      { \fp_to_int:n { 65536 * \g_benchmark_duration_target_fp } / 4 }
    \tl_gset:Nn \g_@@_code_tl
      { \int_gadd:Nn \g_@@_duration_int { 0 } }
    \@@_aux:
    \fp_gset:Nn \g_@@_one_op_fp { max(\g_benchmark_time_fp, 1e-8) }
    \int_gset:Nn \g_@@_duration_int
      { \fp_to_int:n { 65536 * \g_benchmark_duration_target_fp } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_fp_to_tl:N, \@@_fp_to_tl_aux:nN}
%   Similar to \cs{fp_to_tl:N} but rounds to $3$ significant digits and
%   uses scientific notation starting from |1e3|.
%    \begin{macrocode}
\cs_new:Npn \@@_fp_to_tl:N #1
  {
    \fp_compare:nTF { abs(#1) < 1000 }
      { \fp_to_tl:n { round(#1, 2 - logb(#1)) } }
      {
        \exp_args:Nf \@@_fp_to_tl_aux:nN
          { \fp_to_int:n { logb(#1) } } #1
      }
  }
\cs_new:Npn \@@_fp_to_tl_aux:nN #1#2
  { \fp_to_tl:n { round(#2 * 1e-#1, 2) } e#1 }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_display:}
%   Function to display the time that was measured and the estimated
%   number of operations.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_display:
  {
    \iow_term:e
      {
        \@@_fp_to_tl:N \g_benchmark_time_fp \c_space_tl seconds \c_space_tl
        ( \@@_fp_to_tl:N \g_benchmark_ops_fp \c_space_tl ops)
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Benchmark tic toc}
%
% \begin{variable}{\g_@@_tictoc_int, \g_@@_tictoc_seq, \l_@@_tictoc_pop_tl}
%    \begin{macrocode}
\int_new:N \g_@@_tictoc_int
\seq_new:N \g_@@_tictoc_seq
\tl_new:N \l_@@_tictoc_pop_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{\@@_tictoc_prefix:}
%   We include the package name in analogy with continuation lines of
%   error/warning messages.
%    \begin{macrocode}
\cs_new:Npn \@@_tictoc_prefix:
  {
    (l3benchmark) \c_space_tl
    + \prg_replicate:nn { \g_@@_tictoc_int } { -+ } \c_space_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\benchmark_tic:}
%    \begin{macrocode}
\cs_new_protected:Npn \benchmark_tic:
  {
    \iow_term:e { \@@_tictoc_prefix: TIC }
    \exp_args:NNf \seq_gput_right:Nn \g_@@_tictoc_seq { \sys_timer: }
    \int_gincr:N \g_@@_tictoc_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\benchmark_toc:, \@@_toc:}
%    \begin{macrocode}
\cs_new:Npn \benchmark_toc:
  {
    \seq_gpop_right:NNTF \g_@@_tictoc_seq \l_@@_tictoc_pop_tl
      { \@@_toc: }
      { \msg_error:nn { benchmark } { toc-first } }
  }
\cs_new_protected:Npn \@@_toc:
  {
    \int_gdecr:N \g_@@_tictoc_int
    \fp_gset:Nn \g_benchmark_time_fp
      { ( \sys_timer: - \l_@@_tictoc_pop_tl ) / 65536 }
    \iow_term:e
      {
        \@@_tictoc_prefix:
        TOC: \c_space_tl
        \@@_fp_to_tl:N \g_benchmark_time_fp \c_space_tl s
      }
  }
\msg_new:nnn { benchmark } { toc-first }
  {
    \token_to_str:N \benchmark_toc: \c_space_tl without~
    \token_to_str:N \benchmark_tic: \c_space_tl !
  }
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex