1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.rng.examples.jmh.core;
19  
20  import java.util.concurrent.ThreadLocalRandom;
21  import java.util.concurrent.TimeUnit;
22  import java.util.function.LongSupplier;
23  import java.util.stream.LongStream;
24  import org.apache.commons.rng.JumpableUniformRandomProvider;
25  import org.apache.commons.rng.simple.RandomSource;
26  import org.openjdk.jmh.annotations.Benchmark;
27  import org.openjdk.jmh.annotations.BenchmarkMode;
28  import org.openjdk.jmh.annotations.Fork;
29  import org.openjdk.jmh.annotations.Level;
30  import org.openjdk.jmh.annotations.Measurement;
31  import org.openjdk.jmh.annotations.Mode;
32  import org.openjdk.jmh.annotations.OutputTimeUnit;
33  import org.openjdk.jmh.annotations.Param;
34  import org.openjdk.jmh.annotations.Scope;
35  import org.openjdk.jmh.annotations.Setup;
36  import org.openjdk.jmh.annotations.State;
37  import org.openjdk.jmh.annotations.Warmup;
38  
39  /**
40   * Executes a benchmark for operations used in the LXM family of RNGs.
41   *
42   * <h2>Note</h2>
43   *
44   * <p>Some code in this benchmark is commented out. It requires a higher
45   * version of Java than the current target. Bumping the JMH module to a higher
46   * minimum Java version prevents running benchmarks on the target JVM for
47   * the Commons RNG artifacts. Thus these benchmarks must be manually reinstated by
48   * uncommenting the code and updating the Java version in the module pom.
49   */
50  @BenchmarkMode(Mode.AverageTime)
51  @OutputTimeUnit(TimeUnit.NANOSECONDS)
52  @Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
53  @Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
54  @State(Scope.Benchmark)
55  @Fork(value = 1, jvmArgs = { "-server", "-Xms128M", "-Xmx128M" })
56  public class LXMBenchmark {
57      /** Low half of 128-bit LCG multiplier. The upper half is {@code 1L}. */
58      static final long ML = 0xd605bbb58c8abbfdL;
59  
60      /** Baseline for generation of 1 long value. */
61      private static final String BASELINE1 = "baseline1";
62      /** Baseline for generation of 2 long values. */
63      private static final String BASELINE2 = "baseline2";
64      /** Baseline for generation of 4 long values. */
65      private static final String BASELINE4 = "baseline4";
66      /** Reference implementation. */
67      private static final String REFERENCE = "reference";
68      /** Branchless implementation. */
69      private static final String BRANCHLESS = "branchless";
70  
71      /**
72       * Encapsulates a method to compute an unsigned multiply of 64-bit values to create
73       * the upper and optionally low 64-bits of the 128-bit result.
74       *
75       * <p>This tests methods used to compute an update of a 128-bit linear congruential
76       * generator (LCG).
77       */
78      @State(Scope.Benchmark)
79      public static class UnsignedMultiplyHighSource {
80          /** A mask to convert an {@code int} to an unsigned integer stored as a {@code long}. */
81          private static final long INT_TO_UNSIGNED_BYTE_MASK = 0xffff_ffffL;
82          /** Precomputed upper split (32-bits) of the low half of the 128-bit multiplier constant. */
83          private static final long X = ML >>> 32;
84          /** Precomputed lower split (32-bits) of the low half of the 128-bit multiplier constant. */
85          private static final long Y = ML & INT_TO_UNSIGNED_BYTE_MASK;
86  
87          /**
88           * The method to compute the value.
89           */
90          @Param({BASELINE1,
91              // Require JDK 9+
92              //"mathMultiplyHigh", "mathMultiplyHighWithML",
93              // Require JDK 18+
94              //"mathUnsignedMultiplyHigh", "mathUnsignedMultiplyHighWithML",
95              "unsignedMultiplyHigh", "unsignedMultiplyHighWithML",
96              "unsignedMultiplyHighML",
97              "unsignedMultiplyHighPlusMultiplyLow", "unsignedMultiplyHighAndLow",
98              })
99          private String method;
100 
101         /** Flag to indicate numbers should be precomputed.
102          * Note: The multiply method is extremely fast and number generation can take
103          * a significant part of the overall generation time. */
104         @Param({"true", "false"})
105         private boolean precompute;
106 
107         /** The generator of the next value. */
108         private LongSupplier gen;
109 
110         /**
111          * Compute the next value.
112          *
113          * @return the value
114          */
115         long next() {
116             return gen.getAsLong();
117         }
118 
119         /**
120          * Create the generator of output values.
121          */
122         @Setup
123         public void setup() {
124             final JumpableUniformRandomProvider rng =
125                 (JumpableUniformRandomProvider) RandomSource.XO_RO_SHI_RO_128_PP.create();
126 
127             LongSupplier ga;
128             LongSupplier gb;
129 
130             // Optionally precompute numbers.
131             if (precompute) {
132                 final long[] values = LongStream.generate(rng::nextLong).limit(1024).toArray();
133                 class A implements LongSupplier {
134                     private int i;
135                     @Override
136                     public long getAsLong() {
137                         return values[i++ & 1023];
138                     }
139                 }
140                 ga = new A();
141                 class B implements LongSupplier {
142                     private int i;
143                     @Override
144                     public long getAsLong() {
145                         // Using any odd increment will sample all values (after wrapping)
146                         return values[(i += 3) & 1023];
147                     }
148                 }
149                 gb = new B();
150             } else {
151                 ga = rng::nextLong;
152                 gb = rng.jump()::nextLong;
153             }
154 
155             if (BASELINE1.equals(method)) {
156                 gen = () -> ga.getAsLong();
157             } else if (BASELINE2.equals(method)) {
158                 gen = () -> ga.getAsLong() ^ gb.getAsLong();
159             } else if ("mathMultiplyHigh".equals(method)) {
160                 gen = () -> mathMultiplyHigh(ga.getAsLong(), gb.getAsLong());
161             } else if ("mathMultiplyHighWithML".equals(method)) {
162                 gen = () -> mathMultiplyHigh(ga.getAsLong(), ML);
163             } else if ("mathUnsignedMultiplyHigh".equals(method)) {
164                 gen = () -> mathUnsignedMultiplyHigh(ga.getAsLong(), gb.getAsLong());
165             } else if ("mathUnsignedMultiplyHighWithML".equals(method)) {
166                 gen = () -> mathUnsignedMultiplyHigh(ga.getAsLong(), ML);
167             } else if ("unsignedMultiplyHigh".equals(method)) {
168                 gen = () -> unsignedMultiplyHigh(ga.getAsLong(), gb.getAsLong());
169             } else if ("unsignedMultiplyHighWithML".equals(method)) {
170                 gen = () -> unsignedMultiplyHigh(ga.getAsLong(), ML);
171             } else if ("unsignedMultiplyHighML".equals(method)) {
172                 // Note:
173                 // Running this benchmark should show the explicit precomputation
174                 // of the ML parts does not increase performance. The JVM can
175                 // optimise the call to unsignedMultiplyHigh(a, b) when b is constant.
176                 gen = () -> unsignedMultiplyHighML(ga.getAsLong());
177             } else if ("unsignedMultiplyHighPlusMultiplyLow".equals(method)) {
178                 gen = () -> {
179                     final long a = ga.getAsLong();
180                     final long b = gb.getAsLong();
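                        // The low 64 bits of the unsigned 128-bit product are simply a * b:
                        // Java long multiplication already truncates to the low 64 bits.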
181                     return unsignedMultiplyHigh(a, b) ^ (a * b);
182                 };
183             } else if ("unsignedMultiplyHighAndLow".equals(method)) {
184                 // Note:
185                 // Running this benchmark should show this method is slower.
186                 // A CPU supporting parallel instructions can multiply (a*b)
187                 // in parallel with the call to unsignedMultiplyHigh.
188                 final long[] lo = {0};
189                 gen = () -> {
190                     final long a = ga.getAsLong();
191                     final long b = gb.getAsLong();
192                     final long hi = unsignedMultiplyHigh(a, b, lo);
193                     return hi ^ lo[0];
194                 };
195             } else {
196                 throw new IllegalStateException("Unknown method: " + method);
197             }
198         }
199 
200         /**
201          * Compute the unsigned multiply of two values using Math.multiplyHigh.
202          * <p>Requires JDK 9.
203          * From JDK 10 onwards this method is an intrinsic candidate.
204          *
205          * @param a First value
206          * @param b Second value
207          * @return the upper 64-bits of the 128-bit result
208          */
209         static long mathMultiplyHigh(long a, long b) {
210             // Requires JDK 9
211             // Note: Using runtime reflection to create a method handle
212             // has not been done to avoid any possible artifact during timing
213             // of this very fast method. To benchmark with this method
214             // requires uncommenting this code and compiling with an
215             // appropriate source level.
216             //return Math.multiplyHigh(a, b) + ((a >> 63) & b) + ((b >> 63) & a);
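                // Math.multiplyHigh returns the signed high half; the two masked terms add
                // back the other operand when a (or b) is negative, converting the result
                // to the unsigned high half.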
217             throw new NoSuchMethodError();
218         }
219 
220         /**
221          * Compute the unsigned multiply of two values using Math.unsignedMultiplyHigh.
222          * <p>Requires JDK 18.
223          * This method is an intrinsic candidate.
224          *
225          * @param a First value
226          * @param b Second value
227          * @return the upper 64-bits of the 128-bit result
228          */
229         static long mathUnsignedMultiplyHigh(long a, long b) {
230             // Requires JDK 18
231             //return Math.unsignedMultiplyHigh(a, b);
232             throw new NoSuchMethodError();
233         }
234 
235         /**
236          * Multiply the two values as if unsigned 64-bit longs to produce the high 64-bits
237          * of the 128-bit unsigned result.
238          *
239          * <p>This method computes the equivalent of:
240          * <pre>
241          * Math.multiplyHigh(a, b) + ((a >> 63) & b) + ((b >> 63) & a)
242          * </pre>
243          *
244          * <p>Note: The method {@code Math.multiplyHigh} was added in JDK 9
245          * and should be used as above when the source code targets Java 11
246          * to exploit the intrinsic method.
247          *
248          * <p>Note: The method {@code Math.unsignedMultiplyHigh} was added in JDK 18
249          * and should be used when the source code target allows.
250          *
251          * @param value1 the first value
252          * @param value2 the second value
253          * @return the high 64-bits of the 128-bit result
254          */
255         static long unsignedMultiplyHigh(long value1, long value2) {
256             // Computation is based on the following observation about the upper (a and x)
257             // and lower (b and y) bits of unsigned big-endian integers:
258             //   ab * xy
259             // =  b *  y
260             // +  b * x0
261             // + a0 *  y
262             // + a0 * x0
263             // = b * y
264             // + b * x * 2^32
265             // + a * y * 2^32
266             // + a * x * 2^64
267             //
268             // Summation using a character for each byte:
269             //
270             //             byby byby
271             // +      bxbx bxbx 0000
272             // +      ayay ayay 0000
273             // + axax axax 0000 0000
274             //
275             // The summation can be rearranged to ensure no overflow given
276             // that the result of two unsigned 32-bit integers multiplied together
277             // plus two full 32-bit integers cannot overflow 64 bits:
278             // > long x = (1L << 32) - 1
279             // > x * x + x + x == -1 (all bits set, no overflow)
280             //
281             // The carry is a composed intermediate which will never overflow:
282             //
283             //             byby byby
284             // +           bxbx 0000
285             // +      ayay ayay 0000
286             //
287             // +      bxbx 0000 0000
288             // + axax axax 0000 0000
289 
290             final long a = value1 >>> 32;
291             final long b = value1 & INT_TO_UNSIGNED_BYTE_MASK;
292             final long x = value2 >>> 32;
293             final long y = value2 & INT_TO_UNSIGNED_BYTE_MASK;
294 
295 
296             final long by = b * y;
297             final long bx = b * x;
298             final long ay = a * y;
299             final long ax = a * x;
300 
301             // Cannot overflow
302             final long carry = (by >>> 32) +
303                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
304                                 ay;
305             // Note:
306             // low = (carry << 32) | (by & INT_TO_UNSIGNED_BYTE_MASK)
307 
308             return (bx >>> 32) + (carry >>> 32) + ax;
309         }
310 
311         /**
312          * Multiply the two values as if unsigned 64-bit longs to produce the high
313          * 64-bits of the 128-bit unsigned result.
314          *
315          * @param value1 the first value
316          * @param value2 the second value
317          * @param low low 64-bits of the 128-bit result
318          * @return the high 64-bits of the 128-bit result
319          */
320         static long unsignedMultiplyHigh(long value1, long value2, long[] low) {
321             final long a = value1 >>> 32;
322             final long b = value1 & INT_TO_UNSIGNED_BYTE_MASK;
323             final long x = value2 >>> 32;
324             final long y = value2 & INT_TO_UNSIGNED_BYTE_MASK;
325 
326 
327             final long by = b * y;
328             final long bx = b * x;
329             final long ay = a * y;
330             final long ax = a * x;
331 
332             final long carry = (by >>> 32) +
333                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
334                                 ay;
335 
336             low[0] = (carry << 32) | (by & INT_TO_UNSIGNED_BYTE_MASK);
337 
338             return (bx >>> 32) + (carry >>> 32) + ax;
339         }
340 
341         /**
342          * Multiply the value as an unsigned 64-bit long by the constant
343          * {@code ML = 0xd605bbb58c8abbfdL} to produce the high 64-bits
344          * of the 128-bit unsigned result.
345          *
346          * <p>This is a specialised version of {@link #unsignedMultiplyHigh(long, long)}
347          * for use in the 128-bit LCG sub-generator of the LXM family.
348          *
349          * @param value the value
350          * @return the high 64-bits of the 128-bit result
351          */
352         static long unsignedMultiplyHighML(long value) {
353             final long a = value >>> 32;
354             final long b = value & INT_TO_UNSIGNED_BYTE_MASK;
355 
356             final long by = b * Y;
357             final long bx = b * X;
358             final long ay = a * Y;
359             final long ax = a * X;
360 
361             // Cannot overflow
362             final long carry = (by >>> 32) +
363                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
364                                 ay;
365 
366             return (bx >>> 32) + (carry >>> 32) + ax;
367         }
368     }
369 
370     /**
371      * Encapsulates a method to compute an unsigned multiply of two 128-bit values to create
372      * a truncated 128-bit result. The upper 128-bits of the 256-bit result are discarded.
373      * The upper and optionally low 64-bits of the truncated 128-bit result are computed.
374      *
375      * <p>This tests methods used during computation of jumps of size {@code 2^k} for a
376      * 128-bit linear congruential generator (LCG).
377      */
378     @State(Scope.Benchmark)
379     public static class UnsignedMultiply128Source {
380         /** A mask to convert an {@code int} to an unsigned integer stored as a {@code long}. */
381         private static final long INT_TO_UNSIGNED_BYTE_MASK = 0xffffffffL;
382 
383         /**
384          * The method to compute the value.
385          */
386         @Param({BASELINE2, BASELINE4,
387             "unsignedMultiplyHighPlusProducts",
388             "unsignedMultiply128AndLow",
389             "unsignedMultiplyHighPlusProducts (square)",
390             "unsignedSquareHighPlusProducts",
391             "unsignedMultiply128AndLow (square)",
392             "unsignedSquare128AndLow",
393             })
394         private String method;
395 
396         /** Flag to indicate numbers should be precomputed.
397          * Note: The multiply method is extremely fast and number generation can take
398          * a significant part of the overall generation time. */
399         @Param({"true", "false"})
400         private boolean precompute;
401 
402         /** The generator of the next value. */
403         private LongSupplier gen;
404 
405         /**
406          * Compute the next value.
407          *
408          * @return the value
409          */
410         long next() {
411             return gen.getAsLong();
412         }
413 
414         /**
415          * Create the generator of output values.
416          */
417         @Setup
418         public void setup() {
419             final JumpableUniformRandomProvider rng =
420                 (JumpableUniformRandomProvider) RandomSource.XO_RO_SHI_RO_128_PP.create();
421 
422             LongSupplier ga;
423             LongSupplier gb;
424             LongSupplier gc;
425             LongSupplier gd;
426 
427             // Optionally precompute numbers.
428             if (precompute) {
429                 final long[] values = LongStream.generate(rng::nextLong).limit(1024).toArray();
430                 class Gen implements LongSupplier {
431                     private int i;
432                     private final int inc;
433                     Gen(int inc) {
434                         this.inc = inc;
435                     }
436                     @Override
437                     public long getAsLong() {
438                         return values[(i += inc) & 1023];
439                     }
440                 }
441                 // Using any odd increment will sample all values (after wrapping)
442                 ga = new Gen(1);
443                 gb = new Gen(3);
444                 gc = new Gen(5);
445                 gd = new Gen(7);
446             } else {
447                 ga = rng::nextLong;
448                 gb = rng.jump()::nextLong;
449                 gc = rng.jump()::nextLong;
450                 gd = rng.jump()::nextLong;
451             }
452 
453             if (BASELINE2.equals(method)) {
454                 gen = () -> ga.getAsLong() ^ gb.getAsLong();
455             } else if (BASELINE4.equals(method)) {
456                 gen = () -> ga.getAsLong() ^ gb.getAsLong() ^ gc.getAsLong() ^ gd.getAsLong();
457             } else if ("unsignedMultiplyHighPlusProducts".equals(method)) {
458                 gen = () -> {
459                     final long a = ga.getAsLong();
460                     final long b = gb.getAsLong();
461                     final long c = gc.getAsLong();
462                     final long d = gd.getAsLong();
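                        // Truncated 128-bit product of (a:b) * (c:d):
                        // low = b*d, high = high64(b*d) + a*d + b*c.
                        // The a*c term lies entirely above 128 bits and is discarded.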
463                     final long hi = UnsignedMultiplyHighSource.unsignedMultiplyHigh(b, d) +
464                                     a * d + b * c;
465                     final long lo = b * d;
466                     return hi ^ lo;
467                 };
468             } else if ("unsignedMultiply128AndLow".equals(method)) {
469                 // Note:
470                 // Running this benchmark should show this method has no
471                 // benefit over the explicit computation using unsignedMultiplyHigh
472                 // and may actually be slower.
473                 final long[] lo = {0};
474                 gen = () -> {
475                     final long a = ga.getAsLong();
476                     final long b = gb.getAsLong();
477                     final long c = gc.getAsLong();
478                     final long d = gd.getAsLong();
479                     final long hi = unsignedMultiply128(a, b, c, d, lo);
480                     return hi ^ lo[0];
481                 };
482             } else if ("unsignedMultiplyHighPlusProducts (square)".equals(method)) {
483                 gen = () -> {
484                     final long a = ga.getAsLong();
485                     final long b = gb.getAsLong();
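                        // Truncated 128-bit square of (a:b):
                        // low = b*b, high = high64(b*b) + 2*a*b.
                        // The a*a term lies entirely above 128 bits and is discarded.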
486                     final long hi = UnsignedMultiplyHighSource.unsignedMultiplyHigh(b, b) +
487                                     2 * a * b;
488                     final long lo = b * b;
489                     return hi ^ lo;
490                 };
491             } else if ("unsignedSquareHighPlusProducts".equals(method)) {
492                 gen = () -> {
493                     final long a = ga.getAsLong();
494                     final long b = gb.getAsLong();
495                     final long hi = unsignedSquareHigh(b) +
496                                     2 * a * b;
497                     final long lo = b * b;
498                     return hi ^ lo;
499                 };
500             } else if ("unsignedMultiply128AndLow (square)".equals(method)) {
501                 final long[] lo = {0};
502                 gen = () -> {
503                     final long a = ga.getAsLong();
504                     final long b = gb.getAsLong();
505                     final long hi = unsignedMultiply128(a, b, a, b, lo);
506                     return hi ^ lo[0];
507                 };
508             } else if ("unsignedSquare128AndLow".equals(method)) {
509                 // Note:
510                 // Running this benchmark should show this method has no
511                 // benefit over the computation using unsignedMultiply128.
512                 // The dedicated square method saves only 1 shift, 1 mask
513                 // and 1 multiply operation.
514                 final long[] lo = {0};
515                 gen = () -> {
516                     final long a = ga.getAsLong();
517                     final long b = gb.getAsLong();
518                     final long hi = unsignedSquare128(a, b, lo);
519                     return hi ^ lo[0];
520                 };
521             } else {
522                 throw new IllegalStateException("Unknown 128-bit method: " + method);
523             }
524         }
525 
526         /**
527          * Multiply the two values as if unsigned 128-bit longs to produce the 128-bit unsigned result.
528          * Note the result is a truncation of the full 256-bit result.
529          *
530          * <p>This is an extended version of {@link UnsignedMultiplyHighSource#unsignedMultiplyHigh(long, long)}
531          * for use in the 128-bit LCG sub-generator of the LXM family. It computes the equivalent of:
532          * <pre>
533          * long hi = unsignedMultiplyHigh(value1l, value2l) +
534          *           value1h * value2l + value1l * value2h;
535          * long lo = value1l * value2l;
536          * </pre>
537          *
538          * @param value1h the high bits of the first value
539          * @param value1l the low bits of the first value
540          * @param value2h the high bits of the second value
541          * @param value2l the low bits of the second value
542          * @param low the low bits of the 128-bit result (output to array index [0])
543          * @return the high 64-bits of the 128-bit result
544          */
545         static long unsignedMultiply128(long value1h, long value1l, long value2h, long value2l,
546             long[] low) {
547 
548             // Multiply the two low halves to compute an initial 128-bit result
549             final long a = value1l >>> 32;
550             final long b = value1l & INT_TO_UNSIGNED_BYTE_MASK;
551             final long x = value2l >>> 32;
552             final long y = value2l & INT_TO_UNSIGNED_BYTE_MASK;
553 
554             final long by = b * y;
555             final long bx = b * x;
556             final long ay = a * y;
557             final long ax = a * x;
558 
559             // Cannot overflow
560             final long carry = (by >>> 32) +
561                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
562                                 ay;
563             low[0] = (carry << 32) | (by & INT_TO_UNSIGNED_BYTE_MASK);
564             final long high = (bx >>> 32) + (carry >>> 32) + ax;
565 
566             // Incorporate the remaining high bits that do not overflow 128-bits
567             return high + value1h * value2l + value1l * value2h;
568         }
569 
570         /**
571          * Square the value as if an unsigned 64-bit long to produce the high 64-bits of the 128-bit unsigned result.
572          *
573          * <p>This is a specialisation of {@link UnsignedMultiplyHighSource#unsignedMultiplyHigh(long, long)}
574          * for use in the 128-bit LCG sub-generator of the LXM family. It computes the equivalent of:
575          * <pre>
576          * unsignedMultiplyHigh(value, value);
577          * </pre>
578          *
579          * @param value the value
580          * @return the high 64-bits of the 128-bit result
581          */
582         static long unsignedSquareHigh(long value) {
583             final long a = value >>> 32;
584             final long b = value & INT_TO_UNSIGNED_BYTE_MASK;
585 
586             final long by = b * b;
587             final long bx = b * a;
588             final long ax = a * a;
589 
590             // Cannot overflow
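                // Squaring: the second operand has the same halves (x == a, y == b), so the
                // ay term of the general method equals bx, which therefore appears twice.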
591             final long carry = (by >>> 32) +
592                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
593                                 bx;
594 
595             return (bx >>> 32) + (carry >>> 32) + ax;
596         }
597 
598         /**
599          * Square the value as if an unsigned 128-bit long to produce the 128-bit unsigned result.
600          *
601          * <p>This is a specialisation of {@link #unsignedMultiply128(long, long, long, long, long[])}
602          * for use in the 128-bit LCG sub-generator of the LXM family. It computes the equivalent of:
603          * <pre>
604          * long hi = unsignedMultiplyHigh(valuel, valuel) +
605          *           2 * valueh * valuel;
606          * long lo = valuel * valuel;
607          * </pre>
608          *
609          * @param valueh the high bits of the value
610          * @param valuel the low bits of the value
611          * @param low the low bits of the 128-bit result (output to array index [0])
612          * @return the high 64-bits of the 128-bit result
613          */
614         static long unsignedSquare128(long valueh, long valuel, long[] low) {
615 
616             // Multiply the two low halves to compute an initial 128-bit result
617             final long a = valuel >>> 32;
618             final long b = valuel & INT_TO_UNSIGNED_BYTE_MASK;
619             // x = a, y = b
620 
621             final long by = b * b;
622             final long bx = b * a;
623             // ay = bx
624             final long ax = a * a;
625 
626             // Cannot overflow
627             final long carry = (by >>> 32) +
628                                (bx & INT_TO_UNSIGNED_BYTE_MASK) +
629                                 bx;
630             low[0] = (carry << 32) | (by & INT_TO_UNSIGNED_BYTE_MASK);
631             final long high = (bx >>> 32) + (carry >>> 32) + ax;
632 
633             // Incorporate the remaining high bits that do not overflow 128-bits
634             return high + 2 * valueh * valuel;
635         }
636     }
637 
638     /**
639      * Encapsulates a method to compute an update step on a 128-bit linear congruential
640      * generator (LCG). This benchmark source targets optimisation of the 128-bit LCG update
641      * step, in particular the branch to compute carry propagation from the low to high half.
642      */
643     @State(Scope.Benchmark)
644     public static class LCG128Source {
645         /**
646          * The method to compute the value.
647          *
648          * <p>Notes:
649          * <ul>
650          * <li>Final LCG additive parameters make no significant difference.
651          * <li>Inlining the compareUnsigned operation is a marginal improvement.
652          * <li>Using the conditional load of the addition is of benefit when the add parameter
653          *     is close to 2^63. When close to 0 or 2^64 then branch prediction is good and
654          *     this path is fast.
655          * <li>Using the full branchless version creates code with a constant time cost which
656          *     is fast. The branch code is faster when the branch is ~100% predictable, but can
657          *     be much slower.
658          * <li>The branchless version that assumes the add parameter is odd has a constant time cost
659          *     and is fastest. The code is very small, so placing it in a method allows it to be
660          *     inlined and reused.
661          * </ul>
662          */
663         @Param({
664             REFERENCE,
665             //"referenceFinal",
666             //"compareUnsigned",
667             //"conditional",
668             //"conditionalFinal",
669             BRANCHLESS,
670             //"branchlessFull",
671             //"branchlessFullComposed",
672             })
673         private String method;
674 
675         /**
676          * The low half of the additive parameter.
677          * This parameter determines how frequently the carry propagation
678          * must occur. The value is unsigned. When close to 0, generation of a carry bit
679          * during the low-half addition is unlikely; increasingly large additions make
680          * a carry bit more likely. A value of -1 will (almost) always trigger a carry.
681          */
682         @Param({
683             // Note: A small value is obtained from -1 + [0, range). Commented out
684             // to reduce benchmark run-time.
685             //"1",
686 
687             // Uniformly spread in [0, 2^64)
688             // [1 .. 8] * 2^61 - 1
689             "2305843009213693951", "4611686018427387903", "6917529027641081855",
690             "9223372036854775807", "-6917529027641081857", "-4611686018427387905",
691             "-2305843009213693953", "-1",
692             })
693         private long add;
694 
695         /**
696          * Range for a random addition to the additive parameter.
697          * <ul>
698          * <li>{@code range == 0}: no addition
699          * <li>{@code range == Long.MIN_VALUE}: random addition
700          * <li>{@code range > 0}: random in [0, range)
701          * <li>{@code range < 0}: random in [0, -range) + 2^63
702          * </ul>
703          */
704         @Param({
705             // Zero is not useful for a realistic LCG which should have a random
706             // add parameter. It can be used to test a specific add value, e.g. 1L:
707             // java -jar target/examples-jmh.jar LXMBenchmark.lcg128 -p add=1 -p range=0
708             //"0",
709 
710             // This is matched to the interval between successive 'add' parameters.
711             // 2^61
712             "2305843009213693952",
713 
714             // This is useful only when the 'add' parameter is fixed, ideally to zero.
715             // Long.MIN_VALUE
716             //"-9223372036854775808",
717             })
718         private long range;
719 
720         /** The generator of the next value. */
721         private LongSupplier gen;
722 
723         /**
724          * Compute the next value.
725          *
726          * @return the value
727          */
728         long next() {
729             return gen.getAsLong();
730         }
731 
732         /**
733          * Create the generator of output values.
734          */
735         @Setup(Level.Iteration)
736         public void setup() {
737             final long ah = ThreadLocalRandom.current().nextLong();
738             long al = add;
739             // Choose to randomise the add parameter
740             if (range == Long.MIN_VALUE) {
741                 // random addition
742                 al += ThreadLocalRandom.current().nextLong();
743             } else if (range > 0) {
744                 // [0, range)
745                 al += ThreadLocalRandom.current().nextLong(range);
746             } else if (range < 0) {
747                 // [0, -range) + 2^63
748                 al += ThreadLocalRandom.current().nextLong(-range) + Long.MIN_VALUE;
749             }
750             // else range == 0: no addition
751 
752             if (REFERENCE.equals(method)) {
753                 gen = new ReferenceLcg128(ah, al)::getAsLong;
754             } else if ("referenceFinal".equals(method)) {
755                 gen = new ReferenceLcg128Final(ah, al)::getAsLong;
756             } else if ("compareUnsigned".equals(method)) {
757                 gen = new CompareUnsignedLcg128(ah, al)::getAsLong;
758             } else if ("conditional".equals(method)) {
759                 gen = new ConditionalLcg128(ah, al)::getAsLong;
760             } else if ("conditionalFinal".equals(method)) {
761                 gen = new ConditionalLcg128Final(ah, al)::getAsLong;
762             } else if (BRANCHLESS.equals(method)) {
763                 gen = new BranchlessLcg128(ah, al)::getAsLong;
764             } else if ("branchlessFull".equals(method)) {
765                 gen = new BranchlessFullLcg128(ah, al)::getAsLong;
766             } else if ("branchlessFullComposed".equals(method)) {
767                 gen = new BranchlessFullComposedLcg128(ah, al)::getAsLong;
768             } else {
769                 throw new IllegalStateException("Unknown LCG method: " + method);
770             }
771         }
772 
773         /**
774          * Base class for the 128-bit LCG. This holds the LCG state. It is initialised
775          * to 1 on construction. The value does not matter. The LCG additive parameter
776          * is what determines how frequently the carry propagation branch will be executed.
777          */
778         abstract static class BaseLcg128 implements LongSupplier {
779             /** High half of the 128-bit state of the LCG. */
780             protected long lsh;
781             /** Low half of the 128-bit state of the LCG. */
782             protected long lsl = 1;
783         }
784 
785         /**
786          * Implements the 128-bit LCG using the reference code in Steele & Vigna (2021).
787          *
788          * <p>Note: the additive parameter is not final. This is due to the requirement
789          * for the LXM generator to implement
790          * {@link org.apache.commons.rng.RestorableUniformRandomProvider}.
791          */
792         static class ReferenceLcg128 extends BaseLcg128 {
793             /** High half of the 128-bit per-instance LCG additive parameter. */
794             private long lah;
795             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
796             private long lal;
797 
798             /**
799              * @param ah High-half of add
800              * @param al Low-half of add
801              */
802             ReferenceLcg128(long ah, long al) {
803                 lah = ah;
804                 lal = al | 1;
805             }
806 
807             @Override
808             public long getAsLong() {
809                 // LCG update
810                 // The LCG is, in effect, "s = m * s + a" where m = ((1LL << 64) + ML)
811                 final long sh = lsh;
812                 final long sl = lsl;
813                 final long u = ML * sl;
814                 // High half
815                 lsh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
816                 // Low half
817                 lsl = u + lal;
818                 // Carry propagation
819                 if (Long.compareUnsigned(lsl, u) < 0) {
820                     ++lsh;
821                 }
822                 return sh;
823             }
824         }
825 
826         /**
827          * Implements the 128-bit LCG using the reference code in Steele & Vigna (2021).
828          * This uses a final additive parameter allowing the JVM to optimise.
829          */
830         static class ReferenceLcg128Final extends BaseLcg128 {
831             /** High half of the 128-bit per-instance LCG additive parameter. */
832             private final long lah;
833             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
834             private final long lal;
835 
836             /**
837              * @param ah High-half of add
838              * @param al Low-half of add
839              */
840             ReferenceLcg128Final(long ah, long al) {
841                 lah = ah;
842                 lal = al | 1;
843             }
844 
845             @Override
846             public long getAsLong() {
847                 final long sh = lsh;
848                 final long sl = lsl;
849                 final long u = ML * sl;
850                 // High half
851                 lsh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
852                 // Low half
853                 lsl = u + lal;
854                 // Carry propagation
855                 if (Long.compareUnsigned(lsl, u) < 0) {
856                     ++lsh;
857                 }
858                 return sh;
859             }
860         }
861 
862         /**
863          * Implements the 128-bit LCG using the reference code in Steele & Vigna (2021)
864          * but inlines the {@link Long#compareUnsigned(long, long)} comparison.
865          *
866          * <p>Note: the additive parameter is not final. This is due to the requirement
867          * for the LXM generator to implement
868          * {@link org.apache.commons.rng.RestorableUniformRandomProvider}.
869          */
870         static class CompareUnsignedLcg128 extends BaseLcg128 {
871             /** High half of the 128-bit per-instance LCG additive parameter. */
872             private long lah;
873             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
874             private long lal;
875 
876             /**
877              * @param ah High-half of add
878              * @param al Low-half of add
879              */
880             CompareUnsignedLcg128(long ah, long al) {
881                 lah = ah;
882                 lal = al | 1;
883             }
884 
885             @Override
886             public long getAsLong() {
887                 final long sh = lsh;
888                 final long sl = lsl;
889                 final long u = ML * sl;
890                 // High half
891                 lsh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
892                 // Low half
893                 lsl = u + lal;
894                 // Carry propagation using an unsigned compare
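                    // Adding Long.MIN_VALUE flips the sign bit of both operands, making this
                    // signed comparison equivalent to Long.compareUnsigned(lsl, u) < 0.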
895                 if (lsl + Long.MIN_VALUE < u + Long.MIN_VALUE) {
896                     ++lsh;
897                 }
898                 return sh;
899             }
900         }
901 
902         /**
903          * Implements the 128-bit LCG using a conditional load in place of a branch
904          * statement for the carry. Uses a non-final additive parameter.
905          */
906         static class ConditionalLcg128 extends BaseLcg128 {
907             /** High half of the 128-bit per-instance LCG additive parameter. */
908             private long lah;
909             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
910             private long lal;
911 
912             /**
913              * @param ah High-half of add
914              * @param al Low-half of add
915              */
916             ConditionalLcg128(long ah, long al) {
917                 lah = ah;
918                 lal = al | 1;
919             }
920 
921             @Override
922             public long getAsLong() {
923                 long sh = lsh;
924                 long sl = lsl;
925                 final long z = sh;
926                 final long u = ML * sl;
927                 // High half
928                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
929                 // Low half
930                 sl = u + lal;
931                 // Carry propagation using a conditional add of 1 or 0
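                    // The intent (see the class javadoc notes) is a conditional load of 1 or 0
                    // that the JIT may compile to a conditional move rather than a branch.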
932                 lsh = (sl + Long.MIN_VALUE < u + Long.MIN_VALUE ? 1 : 0) + sh;
933                 lsl = sl;
934                 return z;
935             }
936         }
937 
938         /**
939          * Implements the 128-bit LCG using a conditional load in place of a branch
940          * statement for the carry. Uses a final additive parameter.
941          */
942         static class ConditionalLcg128Final extends BaseLcg128 {
943             /** High half of the 128-bit per-instance LCG additive parameter. */
944             private final long lah;
945             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
946             private final long lal;
947 
948             /**
949              * @param ah High-half of add
950              * @param al Low-half of add
951              */
952             ConditionalLcg128Final(long ah, long al) {
953                 lah = ah;
954                 lal = al | 1;
955             }
956 
957             @Override
958             public long getAsLong() {
959                 long sh = lsh;
960                 long sl = lsl;
961                 final long z = sh;
962                 final long u = ML * sl;
963                 // High half
964                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
965                 // Low half
966                 sl = u + lal;
967                 // Carry propagation using a conditional add of 1 or 0
968                 lsh = (sl + Long.MIN_VALUE < u + Long.MIN_VALUE ? 1 : 0) + sh;
969                 lsl = sl;
970                 return z;
971             }
972         }
973 
974         /**
975          * Implements the 128-bit LCG using a branchless carry with a bit cascade.
976          * The low part is computed using simple addition {@code lsl = u + al} rather
977          * than composing the low part using bits from the carry computation.
978          * The carry propagation is put in a method. This tests code reusability for the
979          * carry part of the computation.
980          */
981         static class BranchlessLcg128 extends BaseLcg128 {
982             /** High half of the 128-bit per-instance LCG additive parameter. */
983             private long lah;
984             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
985             private long lal;
986 
987             /**
988              * @param ah High-half of add
989              * @param al Low-half of add
990              */
991             BranchlessLcg128(long ah, long al) {
992                 lah = ah;
993                 lal = al | 1;
994             }
995 
996             @Override
997             public long getAsLong() {
998                 long sh = lsh;
999                 final long sl = lsl;
1000                 final long z = sh;
1001                 final long u = ML * sl;
1002                 // High half
1003                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
1004                 // Low half.
1005                 final long al = lal;
1006                 lsl = u + al;
1007                 // Carry propagation using a branchless bit cascade
1008                 lsh = sh + unsignedAddHigh(u, al);
1009                 return z;
1010             }
1011 
1012             /**
1013              * Add the two values as if unsigned 64-bit longs to produce the high 64-bits
1014              * of the 128-bit unsigned result.
1015              * <pre>
1016              * left + right
1017              * </pre>
1018              * <p>This method is computing a carry bit.
1019              *
1020              * @param left the left argument
1021              * @param right the right argument (assumed to have the lowest bit set to 1)
1022              * @return the carry (either 0 or 1)
1023              */
1024             static long unsignedAddHigh(long left, long right) {
1025                 // Method compiles to 13 bytes as Java byte code.
1026                 // This is below the default of 35 for code inlining.
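                     // Sum the upper 63 bits of each operand plus the carry out of the low bit;
                     // right is odd so that carry is simply (left & 1). Bit 63 of this sum is the
                     // carry out of the full 64-bit addition, and '>>> -1' equals '>>> 63' since
                     // long shift distances are taken mod 64.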
1027                 return ((left >>> 1) + (right >>> 1) + (left & 1)) >>> -1;
1028             }
1029         }
1030 
1031         /**
1032          * Implements the 128-bit LCG using a branchless carry with a bit cascade.
1033          * The low part is computed using simple addition {@code lsl = u + al} rather
1034          * than composing the low part using bits from the carry computation.
1035          * The carry propagation is put in a method. This tests code reusability for the
1036          * carry part of the computation. This carry computation is a full
1037          * unsignedAddHigh which does not assume the LCG addition is odd.
1038          */
1039         static class BranchlessFullLcg128 extends BaseLcg128 {
1040             /** High half of the 128-bit per-instance LCG additive parameter. */
1041             private long lah;
1042             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
1043             private long lal;
1044 
1045             /**
1046              * @param ah High-half of add
1047              * @param al Low-half of add
1048              */
1049             BranchlessFullLcg128(long ah, long al) {
1050                 lah = ah;
1051                 lal = al | 1;
1052             }
1053 
1054             @Override
1055             public long getAsLong() {
1056                 long sh = lsh;
1057                 final long sl = lsl;
1058                 final long z = sh;
1059                 final long u = ML * sl;
1060                 // High half
1061                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
1062                 // Low half.
1063                 final long al = lal;
1064                 lsl = u + al;
1065                 // Carry propagation using a branchless bit cascade
1066                 lsh = sh + unsignedAddHigh(u, al);
1067                 return z;
1068             }
1069 
1070             /**
1071              * Add the two values as if unsigned 64-bit longs to produce the high 64-bits
1072              * of the 128-bit unsigned result.
1073              * <pre>
1074              * left + right
1075              * </pre>
1076              * <p>This method is computing a carry bit.
1077              *
1078              * @param left the left argument
1079              * @param right the right argument
1080              * @return the carry (either 0 or 1)
1081              */
1082             static long unsignedAddHigh(long left, long right) {
1083                 // Method compiles to 27 bytes as Java byte code.
1084                 // This is below the default of 35 for code inlining.
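                     // Split each operand into 32-bit halves: the carry out of the low halves
                     // feeds the sum of the high halves, and bit 32 of that sum is the carry out
                     // of the full 64-bit addition.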
1085                 return ((left >>> 32) + (right >>> 32) +
1086                        (((left & 0xffff_ffffL) + (right & 0xffff_ffffL)) >>> 32)) >>> 32;
1087             }
1088         }
1089 
1090         /**
1091          * Implements the 128-bit LCG using a branchless carry with a bit cascade.
1092          * The low part is composed from bits of the carry computation which requires
1093          * the full computation. It cannot be put into a method as it requires more
1094          * than 1 return variable.
1095          */
1096         static class BranchlessFullComposedLcg128 extends BaseLcg128 {
1097             /** High half of the 128-bit per-instance LCG additive parameter. */
1098             private long lah;
1099             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
1100             private long lal;
1101 
1102             /**
1103              * @param ah High-half of add
1104              * @param al Low-half of add
1105              */
1106             BranchlessFullComposedLcg128(long ah, long al) {
1107                 lah = ah;
1108                 lal = al | 1;
1109             }
1110 
1111             @Override
1112             public long getAsLong() {
1113                 long sh = lsh;
1114                 long sl = lsl;
1115                 final long z = sh;
1116                 final long u = ML * sl;
1117                 // High half
1118                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
1119                 // Low half.
1120                 // Carry propagation using a branchless bit cascade
1121                 final long leftLo = u & 0xffff_ffffL;
1122                 final long leftHi = u >>> 32;
1123                 final long rightLo = lal & 0xffff_ffffL;
1124                 final long rightHi = lal >>> 32;
1125                 final long lo = leftLo + rightLo;
1126                 final long hi = leftHi + rightHi + (lo >>> 32);
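                     // Bits 0-31 of 'hi' are the upper 32 bits of the 64-bit sum (u + lal);
                     // bit 32 of 'hi' is the carry out of the full 64-bit addition.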
1127                 // sl = u + lal;
1128                 sl = (hi << 32) | lo & 0xffff_ffffL;
1129                 lsh = sh + (hi >>> 32);
1130                 lsl = sl;
1131                 return z;
1132             }
1133         }
1134     }
1135 
1136     /**
1137      * Encapsulates a method to compute an update step on an LXM generator with a 128-bit
1138      * Linear Congruential Generator.
1139      */
1140     @State(Scope.Benchmark)
1141     public static class LXM128Source {
1142         /** Initial state0 of the XBG generator. */
1143         static final long X0 = 0xdeadbeefdeadbeefL;
1144         /** Initial state1 of the XBG generator. */
1145         static final long X1 = lea64(0xdeadbeefdeadbeefL);
1146         /** Initial state0 of the LCG generator. */
1147         static final long S0 = 0;
1148         /** Initial state1 of the LCG generator. */
1149         static final long S1 = 1;
1150 
1151         /**
1152          * The method to compute the value.
1153          */
1154         @Param({
1155             REFERENCE,
1156             // Use the best method from the LCG128Source
1157             BRANCHLESS,
1158             })
1159         private String method;
1160 
1161         /** The generator of the next value. */
1162         private LongSupplier gen;
1163 
1164         /**
1165          * Compute the next value.
1166          *
1167          * @return the value
1168          */
1169         long next() {
1170             return gen.getAsLong();
1171         }
1172 
1173         /**
1174          * Create the generator of output values.
1175          */
1176         @Setup(Level.Iteration)
1177         public void setup() {
1178             final long ah = ThreadLocalRandom.current().nextLong();
1179             final long al = ThreadLocalRandom.current().nextLong();
1180 
1181             if (REFERENCE.equals(method)) {
1182                 gen = new ReferenceLxm128(ah, al)::getAsLong;
1183             } else if (BRANCHLESS.equals(method)) {
1184                 gen = new BranchlessLxm128(ah, al)::getAsLong;
1185             } else {
1186                 throw new IllegalStateException("Unknown LXM method: " + method);
1187             }
1188         }
1189 
1190         /**
1191          * Perform a 64-bit mixing function using Doug Lea's 64-bit mix constants and shifts.
1192          *
1193          * <p>This is based on the original 64-bit mix function of Austin Appleby's
1194          * MurmurHash3 modified to use a single mix constant and 32-bit shifts, which may have
1195          * a performance advantage on some processors. The code is provided in Steele and
1196          * Vigna's paper.
1197          *
1198          * @param x the input value
1199          * @return the output value
1200          */
1201         static long lea64(long x) {
1202             x = (x ^ (x >>> 32)) * 0xdaba0b6eb09322e3L;
1203             x = (x ^ (x >>> 32)) * 0xdaba0b6eb09322e3L;
1204             return x ^ (x >>> 32);
1205         }
1206 
1207         /**
1208          * Base class for the LXM generator. This holds the 128-bit LCG state.
1209          *
1210          * <p>The XBG sub-generator is assumed to be XoRoShiRo128 with 128-bits of state.
1211          * This is fast so the speed effect of changing the LCG implementation can be measured.
1212          *
1213         * <p>The initial state of the two generators should not matter so they use constants.
1214         * The LCG additive parameter is what determines how frequently the carry propagation
1215          * branch will be executed.
1216          *
1217          * <p>Note: the additive parameter is not final. This is due to the requirement
1218          * for the LXM generator to implement
1219          * {@link org.apache.commons.rng.RestorableUniformRandomProvider}.
1220          */
1221         abstract static class BaseLxm128 implements LongSupplier {
1222             /** State0 of XBG. */
1223             protected long state0 = X0;
1224             /** State1 of XBG. */
1225             protected long state1 = X1;
1226             /** High half of the 128-bit state of the LCG. */
1227             protected long lsh = S0;
1228             /** Low half of the 128-bit state of the LCG. */
1229             protected long lsl = S1;
1230             /** High half of the 128-bit per-instance LCG additive parameter. */
1231             protected long lah;
1232             /** Low half of the 128-bit per-instance LCG additive parameter (must be odd). */
1233             protected long lal;
1234 
1235             /**
1236              * @param ah High-half of add
1237              * @param al Low-half of add
1238              */
1239             BaseLxm128(long ah, long al) {
1240                 lah = ah;
1241                 lal = al | 1;
1242             }
1243         }
1244 
1245         /**
1246          * Implements the L128X128Mix using the reference code for the LCG in Steele & Vigna (2021).
1247          */
1248         static class ReferenceLxm128 extends BaseLxm128 {
1249             /**
1250              * @param ah High-half of add
1251              * @param al Low-half of add
1252              */
1253             ReferenceLxm128(long ah, long al) {
1254                 super(ah, al);
1255             }
1256 
1257             @Override
1258             public long getAsLong() {
1259                 // LXM generate.
1260                 // Old state is used for the output allowing parallel pipelining
1261                 // on processors that support multiple concurrent instructions.
1262 
1263                 final long s0 = state0;
1264                 final long sh = lsh;
1265 
1266                 // Mix
1267                 final long z = lea64(sh + s0);
1268 
1269                 // LCG update
1270                 // The LCG is, in effect, "s = m * s + a" where m = ((1LL << 64) + ML)
1271                 final long sl = lsl;
1272                 final long u = ML * sl;
1273                 // High half
1274                 lsh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
1275                 // Low half
1276                 lsl = u + lal;
1277                 // Carry propagation
1278                 if (Long.compareUnsigned(lsl, u) < 0) {
1279                     ++lsh;
1280                 }
1281 
1282                 // XBG update
1283                 long s1 = state1;
1284 
1285                 s1 ^= s0;
1286                 state0 = Long.rotateLeft(s0, 24) ^ s1 ^ (s1 << 16); // a, b
1287                 state1 = Long.rotateLeft(s1, 37); // c
1288 
1289                 return z;
1290             }
1291         }
1292 
1293         /**
1294          * Implements the L128X128Mix using a branchless carry with a bit cascade.
1295          * The low part is computed using simple addition {@code lsl = u + al} rather
1296          * than composing the low part using bits from the carry computation.
1297          */
1298         static class BranchlessLxm128 extends BaseLxm128 {
1299             /**
1300              * @param ah High-half of add
1301              * @param al Low-half of add
1302              */
1303             BranchlessLxm128(long ah, long al) {
1304                 super(ah, al);
1305             }
1306 
1307             @Override
1308             public long getAsLong() {
1309                 final long s0 = state0;
1310                 long sh = lsh;
1311 
1312                 // Mix
1313                 final long z = lea64(sh + s0);
1314 
1315                 // LCG update
1316                 // The LCG is, in effect, "s = m * s + a" where m = ((1LL << 64) + ML)
1317                 final long sl = lsl;
1318                 final long u = ML * sl;
1319                 // High half
1320                 sh = ML * sh + UnsignedMultiplyHighSource.unsignedMultiplyHigh(ML, sl) + sl + lah;
1321                 // Low half
1322                 final long al = lal;
1323                 lsl = u + al;
1324                 // Carry propagation using a branchless bit cascade.
1325                 // This can be used in a method that will be inlined.
1326                 lsh = sh + LCG128Source.BranchlessLcg128.unsignedAddHigh(u, al);
1327 
1328                 // XBG update
1329                 long s1 = state1;
1330 
1331                 s1 ^= s0;
1332                 state0 = Long.rotateLeft(s0, 24) ^ s1 ^ (s1 << 16); // a, b
1333                 state1 = Long.rotateLeft(s1, 37); // c
1334 
1335                 return z;
1336             }
1337         }
1338     }
1339 
1340     /**
1341      * Benchmark an unsigned multiply.
1342      *
1343      * @param data the data
1344      * @return the value
1345      */
1346     @Benchmark
1347     public long unsignedMultiply(UnsignedMultiplyHighSource data) {
1348         return data.next();
1349     }
1350 
1351     /**
1352      * Benchmark a 128-bit unsigned multiply.
1353      *
1354      * @param data the data
1355      * @return the value
1356      */
1357     @Benchmark
1358     public long unsignedMultiply128(UnsignedMultiply128Source data) {
1359         return data.next();
1360     }
1361 
1362     /**
1363      * Benchmark a 128-bit linear congruential generator.
1364      *
1365      * @param data the data
1366      * @return the value
1367      */
1368     @Benchmark
1369     public long lcg128(LCG128Source data) {
1370         return data.next();
1371     }
1372 
1373     /**
1374      * Benchmark an LXM generator with a 128-bit linear congruential generator.
1375      *
1376      * @param data the data
1377      * @return the value
1378      */
1379     @Benchmark
1380     public long lxm128(LXM128Source data) {
1381         return data.next();
1382     }
1383 }