View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.numbers.core;
18  
19  /**
20   * Computes extended precision floating-point operations.
21   *
22   * <p>It is based on the 1971 paper
23   * <a href="https://doi.org/10.1007/BF01397083">
24   * Dekker (1971) A floating-point technique for extending the available precision</a>.
25   */
26  final class ExtendedPrecision {
27      /*
28       * Caveat:
29       *
30       * The code below uses many additions/subtractions that may
31       * appear redundant. However, they should NOT be simplified, as they
32       * do use IEEE754 floating point arithmetic rounding properties.
33       *
34       * Algorithms are based on computing the product or sum of two values x and y in
35       * extended precision. The standard result is stored using a double (high part z) and
36       * the round-off error (or low part zz) is stored in a second double, e.g:
37       * x * y = (z, zz); z + zz = x * y
38       * x + y = (z, zz); z + zz = x + y
39       *
40       * To sum multiple (z, zz) results ideally the parts are sorted in order of
41       * non-decreasing magnitude and summed. This is exact if each number's most significant
42       * bit is below the least significant bit of the next (i.e. does not
43       * overlap). Creating non-overlapping parts requires a rebalancing
44       * of adjacent pairs using a summation z + zz = (z1, zz1) iteratively through the parts
45       * (see Shewchuk (1997) Grow-Expansion and Expansion-Sum [1]).
46       *
47       * [1] Shewchuk (1997): Arbitrary Precision Floating-Point Arithmetic
48       * http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps
49       */
50  
51      /**
52       * The multiplier used to split the double value into high and low parts. From
53       * Dekker (1971): "The constant should be chosen equal to 2^(p - p/2) + 1,
54       * where p is the number of binary digits in the mantissa". Here p is 53
55       * and the multiplier is {@code 2^27 + 1}.
56       */
57      private static final double MULTIPLIER = 1.0 + 0x1.0p27;
58  
59      /**
60       * The upper limit above which a number may overflow during the split into a high part.
61       * Assuming the multiplier is above 2^27 and the maximum exponent is 1023 then a safe
62       * limit is a value with an exponent of (1023 - 27) = 2^996.
63       * 996 is the value obtained from {@code Math.getExponent(Double.MAX_VALUE / MULTIPLIER)}.
64       */
65      private static final double SAFE_UPPER = 0x1.0p996;
66  
67      /** The scale to use when down-scaling during a split into a high part.
68       * This must be smaller than the inverse of the multiplier and a power of 2 for exact scaling. */
69      private static final double DOWN_SCALE = 0x1.0p-30;
70  
71      /** The scale to use when re-scaling during a split into a high part.
72       * This is the inverse of {@link #DOWN_SCALE}. */
73      private static final double UP_SCALE = 0x1.0p30;
74  
75      /** The mask to extract the raw 11-bit exponent.
76       * The value must be shifted 52-bits to remove the mantissa bits. */
77      private static final int EXP_MASK = 0x7ff;
78  
79      /** The value 2046 converted for use if using {@link Integer#compareUnsigned(int, int)}.
80       * This requires adding {@link Integer#MIN_VALUE} to 2046. */
81      private static final int CMP_UNSIGNED_2046 = Integer.MIN_VALUE + 2046;
82  
83      /** The value -1 converted for use if using {@link Integer#compareUnsigned(int, int)}.
84       * This requires adding {@link Integer#MIN_VALUE} to -1. */
85      private static final int CMP_UNSIGNED_MINUS_1 = Integer.MIN_VALUE - 1;
86  
87      /** Private constructor. */
88      private ExtendedPrecision() {
89          // intentionally empty.
90      }
91  
92      /**
93       * Compute the low part of the double length number {@code (z,zz)} for the exact
94       * product of {@code x} and {@code y}. This is equivalent to computing a {@code double}
95       * containing the magnitude of the rounding error when converting the exact 106-bit
96       * significand of the multiplication result to a 53-bit significand.
97       *
98       * <p>The method is written to be functionally similar to using a fused multiply add (FMA)
99       * operation to compute the low part, for example JDK 9's Math.fma function (note the sign
100      * change in the input argument for the product):
101      * <pre>
102      *  double x = ...;
103      *  double y = ...;
104      *  double xy = x * y;
105      *  double low1 = Math.fma(x, y, -xy);
106      *  double low2 = productLow(x, y, xy);
107      * </pre>
108      *
109      * <p>Special cases:
110      *
111      * <ul>
112      *  <li>If {@code x * y} is sub-normal or zero then the result is 0.0.
113      *  <li>If {@code x * y} is infinite or NaN then the result is NaN.
114      * </ul>
115      *
116      * @param x First factor.
117      * @param y Second factor.
118      * @param xy Product of the factors (x * y).
119      * @return the low part of the product double length number
120      * @see #highPartUnscaled(double)
121      */
122     static double productLow(double x, double y, double xy) {
123         // Verify the input. This must be NaN safe.
124         //assert Double.compare(x * y, xy) == 0
125 
126         // If the number is sub-normal, inf or nan there is no round-off.
127         if (isNotNormal(xy)) {
128             // Returns 0.0 for sub-normal xy, otherwise NaN for inf/nan:
129             return xy - xy;
130         }
131 
132         // The result xy is finite and normal.
133         // Use Dekker's mul12 algorithm that splits the values into high and low parts.
134         // Dekker's split using multiplication will overflow if the value is within 2^27
135         // of double max value. It can also produce 26-bit approximations that are larger
136         // than the input numbers for the high part causing overflow in hx * hy when
137         // x * y does not overflow. So we must scale down big numbers.
138         // We only have to scale the largest number as we know the product does not overflow
139         // (if one is too big then the other cannot be).
140         // We also scale if the product is close to overflow to avoid intermediate overflow.
141         // This could be done at a higher limit (e.g. Math.abs(xy) > Double.MAX_VALUE / 4)
142         // but is included here to have a single low probability branch condition.
143 
144         // Add the absolute inputs for a single comparison. The sum will not be more than
145         // 3-fold higher than any component.
146         final double a = Math.abs(x);
147         final double b = Math.abs(y);
148         if (a + b + Math.abs(xy) >= SAFE_UPPER) {
149             // Only required to scale the largest number as x*y does not overflow.
150             if (a > b) {
151                 return productLowUnscaled(x * DOWN_SCALE, y, xy * DOWN_SCALE) * UP_SCALE;
152             }
153             return productLowUnscaled(x, y * DOWN_SCALE, xy * DOWN_SCALE) * UP_SCALE;
154         }
155 
156         // No scaling required
157         return productLowUnscaled(x, y, xy);
158     }
159 
160     /**
161      * Checks if the number is not normal. This is functionally equivalent to:
162      * <pre>
163      * final double abs = Math.abs(a);
164      * return (abs <= Double.MIN_NORMAL || !(abs <= Double.MAX_VALUE));
165      * </pre>
166      *
167      * @param a The value.
168      * @return true if the value is not normal
169      */
170     static boolean isNotNormal(double a) {
171         // Sub-normal numbers have a biased exponent of 0.
172         // Inf/NaN numbers have a biased exponent of 2047.
173         // Catch both cases by extracting the raw exponent, subtracting 1
174         // and compare unsigned (so 0 underflows to a unsigned large value).
175         final int baisedExponent = ((int) (Double.doubleToRawLongBits(a) >>> 52)) & EXP_MASK;
176         // Pre-compute the additions used by Integer.compareUnsigned
177         return baisedExponent + CMP_UNSIGNED_MINUS_1 >= CMP_UNSIGNED_2046;
178     }
179 
180     /**
181      * Compute the low part of the double length number {@code (z,zz)} for the exact
182      * product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard
183      * precision product {@code x*y} must be provided. The numbers {@code x} and {@code y}
184      * are split into high and low parts using Dekker's algorithm.
185      *
186      * <p>Warning: This method does not perform scaling in Dekker's split and large
187      * finite numbers can create NaN results.
188      *
189      * @param x First factor.
190      * @param y Second factor.
191      * @param xy Product of the factors (x * y).
192      * @return the low part of the product double length number
193      * @see #highPartUnscaled(double)
194      * @see #productLow(double, double, double, double, double)
195      */
196     private static double productLowUnscaled(double x, double y, double xy) {
197         // Split the numbers using Dekker's algorithm without scaling
198         final double hx = highPartUnscaled(x);
199         final double lx = x - hx;
200 
201         final double hy = highPartUnscaled(y);
202         final double ly = y - hy;
203 
204         return productLow(hx, lx, hy, ly, xy);
205     }
206 
207     /**
208      * Compute the low part of the double length number {@code (z,zz)} for the exact
209      * square of {@code x} using Dekker's mult12 algorithm. The standard precision product
210      * {@code x*x} must be provided. The number {@code x} is split into high and low parts
211      * using Dekker's algorithm.
212      *
213      * <p>Warning: This method does not perform scaling in Dekker's split and large
214      * finite numbers can create NaN results.
215      *
216      * @param x Number to square
217      * @param xx Standard precision product {@code x*x}
218      * @return the low part of the square double length number
219      */
220     static double squareLowUnscaled(double x, double xx) {
221         // Split the numbers using Dekker's algorithm without scaling
222         final double hx = highPartUnscaled(x);
223         final double lx = x - hx;
224 
225         return productLow(hx, lx, hx, lx, xx);
226     }
227 
228     /**
229      * Compute the low part of the double length number {@code (z,zz)} for the exact
230      * product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard
231      * precision product {@code x*y} must be provided. The numbers {@code x} and {@code y}
232      * should already be split into low and high parts.
233      *
234      * <p>Note: This uses the high part of the result {@code (z,zz)} as {@code x * y} and not
235      * {@code hx * hy + hx * ty + tx * hy} as specified in Dekker's original paper.
236      * See Shewchuk (1997) for working examples.
237      *
238      * @param hx High part of first factor.
239      * @param lx Low part of first factor.
240      * @param hy High part of second factor.
241      * @param ly Low part of second factor.
242      * @param xy Product of the factors.
243      * @return <code>lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly)</code>
244      * @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
245      * Shewchuk (1997) Theorum 18</a>
246      */
247     private static double productLow(double hx, double lx, double hy, double ly, double xy) {
248         // Compute the multiply low part:
249         // err1 = xy - hx * hy
250         // err2 = err1 - lx * hy
251         // err3 = err2 - hx * ly
252         // low = lx * ly - err3
253         return lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly);
254     }
255 
256     /**
257      * Implement Dekker's method to split a value into two parts. Multiplying by (2^s + 1) creates
258      * a big value from which to derive the two split parts.
259      * <pre>
260      * c = (2^s + 1) * a
261      * a_big = c - a
262      * a_hi = c - a_big
263      * a_lo = a - a_hi
264      * a = a_hi + a_lo
265      * </pre>
266      *
267      * <p>The multiplicand allows a p-bit value to be split into
268      * (p-s)-bit value {@code a_hi} and a non-overlapping (s-1)-bit value {@code a_lo}.
269      * Combined they have (p-1) bits of significand but the sign bit of {@code a_lo}
270      * contains a bit of information. The constant is chosen so that s is ceil(p/2) where
271      * the precision p for a double is 53-bits (1-bit of the mantissa is assumed to be
272      * 1 for a non sub-normal number) and s is 27.
273      *
274      * <p>This conversion does not use scaling and the result of overflow is NaN. Overflow
275      * may occur when the exponent of the input value is above 996.
276      *
277      * <p>Splitting a NaN or infinite value will return NaN.
278      *
279      * @param value Value.
280      * @return the high part of the value.
281      * @see Math#getExponent(double)
282      */
283     static double highPartUnscaled(double value) {
284         final double c = MULTIPLIER * value;
285         return c - (c - value);
286     }
287 
288     /**
289      * Compute the round-off from the sum of two numbers {@code a} and {@code b} using
290      * Knuth's two-sum algorithm. The values are not required to be ordered by magnitude.
291      * The standard precision sum must be provided.
292      *
293      * @param a First part of sum.
294      * @param b Second part of sum.
295      * @param sum Sum of the parts (a + b).
296      * @return <code>(b - (sum - (sum - b))) + (a - (sum - b))</code>
297      * @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
298      * Shewchuk (1997) Theorum 7</a>
299      */
300     static double twoSumLow(double a, double b, double sum) {
301         final double bVirtual = sum - a;
302         // sum - bVirtual == aVirtual.
303         // a - aVirtual == a round-off
304         // b - bVirtual == b round-off
305         return (a - (sum - bVirtual)) + (b - bVirtual);
306     }
307 }