1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.numbers.core; 18 19 /** 20 * Computes extended precision floating-point operations. 21 * 22 * <p>It is based on the 1971 paper 23 * <a href="https://doi.org/10.1007/BF01397083"> 24 * Dekker (1971) A floating-point technique for extending the available precision</a>. 25 */ 26 final class ExtendedPrecision { 27 /* 28 * Caveat: 29 * 30 * The code below uses many additions/subtractions that may 31 * appear redundant. However, they should NOT be simplified, as they 32 * do use IEEE754 floating point arithmetic rounding properties. 33 * 34 * Algorithms are based on computing the product or sum of two values x and y in 35 * extended precision. The standard result is stored using a double (high part z) and 36 * the round-off error (or low part zz) is stored in a second double, e.g: 37 * x * y = (z, zz); z + zz = x * y 38 * x + y = (z, zz); z + zz = x + y 39 * 40 * To sum multiple (z, zz) results ideally the parts are sorted in order of 41 * non-decreasing magnitude and summed. This is exact if each number's most significant 42 * bit is below the least significant bit of the next (i.e. does not 43 * overlap). Creating non-overlapping parts requires a rebalancing 44 * of adjacent pairs using a summation z + zz = (z1, zz1) iteratively through the parts 45 * (see Shewchuk (1997) Grow-Expansion and Expansion-Sum [1]). 46 * 47 * [1] Shewchuk (1997): Arbitrary Precision Floating-Point Arithmetic 48 * http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps 49 */ 50 51 /** 52 * The multiplier used to split the double value into high and low parts. From 53 * Dekker (1971): "The constant should be chosen equal to 2^(p - p/2) + 1, 54 * where p is the number of binary digits in the mantissa". Here p is 53 55 * and the multiplier is {@code 2^27 + 1}. 56 */ 57 private static final double MULTIPLIER = 1.0 + 0x1.0p27; 58 59 /** 60 * The upper limit above which a number may overflow during the split into a high part. 61 * Assuming the multiplier is above 2^27 and the maximum exponent is 1023 then a safe 62 * limit is a value with an exponent of (1023 - 27) = 2^996. 63 * 996 is the value obtained from {@code Math.getExponent(Double.MAX_VALUE / MULTIPLIER)}. 64 */ 65 private static final double SAFE_UPPER = 0x1.0p996; 66 67 /** The scale to use when down-scaling during a split into a high part. 68 * This must be smaller than the inverse of the multiplier and a power of 2 for exact scaling. */ 69 private static final double DOWN_SCALE = 0x1.0p-30; 70 71 /** The scale to use when re-scaling during a split into a high part. 72 * This is the inverse of {@link #DOWN_SCALE}. */ 73 private static final double UP_SCALE = 0x1.0p30; 74 75 /** The mask to extract the raw 11-bit exponent. 76 * The value must be shifted 52-bits to remove the mantissa bits. */ 77 private static final int EXP_MASK = 0x7ff; 78 79 /** The value 2046 converted for use if using {@link Integer#compareUnsigned(int, int)}. 80 * This requires adding {@link Integer#MIN_VALUE} to 2046. */ 81 private static final int CMP_UNSIGNED_2046 = Integer.MIN_VALUE + 2046; 82 83 /** The value -1 converted for use if using {@link Integer#compareUnsigned(int, int)}. 84 * This requires adding {@link Integer#MIN_VALUE} to -1. */ 85 private static final int CMP_UNSIGNED_MINUS_1 = Integer.MIN_VALUE - 1; 86 87 /** Private constructor. */ 88 private ExtendedPrecision() { 89 // intentionally empty. 90 } 91 92 /** 93 * Compute the low part of the double length number {@code (z,zz)} for the exact 94 * product of {@code x} and {@code y}. This is equivalent to computing a {@code double} 95 * containing the magnitude of the rounding error when converting the exact 106-bit 96 * significand of the multiplication result to a 53-bit significand. 97 * 98 * <p>The method is written to be functionally similar to using a fused multiply add (FMA) 99 * operation to compute the low part, for example JDK 9's Math.fma function (note the sign 100 * change in the input argument for the product): 101 * <pre> 102 * double x = ...; 103 * double y = ...; 104 * double xy = x * y; 105 * double low1 = Math.fma(x, y, -xy); 106 * double low2 = productLow(x, y, xy); 107 * </pre> 108 * 109 * <p>Special cases: 110 * 111 * <ul> 112 * <li>If {@code x * y} is sub-normal or zero then the result is 0.0. 113 * <li>If {@code x * y} is infinite or NaN then the result is NaN. 114 * </ul> 115 * 116 * @param x First factor. 117 * @param y Second factor. 118 * @param xy Product of the factors (x * y). 119 * @return the low part of the product double length number 120 * @see #highPartUnscaled(double) 121 */ 122 static double productLow(double x, double y, double xy) { 123 // Verify the input. This must be NaN safe. 124 //assert Double.compare(x * y, xy) == 0 125 126 // If the number is sub-normal, inf or nan there is no round-off. 127 if (isNotNormal(xy)) { 128 // Returns 0.0 for sub-normal xy, otherwise NaN for inf/nan: 129 return xy - xy; 130 } 131 132 // The result xy is finite and normal. 133 // Use Dekker's mul12 algorithm that splits the values into high and low parts. 134 // Dekker's split using multiplication will overflow if the value is within 2^27 135 // of double max value. It can also produce 26-bit approximations that are larger 136 // than the input numbers for the high part causing overflow in hx * hy when 137 // x * y does not overflow. So we must scale down big numbers. 138 // We only have to scale the largest number as we know the product does not overflow 139 // (if one is too big then the other cannot be). 140 // We also scale if the product is close to overflow to avoid intermediate overflow. 141 // This could be done at a higher limit (e.g. Math.abs(xy) > Double.MAX_VALUE / 4) 142 // but is included here to have a single low probability branch condition. 143 144 // Add the absolute inputs for a single comparison. The sum will not be more than 145 // 3-fold higher than any component. 146 final double a = Math.abs(x); 147 final double b = Math.abs(y); 148 if (a + b + Math.abs(xy) >= SAFE_UPPER) { 149 // Only required to scale the largest number as x*y does not overflow. 150 if (a > b) { 151 return productLowUnscaled(x * DOWN_SCALE, y, xy * DOWN_SCALE) * UP_SCALE; 152 } 153 return productLowUnscaled(x, y * DOWN_SCALE, xy * DOWN_SCALE) * UP_SCALE; 154 } 155 156 // No scaling required 157 return productLowUnscaled(x, y, xy); 158 } 159 160 /** 161 * Checks if the number is not normal. This is functionally equivalent to: 162 * <pre> 163 * final double abs = Math.abs(a); 164 * return (abs <= Double.MIN_NORMAL || !(abs <= Double.MAX_VALUE)); 165 * </pre> 166 * 167 * @param a The value. 168 * @return true if the value is not normal 169 */ 170 static boolean isNotNormal(double a) { 171 // Sub-normal numbers have a biased exponent of 0. 172 // Inf/NaN numbers have a biased exponent of 2047. 173 // Catch both cases by extracting the raw exponent, subtracting 1 174 // and compare unsigned (so 0 underflows to a unsigned large value). 175 final int baisedExponent = ((int) (Double.doubleToRawLongBits(a) >>> 52)) & EXP_MASK; 176 // Pre-compute the additions used by Integer.compareUnsigned 177 return baisedExponent + CMP_UNSIGNED_MINUS_1 >= CMP_UNSIGNED_2046; 178 } 179 180 /** 181 * Compute the low part of the double length number {@code (z,zz)} for the exact 182 * product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard 183 * precision product {@code x*y} must be provided. The numbers {@code x} and {@code y} 184 * are split into high and low parts using Dekker's algorithm. 185 * 186 * <p>Warning: This method does not perform scaling in Dekker's split and large 187 * finite numbers can create NaN results. 188 * 189 * @param x First factor. 190 * @param y Second factor. 191 * @param xy Product of the factors (x * y). 192 * @return the low part of the product double length number 193 * @see #highPartUnscaled(double) 194 * @see #productLow(double, double, double, double, double) 195 */ 196 private static double productLowUnscaled(double x, double y, double xy) { 197 // Split the numbers using Dekker's algorithm without scaling 198 final double hx = highPartUnscaled(x); 199 final double lx = x - hx; 200 201 final double hy = highPartUnscaled(y); 202 final double ly = y - hy; 203 204 return productLow(hx, lx, hy, ly, xy); 205 } 206 207 /** 208 * Compute the low part of the double length number {@code (z,zz)} for the exact 209 * square of {@code x} using Dekker's mult12 algorithm. The standard precision product 210 * {@code x*x} must be provided. The number {@code x} is split into high and low parts 211 * using Dekker's algorithm. 212 * 213 * <p>Warning: This method does not perform scaling in Dekker's split and large 214 * finite numbers can create NaN results. 215 * 216 * @param x Number to square 217 * @param xx Standard precision product {@code x*x} 218 * @return the low part of the square double length number 219 */ 220 static double squareLowUnscaled(double x, double xx) { 221 // Split the numbers using Dekker's algorithm without scaling 222 final double hx = highPartUnscaled(x); 223 final double lx = x - hx; 224 225 return productLow(hx, lx, hx, lx, xx); 226 } 227 228 /** 229 * Compute the low part of the double length number {@code (z,zz)} for the exact 230 * product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard 231 * precision product {@code x*y} must be provided. The numbers {@code x} and {@code y} 232 * should already be split into low and high parts. 233 * 234 * <p>Note: This uses the high part of the result {@code (z,zz)} as {@code x * y} and not 235 * {@code hx * hy + hx * ty + tx * hy} as specified in Dekker's original paper. 236 * See Shewchuk (1997) for working examples. 237 * 238 * @param hx High part of first factor. 239 * @param lx Low part of first factor. 240 * @param hy High part of second factor. 241 * @param ly Low part of second factor. 242 * @param xy Product of the factors. 243 * @return <code>lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly)</code> 244 * @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps"> 245 * Shewchuk (1997) Theorum 18</a> 246 */ 247 private static double productLow(double hx, double lx, double hy, double ly, double xy) { 248 // Compute the multiply low part: 249 // err1 = xy - hx * hy 250 // err2 = err1 - lx * hy 251 // err3 = err2 - hx * ly 252 // low = lx * ly - err3 253 return lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly); 254 } 255 256 /** 257 * Implement Dekker's method to split a value into two parts. Multiplying by (2^s + 1) creates 258 * a big value from which to derive the two split parts. 259 * <pre> 260 * c = (2^s + 1) * a 261 * a_big = c - a 262 * a_hi = c - a_big 263 * a_lo = a - a_hi 264 * a = a_hi + a_lo 265 * </pre> 266 * 267 * <p>The multiplicand allows a p-bit value to be split into 268 * (p-s)-bit value {@code a_hi} and a non-overlapping (s-1)-bit value {@code a_lo}. 269 * Combined they have (p-1) bits of significand but the sign bit of {@code a_lo} 270 * contains a bit of information. The constant is chosen so that s is ceil(p/2) where 271 * the precision p for a double is 53-bits (1-bit of the mantissa is assumed to be 272 * 1 for a non sub-normal number) and s is 27. 273 * 274 * <p>This conversion does not use scaling and the result of overflow is NaN. Overflow 275 * may occur when the exponent of the input value is above 996. 276 * 277 * <p>Splitting a NaN or infinite value will return NaN. 278 * 279 * @param value Value. 280 * @return the high part of the value. 281 * @see Math#getExponent(double) 282 */ 283 static double highPartUnscaled(double value) { 284 final double c = MULTIPLIER * value; 285 return c - (c - value); 286 } 287 288 /** 289 * Compute the round-off from the sum of two numbers {@code a} and {@code b} using 290 * Knuth's two-sum algorithm. The values are not required to be ordered by magnitude. 291 * The standard precision sum must be provided. 292 * 293 * @param a First part of sum. 294 * @param b Second part of sum. 295 * @param sum Sum of the parts (a + b). 296 * @return <code>(b - (sum - (sum - b))) + (a - (sum - b))</code> 297 * @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps"> 298 * Shewchuk (1997) Theorum 7</a> 299 */ 300 static double twoSumLow(double a, double b, double sum) { 301 final double bVirtual = sum - a; 302 // sum - bVirtual == aVirtual. 303 // a - aVirtual == a round-off 304 // b - bVirtual == b round-off 305 return (a - (sum - bVirtual)) + (b - bVirtual); 306 } 307 }