MurmurHash2.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.digest;

  18. import org.apache.commons.codec.binary.StringUtils;

  19. /**
  20.  * Implements the MurmurHash2 32-bit and 64-bit hash functions.
  21.  *
  22.  * <p>MurmurHash is a non-cryptographic hash function suitable for general
  23.  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  24.  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  25.  * it is not specifically designed to be difficult to reverse by an adversary,
  26.  * making it unsuitable for cryptographic purposes.</p>
  27.  *
  28.  * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
  29.  * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
  30.  * original {@code c++} code in SMHasher.</p>
  31.  *
  32.  * <p>This is a re-implementation of the original C code plus some additional
  33.  * features.</p>
  34.  *
  35.  * <p>This is public domain code with no copyrights. From home page of
  36.  * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
  37.  *
  38.  * <blockquote>
  39.  * "All MurmurHash versions are public domain software, and the author
  40.  * disclaims all copyright to their code."
  41.  * </blockquote>
  42.  *
  43.  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
  44.  * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
  45.  *   Original MurmurHash2 c++ code</a>
  46.  * @since 1.13
  47.  */
  48. public final class MurmurHash2 {

  49.     // Constants for 32-bit variant
  50.     private static final int M32 = 0x5bd1e995;
  51.     private static final int R32 = 24;

  52.     // Constants for 64-bit variant
  53.     private static final long M64 = 0xc6a4a7935bd1e995L;
  54.     private static final int R64 = 47;

  55.     /**
  56.      * Gets the little-endian int from 4 bytes starting at the specified index.
  57.      *
  58.      * @param data The data
  59.      * @param index The index
  60.      * @return The little-endian int
  61.      */
  62.     private static int getLittleEndianInt(final byte[] data, final int index) {
  63.         return data[index    ] & 0xff |
  64.                (data[index + 1] & 0xff) <<  8 |
  65.                (data[index + 2] & 0xff) << 16 |
  66.                (data[index + 3] & 0xff) << 24;
  67.     }

  68.     /**
  69.      * Gets the little-endian long from 8 bytes starting at the specified index.
  70.      *
  71.      * @param data The data
  72.      * @param index The index
  73.      * @return The little-endian long
  74.      */
  75.     private static long getLittleEndianLong(final byte[] data, final int index) {
  76.         return (long) data[index    ] & 0xff |
  77.                ((long) data[index + 1] & 0xff) <<  8 |
  78.                ((long) data[index + 2] & 0xff) << 16 |
  79.                ((long) data[index + 3] & 0xff) << 24 |
  80.                ((long) data[index + 4] & 0xff) << 32 |
  81.                ((long) data[index + 5] & 0xff) << 40 |
  82.                ((long) data[index + 6] & 0xff) << 48 |
  83.                ((long) data[index + 7] & 0xff) << 56;
  84.     }

  85.     /**
  86.      * Generates a 32-bit hash from byte array with the given length and a default seed value.
  87.      * This is a helper method that will produce the same result as:
  88.      *
  89.      * <pre>
  90.      * int seed = 0x9747b28c;
  91.      * int hash = MurmurHash2.hash32(data, length, seed);
  92.      * </pre>
  93.      *
  94.      * @param data The input byte array
  95.      * @param length The length of the array
  96.      * @return The 32-bit hash
  97.      * @see #hash32(byte[], int, int)
  98.      */
  99.     public static int hash32(final byte[] data, final int length) {
  100.         return hash32(data, length, 0x9747b28c);
  101.     }

  102.     /**
  103.      * Generates a 32-bit hash from byte array with the given length and seed.
  104.      *
  105.      * @param data The input byte array
  106.      * @param length The length of the array
  107.      * @param seed The initial seed value
  108.      * @return The 32-bit hash
  109.      */
  110.     public static int hash32(final byte[] data, final int length, final int seed) {
  111.         // Initialize the hash to a random value
  112.         int h = seed ^ length;

  113.         // Mix 4 bytes at a time into the hash
  114.         final int nblocks = length >> 2;

  115.         // body
  116.         for (int i = 0; i < nblocks; i++) {
  117.             final int index = i << 2;
  118.             int k = getLittleEndianInt(data, index);
  119.             k *= M32;
  120.             k ^= k >>> R32;
  121.             k *= M32;
  122.             h *= M32;
  123.             h ^= k;
  124.         }

  125.         // Handle the last few bytes of the input array
  126.         final int index = nblocks << 2;
  127.         switch (length - index) {
  128.         case 3:
  129.             h ^= (data[index + 2] & 0xff) << 16;
  130.         case 2:
  131.             h ^= (data[index + 1] & 0xff) << 8;
  132.         case 1:
  133.             h ^= data[index] & 0xff;
  134.             h *= M32;
  135.         }

  136.         // Do a few final mixes of the hash to ensure the last few
  137.         // bytes are well-incorporated.
  138.         h ^= h >>> 13;
  139.         h *= M32;
  140.         h ^= h >>> 15;

  141.         return h;
  142.     }

  143.     /**
  144.      * Generates a 32-bit hash from a string with a default seed.
  145.      * <p>
  146.      * Before 1.14 the string was converted using default encoding.
  147.      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
  148.      * </p>
  149.      * This is a helper method that will produce the same result as:
  150.      *
  151.      * <pre>
  152.      * int seed = 0x9747b28c;
  153.      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
  154.      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
  155.      * </pre>
  156.      *
  157.      * @param text The input string
  158.      * @return The 32-bit hash
  159.      * @see #hash32(byte[], int, int)
  160.      */
  161.     public static int hash32(final String text) {
  162.         final byte[] bytes = StringUtils.getBytesUtf8(text);
  163.         return hash32(bytes, bytes.length);
  164.     }

  165.     /**
  166.      * Generates a 32-bit hash from a substring with a default seed value.
  167.      * The string is converted to bytes using the default encoding.
  168.      * This is a helper method that will produce the same result as:
  169.      *
  170.      * <pre>
  171.      * int seed = 0x9747b28c;
  172.      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
  173.      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
  174.      * </pre>
  175.      *
  176.      * @param text The input string
  177.      * @param from The starting index
  178.      * @param length The length of the substring
  179.      * @return The 32-bit hash
  180.      * @see #hash32(byte[], int, int)
  181.      */
  182.     public static int hash32(final String text, final int from, final int length) {
  183.         return hash32(text.substring(from, from + length));
  184.     }

  185.     /**
  186.      * Generates a 64-bit hash from byte array with given length and a default seed value.
  187.      * This is a helper method that will produce the same result as:
  188.      *
  189.      * <pre>
  190.      * int seed = 0xe17a1465;
  191.      * int hash = MurmurHash2.hash64(data, length, seed);
  192.      * </pre>
  193.      *
  194.      * @param data The input byte array
  195.      * @param length The length of the array
  196.      * @return The 64-bit hash
  197.      * @see #hash64(byte[], int, int)
  198.      */
  199.     public static long hash64(final byte[] data, final int length) {
  200.         return hash64(data, length, 0xe17a1465);
  201.     }

  202.     /**
  203.      * Generates a 64-bit hash from byte array of the given length and seed.
  204.      *
  205.      * @param data The input byte array
  206.      * @param length The length of the array
  207.      * @param seed The initial seed value
  208.      * @return The 64-bit hash of the given array
  209.      */
  210.     public static long hash64(final byte[] data, final int length, final int seed) {
  211.         long h = seed & 0xffffffffL ^ length * M64;

  212.         final int nblocks = length >> 3;

  213.         // body
  214.         for (int i = 0; i < nblocks; i++) {
  215.             final int index = i << 3;
  216.             long k = getLittleEndianLong(data, index);

  217.             k *= M64;
  218.             k ^= k >>> R64;
  219.             k *= M64;

  220.             h ^= k;
  221.             h *= M64;
  222.         }

  223.         final int index = nblocks << 3;
  224.         switch (length - index) {
  225.         case 7:
  226.             h ^= ((long) data[index + 6] & 0xff) << 48;
  227.         case 6:
  228.             h ^= ((long) data[index + 5] & 0xff) << 40;
  229.         case 5:
  230.             h ^= ((long) data[index + 4] & 0xff) << 32;
  231.         case 4:
  232.             h ^= ((long) data[index + 3] & 0xff) << 24;
  233.         case 3:
  234.             h ^= ((long) data[index + 2] & 0xff) << 16;
  235.         case 2:
  236.             h ^= ((long) data[index + 1] & 0xff) << 8;
  237.         case 1:
  238.             h ^= (long) data[index] & 0xff;
  239.             h *= M64;
  240.         }

  241.         h ^= h >>> R64;
  242.         h *= M64;
  243.         h ^= h >>> R64;

  244.         return h;
  245.     }

  246.     /**
  247.      * Generates a 64-bit hash from a string with a default seed.
  248.      * <p>
  249.      * Before 1.14 the string was converted using default encoding.
  250.      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
  251.      * </p>
  252.      * <p>
  253.      * This is a helper method that will produce the same result as:
  254.      * </p>
  255.      *
  256.      * <pre>
  257.      * int seed = 0xe17a1465;
  258.      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
  259.      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
  260.      * </pre>
  261.      *
  262.      * @param text The input string
  263.      * @return The 64-bit hash
  264.      * @see #hash64(byte[], int, int)
  265.      */
  266.     public static long hash64(final String text) {
  267.         final byte[] bytes = StringUtils.getBytesUtf8(text);
  268.         return hash64(bytes, bytes.length);
  269.     }

  270.     /**
  271.      * Generates a 64-bit hash from a substring with a default seed value.
  272.      * The string is converted to bytes using the default encoding.
  273.      * This is a helper method that will produce the same result as:
  274.      *
  275.      * <pre>
  276.      * int seed = 0xe17a1465;
  277.      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
  278.      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
  279.      * </pre>
  280.      *
  281.      * @param text The input string
  282.      * @param from The starting index
  283.      * @param length The length of the substring
  284.      * @return The 64-bit hash
  285.      * @see #hash64(byte[], int, int)
  286.      */
  287.     public static long hash64(final String text, final int from, final int length) {
  288.         return hash64(text.substring(from, from + length));
  289.     }

  290.     /** No instance methods. */
  291.     private MurmurHash2() {
  292.     }
  293. }