MurmurHash2.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.digest;

  18. import org.apache.commons.codec.binary.StringUtils;

  19. /**
  20.  * Implements the MurmurHash2 32-bit and 64-bit hash functions.
  21.  *
  22.  * <p>MurmurHash is a non-cryptographic hash function suitable for general
  23.  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  24.  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  25.  * it is not specifically designed to be difficult to reverse by an adversary,
  26.  * making it unsuitable for cryptographic purposes.</p>
  27.  *
  28.  * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
  29.  * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
  30.  * original {@code c++} code in SMHasher.</p>
  31.  *
  32.  * <p>This is a re-implementation of the original C code plus some additional
  33.  * features.</p>
  34.  *
  35.  * <p>This is public domain code with no copyrights. From home page of
  36.  * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
  37.  *
  38.  * <blockquote>
  39.  * "All MurmurHash versions are public domain software, and the author
  40.  * disclaims all copyright to their code."
  41.  * </blockquote>
  42.  *
  43.  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
  44.  * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
  45.  *   Original MurmurHash2 c++ code</a>
  46.  * @since 1.13
  47.  */
  48. public final class MurmurHash2 {

  49.     // Constants for 32-bit variant
  50.     private static final int M32 = 0x5bd1e995;
  51.     private static final int R32 = 24;

  52.     // Constants for 64-bit variant
  53.     private static final long M64 = 0xc6a4a7935bd1e995L;
  54.     private static final int R64 = 47;

  55.     /**
  56.      * Generates a 32-bit hash from byte array with the given length and a default seed value.
  57.      * This is a helper method that will produce the same result as:
  58.      *
  59.      * <pre>
  60.      * int seed = 0x9747b28c;
  61.      * int hash = MurmurHash2.hash32(data, length, seed);
  62.      * </pre>
  63.      *
  64.      * @param data The input byte array
  65.      * @param length The length of the array
  66.      * @return The 32-bit hash
  67.      * @see #hash32(byte[], int, int)
  68.      */
  69.     public static int hash32(final byte[] data, final int length) {
  70.         return hash32(data, length, 0x9747b28c);
  71.     }

  72.     /**
  73.      * Generates a 32-bit hash from byte array with the given length and seed.
  74.      *
  75.      * @param data The input byte array
  76.      * @param length The length of the array
  77.      * @param seed The initial seed value
  78.      * @return The 32-bit hash
  79.      */
  80.     public static int hash32(final byte[] data, final int length, final int seed) {
  81.         // Initialize the hash to a random value
  82.         int h = seed ^ length;
  83.         // Mix 4 bytes at a time into the hash
  84.         final int nblocks = length >> 2;
  85.         // body
  86.         for (int i = 0; i < nblocks; i++) {
  87.             final int index = i << 2;
  88.             int k = MurmurHash.getLittleEndianInt(data, index);
  89.             k *= M32;
  90.             k ^= k >>> R32;
  91.             k *= M32;
  92.             h *= M32;
  93.             h ^= k;
  94.         }
  95.         // Handle the last few bytes of the input array
  96.         final int index = nblocks << 2;
  97.         switch (length - index) {
  98.         case 3:
  99.             h ^= (data[index + 2] & 0xff) << 16;
  100.             // falls-through
  101.         case 2:
  102.             h ^= (data[index + 1] & 0xff) << 8;
  103.             // falls-through
  104.         case 1:
  105.             h ^= data[index] & 0xff;
  106.             h *= M32;
  107.         }
  108.         // Do a few final mixes of the hash to ensure the last few
  109.         // bytes are well-incorporated.
  110.         h ^= h >>> 13;
  111.         h *= M32;
  112.         h ^= h >>> 15;
  113.         return h;
  114.     }

  115.     /**
  116.      * Generates a 32-bit hash from a string with a default seed.
  117.      * <p>
  118.      * Before 1.14 the string was converted using default encoding.
  119.      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
  120.      * </p>
  121.      * This is a helper method that will produce the same result as:
  122.      *
  123.      * <pre>
  124.      * int seed = 0x9747b28c;
  125.      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
  126.      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
  127.      * </pre>
  128.      *
  129.      * @param text The input string
  130.      * @return The 32-bit hash
  131.      * @see #hash32(byte[], int, int)
  132.      */
  133.     public static int hash32(final String text) {
  134.         final byte[] bytes = StringUtils.getBytesUtf8(text);
  135.         return hash32(bytes, bytes.length);
  136.     }

  137.     /**
  138.      * Generates a 32-bit hash from a substring with a default seed value.
  139.      * The string is converted to bytes using the default encoding.
  140.      * This is a helper method that will produce the same result as:
  141.      *
  142.      * <pre>
  143.      * int seed = 0x9747b28c;
  144.      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
  145.      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
  146.      * </pre>
  147.      *
  148.      * @param text The input string
  149.      * @param from The starting index
  150.      * @param length The length of the substring
  151.      * @return The 32-bit hash
  152.      * @see #hash32(byte[], int, int)
  153.      */
  154.     public static int hash32(final String text, final int from, final int length) {
  155.         return hash32(text.substring(from, from + length));
  156.     }

  157.     /**
  158.      * Generates a 64-bit hash from byte array with given length and a default seed value.
  159.      * This is a helper method that will produce the same result as:
  160.      *
  161.      * <pre>
  162.      * int seed = 0xe17a1465;
  163.      * int hash = MurmurHash2.hash64(data, length, seed);
  164.      * </pre>
  165.      *
  166.      * @param data The input byte array
  167.      * @param length The length of the array
  168.      * @return The 64-bit hash
  169.      * @see #hash64(byte[], int, int)
  170.      */
  171.     public static long hash64(final byte[] data, final int length) {
  172.         return hash64(data, length, 0xe17a1465);
  173.     }

  174.     /**
  175.      * Generates a 64-bit hash from byte array of the given length and seed.
  176.      *
  177.      * @param data The input byte array
  178.      * @param length The length of the array
  179.      * @param seed The initial seed value
  180.      * @return The 64-bit hash of the given array
  181.      */
  182.     public static long hash64(final byte[] data, final int length, final int seed) {
  183.         long h = seed & 0xffffffffL ^ length * M64;
  184.         final int nblocks = length >> 3;
  185.         // body
  186.         for (int i = 0; i < nblocks; i++) {
  187.             final int index = i << 3;
  188.             long k = MurmurHash.getLittleEndianLong(data, index);

  189.             k *= M64;
  190.             k ^= k >>> R64;
  191.             k *= M64;

  192.             h ^= k;
  193.             h *= M64;
  194.         }
  195.         final int index = nblocks << 3;
  196.         switch (length - index) {
  197.         case 7:
  198.             h ^= ((long) data[index + 6] & 0xff) << 48;
  199.             // falls-through
  200.         case 6:
  201.             h ^= ((long) data[index + 5] & 0xff) << 40;
  202.             // falls-through
  203.         case 5:
  204.             h ^= ((long) data[index + 4] & 0xff) << 32;
  205.             // falls-through
  206.         case 4:
  207.             h ^= ((long) data[index + 3] & 0xff) << 24;
  208.             // falls-through
  209.         case 3:
  210.             h ^= ((long) data[index + 2] & 0xff) << 16;
  211.             // falls-through
  212.         case 2:
  213.             h ^= ((long) data[index + 1] & 0xff) << 8;
  214.             // falls-through
  215.         case 1:
  216.             h ^= (long) data[index] & 0xff;
  217.             h *= M64;
  218.         }
  219.         h ^= h >>> R64;
  220.         h *= M64;
  221.         h ^= h >>> R64;
  222.         return h;
  223.     }

  224.     /**
  225.      * Generates a 64-bit hash from a string with a default seed.
  226.      * <p>
  227.      * Before 1.14 the string was converted using default encoding.
  228.      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
  229.      * </p>
  230.      * <p>
  231.      * This is a helper method that will produce the same result as:
  232.      * </p>
  233.      *
  234.      * <pre>
  235.      * int seed = 0xe17a1465;
  236.      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
  237.      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
  238.      * </pre>
  239.      *
  240.      * @param text The input string
  241.      * @return The 64-bit hash
  242.      * @see #hash64(byte[], int, int)
  243.      */
  244.     public static long hash64(final String text) {
  245.         final byte[] bytes = StringUtils.getBytesUtf8(text);
  246.         return hash64(bytes, bytes.length);
  247.     }

  248.     /**
  249.      * Generates a 64-bit hash from a substring with a default seed value.
  250.      * The string is converted to bytes using the default encoding.
  251.      * This is a helper method that will produce the same result as:
  252.      *
  253.      * <pre>
  254.      * int seed = 0xe17a1465;
  255.      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
  256.      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
  257.      * </pre>
  258.      *
  259.      * @param text The input string
  260.      * @param from The starting index
  261.      * @param length The length of the substring
  262.      * @return The 64-bit hash
  263.      * @see #hash64(byte[], int, int)
  264.      */
  265.     public static long hash64(final String text, final int from, final int length) {
  266.         return hash64(text.substring(from, from + length));
  267.     }

  268.     /** No instance methods. */
  269.     private MurmurHash2() {
  270.     }
  271. }