DoubleMetaphone.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.codec.language;
- import org.apache.commons.codec.EncoderException;
- import org.apache.commons.codec.StringEncoder;
- import org.apache.commons.codec.binary.StringUtils;
- /**
- * Encodes a string into a double metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
- * Philips</CITE>.
- * <p>
- * This class is conditionally thread-safe. The instance field for the maximum code length is mutable
- * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is
- * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
- * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
- * </p>
- *
- * @see <a href="https://drdobbs.com/the-double-metaphone-search-algorithm/184401251?pgno=2">Dr. Dobbs Original Article</a>
- * @see <a href="https://en.wikipedia.org/wiki/Metaphone">Wikipedia Metaphone</a>
- */
- public class DoubleMetaphone implements StringEncoder {
- /**
-  * Accumulates the primary and the alternate code while a value is being encoded.
-  * <p>
-  * Both codes are capped at the {@code maxLength} passed to the constructor: single characters are dropped once a
-  * code is full, and strings are truncated to the remaining room. A {@code maxLength} of zero or less yields a result
-  * that stays empty and reports itself as complete.
-  * </p>
-  */
- public class DoubleMetaphoneResult {
-     private final StringBuilder primary;
-     private final StringBuilder alternate;
-     private final int maxLength;
-     /**
-      * Constructs a new instance.
-      *
-      * @param maxLength The maximum length of each code.
-      */
-     public DoubleMetaphoneResult(final int maxLength) {
-         this.maxLength = maxLength;
-         // Size the buffers from the limit this result actually enforces, and clamp it so that a
-         // negative limit cannot make the StringBuilder constructor throw NegativeArraySizeException.
-         final int capacity = Math.max(maxLength, 0);
-         this.primary = new StringBuilder(capacity);
-         this.alternate = new StringBuilder(capacity);
-     }
-     /**
-      * Appends the given value as primary and alternative.
-      *
-      * @param value The value to append.
-      */
-     public void append(final char value) {
-         appendPrimary(value);
-         appendAlternate(value);
-     }
-     /**
-      * Appends the given primary and alternative values.
-      *
-      * @param primary The primary value.
-      * @param alternate The alternate value.
-      */
-     public void append(final char primary, final char alternate) {
-         appendPrimary(primary);
-         appendAlternate(alternate);
-     }
-     /**
-      * Appends the given value as primary and alternative.
-      *
-      * @param value The value to append.
-      */
-     public void append(final String value) {
-         appendPrimary(value);
-         appendAlternate(value);
-     }
-     /**
-      * Appends the given primary and alternative values.
-      *
-      * @param primary The primary value.
-      * @param alternate The alternate value.
-      */
-     public void append(final String primary, final String alternate) {
-         appendPrimary(primary);
-         appendAlternate(alternate);
-     }
-     /**
-      * Appends the given value as alternative; ignored when the alternate code is already full.
-      *
-      * @param value The value to append.
-      */
-     public void appendAlternate(final char value) {
-         if (this.alternate.length() < this.maxLength) {
-             this.alternate.append(value);
-         }
-     }
-     /**
-      * Appends the given value as alternative, truncated to the remaining room.
-      *
-      * @param value The value to append.
-      */
-     public void appendAlternate(final String value) {
-         appendCapped(this.alternate, value);
-     }
-     /**
-      * Appends the given value as primary; ignored when the primary code is already full.
-      *
-      * @param value The value to append.
-      */
-     public void appendPrimary(final char value) {
-         if (this.primary.length() < this.maxLength) {
-             this.primary.append(value);
-         }
-     }
-     /**
-      * Appends the given value as primary, truncated to the remaining room.
-      *
-      * @param value The value to append.
-      */
-     public void appendPrimary(final String value) {
-         appendCapped(this.primary, value);
-     }
-     /**
-      * Appends as much of {@code value} to {@code target} as still fits below {@code maxLength}.
-      */
-     private void appendCapped(final StringBuilder target, final String value) {
-         final int room = this.maxLength - target.length();
-         if (value.length() <= room) {
-             target.append(value);
-         } else if (room > 0) {
-             // Only truncate when there is room; a non-positive room would be an invalid end index.
-             target.append(value, 0, room);
-         }
-     }
-     /**
-      * Gets the alternate string.
-      *
-      * @return the alternate string.
-      */
-     public String getAlternate() {
-         return this.alternate.toString();
-     }
-     /**
-      * Gets the primary string.
-      *
-      * @return the primary string.
-      */
-     public String getPrimary() {
-         return this.primary.toString();
-     }
-     /**
-      * Tests whether this result is complete, that is, whether both codes have reached the maximum length.
-      *
-      * @return whether this result is complete.
-      */
-     public boolean isComplete() {
-         return this.primary.length() >= this.maxLength && this.alternate.length() >= this.maxLength;
-     }
- }
- /**
- * Characters that {@code isVowel} treats as vowels; note that 'Y' is included.
- */
- private static final String VOWELS = "AEIOUY";
- /**
- * Two-letter prefixes whose first letter is not pronounced; when a value starts with one of them,
- * encoding begins at the second character.
- */
- private static final String[] SILENT_START = { "GN", "KN", "PN", "WR", "PS" };
- private static final String[] L_R_N_M_B_H_F_V_W_SPACE = { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; // tested against the character that follows "CH" in conditionCH1
- private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; // two-letter sequences tested right after an initial 'G' in handleG
- private static final String[] L_T_K_S_N_M_B_Z = { "L", "T", "K", "S", "N", "M", "B", "Z" }; // tested against the character that follows 'J' in handleJ
- /**
-  * Tests whether the {@code length} characters of {@code value} beginning at {@code start} are equal to any one of
-  * the {@code criteria}.
-  * <p>
-  * A window that does not lie completely inside {@code value} never matches.
-  * </p>
-  *
-  * @param value The value to test.
-  * @param start Where in {@code value} the window starts.
-  * @param length How many characters the window spans.
-  * @param criteria The candidate strings.
-  * @return Whether the window equals one of the candidates.
-  */
- protected static boolean contains(final String value, final int start, final int length, final String... criteria) {
-     if (start < 0 || start + length > value.length()) {
-         return false;
-     }
-     final String window = value.substring(start, start + length);
-     for (final String candidate : criteria) {
-         if (window.equals(candidate)) {
-             return true;
-         }
-     }
-     return false;
- }
- /**
- * Maximum length of an encoding, default is 4. Read through {@link #getMaxCodeLen()} and changed through
- * {@link #setMaxCodeLen(int)}; the field is neither volatile nor guarded by a lock.
- */
- private int maxCodeLen = 4;
- /**
- * Constructs a new instance that uses the default maximum code length of 4.
- */
- public DoubleMetaphone() {
- // empty: maxCodeLen keeps the value from its field initializer
- }
- /**
- * Gets the character at index {@code index} if available, or {@link Character#MIN_VALUE} if out of bounds.
- *
- * @param value The String to query.
- * @param index A string index.
- * @return The character at the index or {@link Character#MIN_VALUE} if out of bounds.
- */
- protected char charAt(final String value, final int index) {
- if (index < 0 || index >= value.length()) {
- return Character.MIN_VALUE;
- }
- return value.charAt(index);
- }
- /**
- * Cleans the input.
- */
- private String cleanInput(String input) {
- if (input == null) {
- return null;
- }
- input = input.trim();
- if (input.isEmpty()) {
- return null;
- }
- return input.toUpperCase(java.util.Locale.ENGLISH);
- }
- /**
-  * Complex condition 0 for 'C': decides whether the 'C' at {@code index} is encoded as 'K' by the first rule of
-  * {@code handleC}.
-  * <p>
-  * True for "CHIA" at {@code index}; otherwise true for "ACH" around {@code index} when the character two
-  * positions back is not a vowel and the "ACH" is not followed by 'I' or 'E', except in "BACHER" and "MACHER".
-  * </p>
-  *
-  * @param value The cleaned value being encoded.
-  * @param index Position of the 'C'.
-  * @return Whether the condition holds.
-  */
- private boolean conditionC0(final String value, final int index) {
-     if (contains(value, index, 4, "CHIA")) {
-         return true;
-     }
-     final boolean achAfterNonVowel = index > 1 &&
-             !isVowel(charAt(value, index - 2)) &&
-             contains(value, index - 1, 3, "ACH");
-     if (!achAfterNonVowel) {
-         return false;
-     }
-     final char afterCh = charAt(value, index + 2);
-     if (afterCh == 'I' || afterCh == 'E') {
-         return contains(value, index - 2, 6, "BACHER", "MACHER");
-     }
-     return true;
- }
- /**
-  * Complex condition 0 for 'CH': true only for a "CH" at the very start of the value that is followed by one of
-  * "HARAC", "HARIS", "HOR", "HYM", "HIA" or "HEM" (counted from the 'H'), unless the value starts with "CHORE".
-  *
-  * @param value The cleaned value being encoded.
-  * @param index Position of the 'C'.
-  * @return Whether the condition holds.
-  */
- private boolean conditionCH0(final String value, final int index) {
-     return index == 0 &&
-             (contains(value, index + 1, 5, "HARAC", "HARIS") ||
-              contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) &&
-             !contains(value, 0, 5, "CHORE");
- }
- /**
- * Complex condition 1 for 'CH'.
- */
- private boolean conditionCH1(final String value, final int index) {
- return contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH") ||
- contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
- contains(value, index + 2, 1, "T", "S") ||
- (contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
- (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1);
- }
- /**
- * Complex condition 0 for 'L'.
- */
- private boolean conditionL0(final String value, final int index) {
- if (index == value.length() - 3 &&
- contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
- return true;
- }
- return (contains(value, value.length() - 2, 2, "AS", "OS") ||
- contains(value, value.length() - 1, 1, "A", "O")) &&
- contains(value, index - 1, 4, "ALLE");
- }
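- //-- Legacy section marker "BEGIN HANDLERS": members are now ordered alphabetically, so it no longer marks where the handlers start --//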
- /**
- * Complex condition 0 for 'M'.
- */
- private boolean conditionM0(final String value, final int index) {
- if (charAt(value, index + 1) == 'M') {
- return true;
- }
- return contains(value, index - 1, 3, "UMB") &&
- (index + 1 == value.length() - 1 || contains(value, index + 2, 2, "ER"));
- }
- /**
- * Encodes a value with Double Metaphone and returns the primary code.
- * <p>
- * Delegates to {@link #doubleMetaphone(String, boolean)} with {@code alternate} set to {@code false}.
- * </p>
- *
- * @param value String to encode, may be {@code null}
- * @return the primary encoding, or {@code null} when {@code value} is {@code null} or empty after trimming
- */
- public String doubleMetaphone(final String value) {
- return doubleMetaphone(value, false);
- }
- /**
-  * Encodes a value with Double Metaphone, returning either the primary or the alternate code.
-  * <p>
-  * The value is trimmed and upper-cased, then scanned left to right. Each character is dispatched to its rule,
-  * which appends to the result and reports the position at which scanning resumes. Scanning stops at the end of the
-  * value or as soon as both codes have reached the maximum code length.
-  * </p>
-  *
-  * @param value String to encode, may be {@code null}
-  * @param alternate use alternate encode
-  * @return an encoded string, or {@code null} when {@code value} is {@code null} or empty after trimming
-  */
- public String doubleMetaphone(String value, final boolean alternate) {
-     final String text = cleanInput(value);
-     if (text == null) {
-         return null;
-     }
-     final boolean slavoGermanic = isSlavoGermanic(text);
-     final int length = text.length();
-     // A silent first letter is skipped outright.
-     int pos = isSilentStart(text) ? 1 : 0;
-     final DoubleMetaphoneResult codes = new DoubleMetaphoneResult(getMaxCodeLen());
-     while (pos < length && !codes.isComplete()) {
-         final char current = text.charAt(pos);
-         switch (current) {
-         case 'A':
-         case 'E':
-         case 'I':
-         case 'O':
-         case 'U':
-         case 'Y':
-             pos = handleAEIOUY(codes, pos);
-             break;
-         case 'B':
-             codes.append('P');
-             pos += (charAt(text, pos + 1) == 'B') ? 2 : 1;
-             break;
-         case '\u00C7':
-             // A C with a Cedilla
-             codes.append('S');
-             pos++;
-             break;
-         case 'C':
-             pos = handleC(text, codes, pos);
-             break;
-         case 'D':
-             pos = handleD(text, codes, pos);
-             break;
-         case 'F':
-         case 'K':
-         case 'N':
-             // These letters encode as themselves; a doubled letter is consumed as one.
-             codes.append(current);
-             pos += (charAt(text, pos + 1) == current) ? 2 : 1;
-             break;
-         case 'G':
-             pos = handleG(text, codes, pos, slavoGermanic);
-             break;
-         case 'H':
-             pos = handleH(text, codes, pos);
-             break;
-         case 'J':
-             pos = handleJ(text, codes, pos, slavoGermanic);
-             break;
-         case 'L':
-             pos = handleL(text, codes, pos);
-             break;
-         case 'M':
-             codes.append('M');
-             pos += conditionM0(text, pos) ? 2 : 1;
-             break;
-         case '\u00D1':
-             // N with a tilde (spanish ene)
-             codes.append('N');
-             pos++;
-             break;
-         case 'P':
-             pos = handleP(text, codes, pos);
-             break;
-         case 'Q':
-             codes.append('K');
-             pos += (charAt(text, pos + 1) == 'Q') ? 2 : 1;
-             break;
-         case 'R':
-             pos = handleR(text, codes, pos, slavoGermanic);
-             break;
-         case 'S':
-             pos = handleS(text, codes, pos, slavoGermanic);
-             break;
-         case 'T':
-             pos = handleT(text, codes, pos);
-             break;
-         case 'V':
-             codes.append('F');
-             pos += (charAt(text, pos + 1) == 'V') ? 2 : 1;
-             break;
-         case 'W':
-             pos = handleW(text, codes, pos);
-             break;
-         case 'X':
-             pos = handleX(text, codes, pos);
-             break;
-         case 'Z':
-             pos = handleZ(text, codes, pos, slavoGermanic);
-             break;
-         default:
-             // Any other character contributes nothing to the code.
-             pos++;
-             break;
-         }
-     }
-     if (alternate) {
-         return codes.getAlternate();
-     }
-     return codes.getPrimary();
- }
- /**
-  * Encodes the value using DoubleMetaphone. Only {@code String} arguments are accepted (like {@code Metaphone});
-  * anything else, including {@code null}, is rejected.
-  *
-  * @param obj Object to encode (should be of type String)
-  * @return An encoded Object (will be of type String)
-  * @throws EncoderException encode parameter is not of type String
-  */
- @Override
- public Object encode(final Object obj) throws EncoderException {
-     if (obj instanceof String) {
-         return doubleMetaphone((String) obj);
-     }
-     throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
- }
- /**
- * Encodes the value using DoubleMetaphone and returns the primary code; delegates to
- * {@link #doubleMetaphone(String)}.
- *
- * @param value String to encode, may be {@code null}
- * @return An encoded String, or {@code null} when {@code value} is {@code null} or empty after trimming
- */
- @Override
- public String encode(final String value) {
- return doubleMetaphone(value);
- }
- /**
- * Returns the maxCodeLen, the maximum length of the codes produced by this encoder.
- *
- * @return the current maximum code length
- */
- public int getMaxCodeLen() {
- return this.maxCodeLen;
- }
- /**
- * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
- */
- private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) {
- if (index == 0) {
- result.append('A');
- }
- return index + 1;
- }
- /**
- * Handles 'C' cases.
- */
- private int handleC(final String value, final DoubleMetaphoneResult result, int index) {
- if (conditionC0(value, index)) { // very confusing, moved out
- result.append('K');
- index += 2;
- } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
- result.append('S');
- index += 2;
- } else if (contains(value, index, 2, "CH")) {
- index = handleCH(value, result, index);
- } else if (contains(value, index, 2, "CZ") &&
- !contains(value, index - 2, 4, "WICZ")) {
- //-- "Czerny" --//
- result.append('S', 'X');
- index += 2;
- } else if (contains(value, index + 1, 3, "CIA")) {
- //-- "focaccia" --//
- result.append('X');
- index += 3;
- } else if (contains(value, index, 2, "CC") &&
- !(index == 1 && charAt(value, 0) == 'M')) {
- //-- double "cc" but not "McClelland" --//
- return handleCC(value, result, index);
- } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
- result.append('K');
- index += 2;
- } else if (contains(value, index, 2, "CI", "CE", "CY")) {
- //-- Italian vs. English --//
- if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
- result.append('S', 'X');
- } else {
- result.append('S');
- }
- index += 2;
- } else {
- result.append('K');
- if (contains(value, index + 1, 2, " C", " Q", " G")) {
- //-- Mac Caffrey, Mac Gregor --//
- index += 3;
- } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
- !contains(value, index + 1, 2, "CE", "CI")) {
- index += 2;
- } else {
- index++;
- }
- }
- return index;
- }
- /**
- * Handles 'CC' cases.
- */
- private int handleCC(final String value, final DoubleMetaphoneResult result, int index) {
- if (contains(value, index + 2, 1, "I", "E", "H") &&
- !contains(value, index + 2, 2, "HU")) {
- //-- "bellocchio" but not "bacchus" --//
- if (index == 1 && charAt(value, index - 1) == 'A' ||
- contains(value, index - 1, 5, "UCCEE", "UCCES")) {
- //-- "accident", "accede", "succeed" --//
- result.append("KS");
- } else {
- //-- "bacci", "bertucci", other Italian --//
- result.append('X');
- }
- index += 3;
- } else { // Pierce's rule
- result.append('K');
- index += 2;
- }
- return index;
- }
- /**
-  * Handles 'CH' cases. Every rule consumes exactly the two characters "CH"; the rules differ only in what they
-  * append.
-  *
-  * @param value The cleaned value being encoded.
-  * @param result The codes built so far.
-  * @param index Position of the 'C'.
-  * @return The position at which scanning resumes.
-  */
- private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) {
-     if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
-         result.append('K', 'X');
-     } else if (conditionCH0(value, index) || conditionCH1(value, index)) {
-         //-- Greek roots ("chemistry", "chorus", etc.) --//
-         //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
-         result.append('K');
-     } else if (index == 0) {
-         result.append('X');
-     } else if (contains(value, 0, 2, "MC")) {
-         result.append('K');
-     } else {
-         result.append('X', 'K');
-     }
-     return index + 2;
- }
- /**
-  * Handles 'D' cases.
-  *
-  * @param value The cleaned value being encoded.
-  * @param result The codes built so far.
-  * @param index Position of the 'D'.
-  * @return The position at which scanning resumes.
-  */
- private int handleD(final String value, final DoubleMetaphoneResult result, int index) {
-     if (contains(value, index, 2, "DG")) {
-         if (contains(value, index + 2, 1, "I", "E", "Y")) {
-             //-- "Edge" --//
-             result.append('J');
-             return index + 3;
-         }
-         //-- "Edgar" --//
-         result.append("TK");
-         return index + 2;
-     }
-     // Plain 'D' and the pairs "DT"/"DD" all encode as a single 'T'.
-     result.append('T');
-     return contains(value, index, 2, "DT", "DD") ? index + 2 : index + 1;
- }
- /**
- * Handles 'G' cases.
- */
- private int handleG(final String value, final DoubleMetaphoneResult result, int index,
- final boolean slavoGermanic) {
- if (charAt(value, index + 1) == 'H') {
- index = handleGH(value, result, index);
- } else if (charAt(value, index + 1) == 'N') {
- if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
- result.append("KN", "N");
- } else if (!contains(value, index + 2, 2, "EY") &&
- charAt(value, index + 1) != 'Y' && !slavoGermanic) {
- result.append("N", "KN");
- } else {
- result.append("KN");
- }
- index += 2;
- } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
- result.append("KL", "L");
- index += 2;
- } else if (index == 0 &&
- (charAt(value, index + 1) == 'Y' ||
- contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
- //-- -ges-, -gep-, -gel-, -gie- at beginning --//
- result.append('K', 'J');
- index += 2;
- } else if ((contains(value, index + 1, 2, "ER") ||
- charAt(value, index + 1) == 'Y') &&
- !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
- !contains(value, index - 1, 1, "E", "I") &&
- !contains(value, index - 1, 3, "RGY", "OGY")) {
- //-- -ger-, -gy- --//
- result.append('K', 'J');
- index += 2;
- } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
- contains(value, index - 1, 4, "AGGI", "OGGI")) {
- //-- Italian "biaggi" --//
- if (contains(value, 0, 4, "VAN ", "VON ") ||
- contains(value, 0, 3, "SCH") ||
- contains(value, index + 1, 2, "ET")) {
- //-- obvious germanic --//
- result.append('K');
- } else if (contains(value, index + 1, 3, "IER")) {
- result.append('J');
- } else {
- result.append('J', 'K');
- }
- index += 2;
- } else {
- if (charAt(value, index + 1) == 'G') {
- index += 2;
- } else {
- index++;
- }
- result.append('K');
- }
- return index;
- }
- /**
- * Handles 'GH' cases.
- */
- private int handleGH(final String value, final DoubleMetaphoneResult result, int index) {
- if (index > 0 && !isVowel(charAt(value, index - 1))) {
- result.append('K');
- index += 2;
- } else if (index == 0) {
- if (charAt(value, index + 2) == 'I') {
- result.append('J');
- } else {
- result.append('K');
- }
- index += 2;
- } else if (index > 1 && contains(value, index - 2, 1, "B", "H", "D") ||
- index > 2 && contains(value, index - 3, 1, "B", "H", "D") ||
- index > 3 && contains(value, index - 4, 1, "B", "H")) {
- //-- Parker's rule (with some further refinements) - "hugh"
- index += 2;
- } else {
- if (index > 2 && charAt(value, index - 1) == 'U' &&
- contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
- //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
- result.append('F');
- } else if (index > 0 && charAt(value, index - 1) != 'I') {
- result.append('K');
- }
- index += 2;
- }
- return index;
- }
- /**
- * Handles 'H' cases.
- */
- private int handleH(final String value, final DoubleMetaphoneResult result, int index) {
- //-- only keep if first & before vowel or between 2 vowels --//
- if ((index == 0 || isVowel(charAt(value, index - 1))) &&
- isVowel(charAt(value, index + 1))) {
- result.append('H');
- index += 2;
- //-- also takes car of "HH" --//
- } else {
- index++;
- }
- return index;
- }
- /**
- * Handles 'J' cases.
- */
- private int handleJ(final String value, final DoubleMetaphoneResult result, int index,
- final boolean slavoGermanic) {
- if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
- //-- obvious Spanish, "Jose", "San Jacinto" --//
- if (index == 0 && charAt(value, index + 4) == ' ' ||
- value.length() == 4 || contains(value, 0, 4, "SAN ")) {
- result.append('H');
- } else {
- result.append('J', 'H');
- }
- index++;
- } else {
- if (index == 0 && !contains(value, index, 4, "JOSE")) {
- result.append('J', 'A');
- } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
- (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
- result.append('J', 'H');
- } else if (index == value.length() - 1) {
- result.append('J', ' ');
- } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
- !contains(value, index - 1, 1, "S", "K", "L")) {
- result.append('J');
- }
- if (charAt(value, index + 1) == 'J') {
- index += 2;
- } else {
- index++;
- }
- }
- return index;
- }
- /**
-  * Handles 'L' cases. A double 'L' is consumed as one; when {@code conditionL0} holds it is appended to the
-  * primary code only.
-  *
-  * @param value The cleaned value being encoded.
-  * @param result The codes built so far.
-  * @param index Position of the 'L'.
-  * @return The position at which scanning resumes.
-  */
- private int handleL(final String value, final DoubleMetaphoneResult result, int index) {
-     if (charAt(value, index + 1) != 'L') {
-         result.append('L');
-         return index + 1;
-     }
-     if (conditionL0(value, index)) {
-         result.appendPrimary('L');
-     } else {
-         result.append('L');
-     }
-     return index + 2;
- }
- /**
- * Handles 'P' cases.
- */
- private int handleP(final String value, final DoubleMetaphoneResult result, int index) {
- if (charAt(value, index + 1) == 'H') {
- result.append('F');
- index += 2;
- } else {
- result.append('P');
- index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
- }
- return index;
- }
- /**
- * Handles 'R' cases.
- */
- private int handleR(final String value, final DoubleMetaphoneResult result, final int index,
- final boolean slavoGermanic) {
- if (index == value.length() - 1 && !slavoGermanic &&
- contains(value, index - 2, 2, "IE") &&
- !contains(value, index - 4, 2, "ME", "MA")) {
- result.appendAlternate('R');
- } else {
- result.append('R');
- }
- return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
- }
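- //-- Legacy section marker "BEGIN CONDITIONS": members are now ordered alphabetically, so it no longer marks where the conditions start --//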
- /**
- * Handles 'S' cases.
- */
- private int handleS(final String value, final DoubleMetaphoneResult result, int index,
- final boolean slavoGermanic) {
- if (contains(value, index - 1, 3, "ISL", "YSL")) {
- //-- special cases "island", "isle", "carlisle", "carlysle" --//
- index++;
- } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
- //-- special case "sugar-" --//
- result.append('X', 'S');
- index++;
- } else if (contains(value, index, 2, "SH")) {
- if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
- //-- germanic --//
- result.append('S');
- } else {
- result.append('X');
- }
- index += 2;
- } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
- //-- Italian and Armenian --//
- if (slavoGermanic) {
- result.append('S');
- } else {
- result.append('S', 'X');
- }
- index += 3;
- } else if (index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W") ||
- contains(value, index + 1, 1, "Z")) {
- //-- german & anglicisations, for example "smith" match "schmidt" //
- // "snider" match "schneider" --//
- //-- also, -sz- in slavic language although in hungarian it //
- // is pronounced "s" --//
- result.append('S', 'X');
- index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
- } else if (contains(value, index, 2, "SC")) {
- index = handleSC(value, result, index);
- } else {
- if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
- //-- french for example "resnais", "artois" --//
- result.appendAlternate('S');
- } else {
- result.append('S');
- }
- index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
- }
- return index;
- }
- /**
- * Handles 'SC' cases.
- */
- private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) {
- if (charAt(value, index + 2) == 'H') {
- //-- Schlesinger's rule --//
- if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
- //-- Dutch origin, for example "school", "schooner" --//
- if (contains(value, index + 3, 2, "ER", "EN")) {
- //-- "schermerhorn", "schenker" --//
- result.append("X", "SK");
- } else {
- result.append("SK");
- }
- } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
- result.append('X', 'S');
- } else {
- result.append('X');
- }
- } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
- result.append('S');
- } else {
- result.append("SK");
- }
- return index + 3;
- }
- /**
- * Handles 'T' cases.
- */
- private int handleT(final String value, final DoubleMetaphoneResult result, int index) {
- if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) {
- result.append('X');
- index += 3;
- } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
- if (contains(value, index + 2, 2, "OM", "AM") ||
- //-- special case "thomas", "thames" or germanic --//
- contains(value, 0, 4, "VAN ", "VON ") ||
- contains(value, 0, 3, "SCH")) {
- result.append('T');
- } else {
- result.append('0', 'T');
- }
- index += 2;
- } else {
- result.append('T');
- index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
- }
- return index;
- }
- /**
- * Handles 'W' cases.
- */
- private int handleW(final String value, final DoubleMetaphoneResult result, int index) {
- if (contains(value, index, 2, "WR")) {
- //-- can also be in middle of word --//
- result.append('R');
- index += 2;
- } else if (index == 0 && (isVowel(charAt(value, index + 1)) ||
- contains(value, index, 2, "WH"))) {
- if (isVowel(charAt(value, index + 1))) {
- //-- Wasserman should match Vasserman --//
- result.append('A', 'F');
- } else {
- //-- need Uomo to match Womo --//
- result.append('A');
- }
- index++;
- } else if (index == value.length() - 1 && isVowel(charAt(value, index - 1)) ||
- contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
- contains(value, 0, 3, "SCH")) {
- //-- Arnow should match Arnoff --//
- result.appendAlternate('F');
- index++;
- } else if (contains(value, index, 4, "WICZ", "WITZ")) {
- //-- Polish for example "filipowicz" --//
- result.append("TS", "FX");
- index += 4;
- } else {
- index++;
- }
- return index;
- }
- /**
- * Handles 'X' cases.
- */
- private int handleX(final String value, final DoubleMetaphoneResult result, int index) {
- if (index == 0) {
- result.append('S');
- index++;
- } else {
- if (!(index == value.length() - 1 &&
- (contains(value, index - 3, 3, "IAU", "EAU") ||
- contains(value, index - 2, 2, "AU", "OU")))) {
- //-- French for example breaux --//
- result.append("KS");
- }
- index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
- }
- return index;
- }
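- //-- Legacy section marker "BEGIN HELPER FUNCTIONS": members are now ordered alphabetically, so it no longer marks where the helpers start --//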
- /**
- * Handles 'Z' cases.
- */
- private int handleZ(final String value, final DoubleMetaphoneResult result, int index,
- final boolean slavoGermanic) {
- if (charAt(value, index + 1) == 'H') {
- //-- Chinese pinyin for example "zhao" or Angelina "Zhang" --//
- result.append('J');
- index += 2;
- } else {
- if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
- slavoGermanic && index > 0 && charAt(value, index - 1) != 'T') {
- result.append("S", "TS");
- } else {
- result.append('S');
- }
- index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
- }
- return index;
- }
- /**
- * Check if the primary Double Metaphone values of two {@code String} values
- * are equal. Delegates to {@link #isDoubleMetaphoneEqual(String,String,boolean)} with
- * {@code alternate} set to {@code false}.
- *
- * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
- * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
- * @return {@code true} if the encoded {@code String}s are equal;
- * {@code false} otherwise.
- * @see #isDoubleMetaphoneEqual(String,String,boolean)
- */
- public boolean isDoubleMetaphoneEqual(final String value1, final String value2) {
- return isDoubleMetaphoneEqual(value1, value2, false);
- }
- /**
-  * Check if the Double Metaphone values of two {@code String} values
-  * are equal, optionally using the alternate value. Both values are encoded with the same
-  * {@code alternate} setting and the two codes are then compared.
-  *
-  * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
-  * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
-  * @param alternate use the alternate value if {@code true}.
-  * @return {@code true} if the encoded {@code String}s are equal;
-  *         {@code false} otherwise.
-  */
- public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) {
-     final String left = doubleMetaphone(value1, alternate);
-     final String right = doubleMetaphone(value2, alternate);
-     return StringUtils.equals(left, right);
- }
- /**
-  * Determines whether or not the value starts with a silent letter. It will
-  * return {@code true} if the value starts with any of 'GN', 'KN',
-  * 'PN', 'WR' or 'PS'.
-  *
-  * @param value The cleaned value being encoded.
-  * @return Whether the value starts with one of the silent-start prefixes.
-  */
- private boolean isSilentStart(final String value) {
-     for (final String prefix : SILENT_START) {
-         if (value.startsWith(prefix)) {
-             return true;
-         }
-     }
-     return false;
- }
- /**
-  * Determines whether or not a value is of slavo-germanic origin. A value is
-  * of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
-  *
-  * @param value The cleaned value being encoded.
-  * @return Whether the value is classified as slavo-germanic.
-  */
- private boolean isSlavoGermanic(final String value) {
-     if (value.indexOf('W') >= 0 || value.indexOf('K') >= 0) {
-         return true;
-     }
-     return value.contains("CZ") || value.contains("WITZ");
- }
- /**
-  * Determines whether or not a character is a vowel, that is, one of the characters in {@code VOWELS}.
-  *
-  * @param ch The character to test.
-  * @return Whether the character is a vowel.
-  */
- private boolean isVowel(final char ch) {
-     return VOWELS.indexOf(ch) >= 0;
- }
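- //-- Legacy section marker "BEGIN INNER CLASSES": members are now ordered alphabetically, so it no longer marks where the inner classes start --//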
- /**
- * Sets the maxCodeLen, the maximum length of the codes produced by later encode calls.
- * The value is stored as given; no range check is performed here.
- *
- * @param maxCodeLen The maxCodeLen to set
- */
- public void setMaxCodeLen(final int maxCodeLen) {
- this.maxCodeLen = maxCodeLen;
- }
- }