SingleLookupTranslator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.translate;

  18. import java.io.IOException;
  19. import java.io.Writer;
  20. import java.util.HashMap;
  21. import java.util.HashSet;

  22. /**
  23.  * Translates a value using a lookup table.
  24.  * But doesn't translate if that value is already translated.
  25.  *
  26.  * @since 1.0
  27.  */
  28. public class SingleLookupTranslator extends CharSequenceTranslator {

  29.     private final HashMap<String, String> lookupMap;
  30.     private final HashSet<Character>      prefixSet;
  31.     private final int                     shortest;
  32.     private final int                     longest;
  33.     private final int                     shortestValue;
  34.     private final int                     longestValue;

  35.     /**
  36.      * Define the look tables to be used in translation.
  37.      *
  38.      * Note that, as of Lang 3.1, the key to the lookup table is converted to a
  39.      * java.lang.String. This is because we need the key to support hashCode and
  40.      * equals(Object), allowing it to be the key for a HashMap. See LANG-882.
  41.      *
  42.      * Also note that, multiple lookup tables should be passed to this translator
  43.      * instead of passing multiple instances of this translator to the
  44.      * AggregateTranslator. Because, this translator only checks the values of the
  45.      * lookup table passed to this instance while deciding whether a value is
  46.      * already translated or not.
  47.      *
  48.      * @param inputArrays, an array of string arrays.
  49.      */
  50.     public SingleLookupTranslator(final String[][]... inputArrays) {
  51.         String[][] lookup = new String[0][];
  52.         for (String[][] input : inputArrays) {
  53.             lookup = append(lookup, input);
  54.         }
  55.         lookupMap = new HashMap<String, String>();
  56.         prefixSet = new HashSet<Character>();
  57.         int _shortest = Integer.MAX_VALUE;
  58.         int _longest = 0;
  59.         int _shortestValue = Integer.MAX_VALUE;
  60.         int _longestValue = 0;
  61.         if (lookup != null) {
  62.             for (final CharSequence[] seq : lookup) {
  63.                 this.lookupMap.put(seq[0].toString(), seq[1].toString());
  64.                 this.prefixSet.add(seq[0].charAt(0));
  65.                 final int sz = seq[0].length();
  66.                 if (sz < _shortest) {
  67.                     _shortest = sz;
  68.                 }
  69.                 if (sz > _longest) {
  70.                     _longest = sz;
  71.                 }
  72.                 final int sizeOfValue = seq[1].length();
  73.                 if (sizeOfValue < _shortestValue) {
  74.                     _shortestValue = sizeOfValue;
  75.                 }
  76.                 if (sizeOfValue > _longestValue) {
  77.                     _longestValue = sizeOfValue;
  78.                 }
  79.             }
  80.         }
  81.         shortest = _shortest;
  82.         longest = _longest;
  83.         shortestValue = _shortestValue;
  84.         longestValue = _longestValue;
  85.     }

  86.     private static String[][] append(String[][] a, String[][] b) {
  87.         String[][] result = new String[a.length + b.length][];
  88.         System.arraycopy(a, 0, result, 0, a.length);
  89.         System.arraycopy(b, 0, result, a.length, b.length);
  90.         return result;
  91.     }

  92.     /**
  93.      * Translate a set of codepoints, represented by an int index into a CharSequence,
  94.      * into another set of codepoints. The number of codepoints consumed must be returned,
  95.      * and the only IOExceptions thrown must be from interacting with the Writer so that
  96.      * the top level API may reliably ignore StringWriter IOExceptions.
  97.      *
  98.      * @param input CharSequence that is being translated
  99.      * @param index int representing the current point of translation
  100.      * @param out   Writer to translate the text to
  101.      * @return int count of codepoints consumed
  102.      * @throws IOException if and only if the Writer produces an IOException
  103.      */
  104.     @Override
  105.     public int translate(CharSequence input, int index, Writer out) throws IOException {
  106.         // check if already translated
  107.         int maxValue = longestValue;
  108.         if (index + maxValue > input.length()) {
  109.             maxValue = input.length() - index;
  110.         }
  111.         // implement greedy algorithm to check all the possible 'value' matches for which we need to skip translation.
  112.         for (int i = maxValue; i >= shortestValue; i--) {
  113.             final CharSequence subSeq = input.subSequence(index, index + i);
  114.             // If the sub-string is already translated, return without translating.
  115.             if (lookupMap.containsValue(subSeq.toString())) {
  116.                 return 0;
  117.             }
  118.         }

  119.         // check if translation exists for the input at position index
  120.         if (prefixSet.contains(input.charAt(index))) {
  121.             int max = longest;
  122.             if (index + longest > input.length()) {
  123.                 max = input.length() - index;
  124.             }
  125.             // implement greedy algorithm by trying maximum match first
  126.             for (int i = max; i >= shortest; i--) {
  127.                 final CharSequence subSeq = input.subSequence(index, index + i);
  128.                 final String result = lookupMap.get(subSeq.toString());

  129.                 if (result != null) {
  130.                     out.write(result);
  131.                     return i;
  132.                 }
  133.             }
  134.         }
  135.         return 0;
  136.     }
  137. }