/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.translate;

import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Translates a value using a lookup table.
 * But doesn't translate if that value is already translated.
 *
 * @since 1.0
 */
public class SingleLookupTranslator extends CharSequenceTranslator {

    /** Maps each lookup key to its replacement value. */
    private final Map<String, String> lookupMap;
    /** The replacement values held in {@link #lookupMap}, kept as a set for constant-time membership tests. */
    private final Set<String> valueSet;
    /** First character of every lookup key; used to reject non-matching positions cheaply. */
    private final Set<Character> prefixSet;
    /** Length of the shortest lookup key ({@code Integer.MAX_VALUE} when there are no entries). */
    private final int shortest;
    /** Length of the longest lookup key ({@code 0} when there are no entries). */
    private final int longest;
    /** Length of the shortest replacement value ({@code Integer.MAX_VALUE} when there are no entries). */
    private final int shortestValue;
    /** Length of the longest replacement value ({@code 0} when there are no entries). */
    private final int longestValue;

    /**
     * Define the lookup tables to be used in translation.
     *
     * Note that, as of Lang 3.1, the key to the lookup table is converted to a
     * java.lang.String. This is because we need the key to support hashCode and
     * equals(Object), allowing it to be the key for a HashMap. See LANG-882.
     *
     * Also note that, multiple lookup tables should be passed to this translator
     * instead of passing multiple instances of this translator to the
     * AggregateTranslator. Because, this translator only checks the values of the
     * lookup table passed to this instance while deciding whether a value is
     * already translated or not.
     *
     * <p>Each table row is read as {@code {key, value}}. When the same key occurs
     * more than once, the last occurrence wins. A {@code null} argument array or a
     * {@code null} table is skipped.</p>
     *
     * @param inputArrays an array of string arrays.
     */
    public SingleLookupTranslator(final String[][]... inputArrays) {
        lookupMap = new HashMap<String, String>();
        prefixSet = new HashSet<Character>();
        int minKeyLength = Integer.MAX_VALUE;
        int maxKeyLength = 0;
        int minValueLength = Integer.MAX_VALUE;
        int maxValueLength = 0;
        if (inputArrays != null) {
            for (final String[][] table : inputArrays) {
                if (table == null) {
                    continue;
                }
                for (final String[] pair : table) {
                    final String key = pair[0];
                    final String value = pair[1];
                    lookupMap.put(key, value);
                    prefixSet.add(key.charAt(0));

                    final int keyLength = key.length();
                    if (keyLength < minKeyLength) {
                        minKeyLength = keyLength;
                    }
                    if (keyLength > maxKeyLength) {
                        maxKeyLength = keyLength;
                    }

                    final int valueLength = value.length();
                    if (valueLength < minValueLength) {
                        minValueLength = valueLength;
                    }
                    if (valueLength > maxValueLength) {
                        maxValueLength = valueLength;
                    }
                }
            }
        }
        // Built from the finished map so that a value overwritten by a later
        // duplicate key is not treated as "already translated".
        valueSet = new HashSet<String>(lookupMap.values());
        shortest = minKeyLength;
        longest = maxKeyLength;
        shortestValue = minValueLength;
        longestValue = maxValueLength;
    }

    /**
     * Translate a set of codepoints, represented by an int index into a CharSequence,
     * into another set of codepoints. The number of codepoints consumed must be returned,
     * and the only IOExceptions thrown must be from interacting with the Writer so that
     * the top level API may reliably ignore StringWriter IOExceptions.
     *
     * <p>If the text starting at {@code index} is already one of the replacement
     * values, nothing is written and {@code 0} is returned. Otherwise the longest
     * lookup key that matches at {@code index} is replaced.</p>
     *
     * @param input CharSequence that is being translated
     * @param index int representing the current point of translation
     * @param out Writer to translate the text to
     * @return int count of codepoints consumed
     * @throws IOException if and only if the Writer produces an IOException
     */
    @Override
    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
        final int remaining = input.length() - index;

        // Check whether the text at this position is already a translated value.
        // Longest candidates are tried first. The lower bound is clamped to 1
        // because an empty replacement value would otherwise match the empty
        // subsequence at every position and disable translation entirely.
        final int maxValueLength = Math.min(longestValue, remaining);
        final int minValueLength = Math.max(shortestValue, 1);
        for (int len = maxValueLength; len >= minValueLength; len--) {
            final String candidate = input.subSequence(index, index + len).toString();
            if (valueSet.contains(candidate)) {
                return 0;
            }
        }

        // Check if a translation exists for the input at position index.
        if (prefixSet.contains(input.charAt(index))) {
            final int maxKeyLength = Math.min(longest, remaining);
            // Greedy: try the longest possible key first.
            for (int len = maxKeyLength; len >= shortest; len--) {
                final CharSequence subSeq = input.subSequence(index, index + len);
                final String result = lookupMap.get(subSeq.toString());

                if (result != null) {
                    out.write(result);
                    // The documented contract is a count of code points, not of
                    // UTF-16 chars; the two differ when the key contains
                    // supplementary characters.
                    return Character.codePointCount(subSeq, 0, subSeq.length());
                }
            }
        }
        return 0;
    }
}