001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3;
018
019/**
020 * <p>Operations on {@link CharSequence} that are
021 * {@code null} safe.</p>
022 *
023 * @see CharSequence
024 * @since 3.0
025 */
026public class CharSequenceUtils {
027
028    private static final int NOT_FOUND = -1;
029
030    /**
031     * <p>{@code CharSequenceUtils} instances should NOT be constructed in
032     * standard programming. </p>
033     *
034     * <p>This constructor is public to permit tools that require a JavaBean
035     * instance to operate.</p>
036     */
037    public CharSequenceUtils() {
038        super();
039    }
040
041    //-----------------------------------------------------------------------
042    /**
043     * <p>Returns a new {@code CharSequence} that is a subsequence of this
044     * sequence starting with the {@code char} value at the specified index.</p>
045     *
046     * <p>This provides the {@code CharSequence} equivalent to {@link String#substring(int)}.
047     * The length (in {@code char}) of the returned sequence is {@code length() - start},
048     * so if {@code start == end} then an empty sequence is returned.</p>
049     *
050     * @param cs  the specified subsequence, null returns null
051     * @param start  the start index, inclusive, valid
052     * @return a new subsequence, may be null
053     * @throws IndexOutOfBoundsException if {@code start} is negative or if
054     *  {@code start} is greater than {@code length()}
055     */
056    public static CharSequence subSequence(final CharSequence cs, final int start) {
057        return cs == null ? null : cs.subSequence(start, cs.length());
058    }
059
060    //-----------------------------------------------------------------------
061    /**
062     * Returns the index within {@code cs} of the first occurrence of the
063     * specified character, starting the search at the specified index.
064     * <p>
065     * If a character with value {@code searchChar} occurs in the
066     * character sequence represented by the {@code cs}
067     * object at an index no smaller than {@code start}, then
068     * the index of the first such occurrence is returned. For values
069     * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
070     * this is the smallest value <i>k</i> such that:
071     * <blockquote><pre>
072     * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
073     * </pre></blockquote>
074     * is true. For other values of {@code searchChar}, it is the
075     * smallest value <i>k</i> such that:
076     * <blockquote><pre>
077     * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
078     * </pre></blockquote>
079     * is true. In either case, if no such character occurs inm {@code cs}
080     * at or after position {@code start}, then
081     * {@code -1} is returned.
082     *
083     * <p>
084     * There is no restriction on the value of {@code start}. If it
085     * is negative, it has the same effect as if it were zero: the entire
086     * {@code CharSequence} may be searched. If it is greater than
087     * the length of {@code cs}, it has the same effect as if it were
088     * equal to the length of {@code cs}: {@code -1} is returned.
089     *
090     * <p>All indices are specified in {@code char} values
091     * (Unicode code units).
092     *
093     * @param cs  the {@code CharSequence} to be processed, not null
094     * @param searchChar  the char to be searched for
095     * @param start  the start index, negative starts at the string start
096     * @return the index where the search char was found, -1 if not found
097     * @since 3.6 updated to behave more like {@code String}
098     */
099    static int indexOf(final CharSequence cs, final int searchChar, int start) {
100        if (cs instanceof String) {
101            return ((String) cs).indexOf(searchChar, start);
102        }
103        final int sz = cs.length();
104        if (start < 0) {
105            start = 0;
106        }
107        if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
108            for (int i = start; i < sz; i++) {
109                if (cs.charAt(i) == searchChar) {
110                    return i;
111                }
112            }
113        }
114        //supplementary characters (LANG1300)
115        if (searchChar <= Character.MAX_CODE_POINT) {
116            final char[] chars = Character.toChars(searchChar);
117            for (int i = start; i < sz - 1; i++) {
118                final char high = cs.charAt(i);
119                final char low = cs.charAt(i + 1);
120                if (high == chars[0] && low == chars[1]) {
121                    return i;
122                }
123            }
124        }
125        return NOT_FOUND;
126    }
127
128    /**
129     * Used by the indexOf(CharSequence methods) as a green implementation of indexOf.
130     *
131     * @param cs the {@code CharSequence} to be processed
132     * @param searchChar the {@code CharSequence} to be searched for
133     * @param start the start index
134     * @return the index where the search sequence was found
135     */
136    static int indexOf(final CharSequence cs, final CharSequence searchChar, final int start) {
137        if (cs instanceof String) {
138            return ((String) cs).indexOf(searchChar.toString(), start);
139        } else if (cs instanceof StringBuilder) {
140            return ((StringBuilder) cs).indexOf(searchChar.toString(), start);
141        } else if (cs instanceof StringBuffer) {
142            return ((StringBuffer) cs).indexOf(searchChar.toString(), start);
143        }
144        return cs.toString().indexOf(searchChar.toString(), start);
145//        if (cs instanceof String && searchChar instanceof String) {
146//            // TODO: Do we assume searchChar is usually relatively small;
147//            //       If so then calling toString() on it is better than reverting to
148//            //       the green implementation in the else block
149//            return ((String) cs).indexOf((String) searchChar, start);
150//        } else {
151//            // TODO: Implement rather than convert to String
152//            return cs.toString().indexOf(searchChar.toString(), start);
153//        }
154    }
155
156    /**
157     * Returns the index within {@code cs} of the last occurrence of
158     * the specified character, searching backward starting at the
159     * specified index. For values of {@code searchChar} in the range
160     * from 0 to 0xFFFF (inclusive), the index returned is the largest
161     * value <i>k</i> such that:
162     * <blockquote><pre>
163     * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &lt;= start)
164     * </pre></blockquote>
165     * is true. For other values of {@code searchChar}, it is the
166     * largest value <i>k</i> such that:
167     * <blockquote><pre>
168     * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &lt;= start)
169     * </pre></blockquote>
170     * is true. In either case, if no such character occurs in {@code cs}
171     * at or before position {@code start}, then {@code -1} is returned.
172     *
173     * <p>All indices are specified in {@code char} values
174     * (Unicode code units).
175     *
176     * @param cs  the {@code CharSequence} to be processed
177     * @param searchChar  the char to be searched for
178     * @param start  the start index, negative returns -1, beyond length starts at end
179     * @return the index where the search char was found, -1 if not found
180     * @since 3.6 updated to behave more like {@code String}
181     */
182    static int lastIndexOf(final CharSequence cs, final int searchChar, int start) {
183        if (cs instanceof String) {
184            return ((String) cs).lastIndexOf(searchChar, start);
185        }
186        final int sz = cs.length();
187        if (start < 0) {
188            return NOT_FOUND;
189        }
190        if (start >= sz) {
191            start = sz - 1;
192        }
193        if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
194            for (int i = start; i >= 0; --i) {
195                if (cs.charAt(i) == searchChar) {
196                    return i;
197                }
198            }
199        }
200        //supplementary characters (LANG1300)
201        //NOTE - we must do a forward traversal for this to avoid duplicating code points
202        if (searchChar <= Character.MAX_CODE_POINT) {
203            final char[] chars = Character.toChars(searchChar);
204            //make sure it's not the last index
205            if (start == sz - 1) {
206                return NOT_FOUND;
207            }
208            for (int i = start; i >= 0; i--) {
209                final char high = cs.charAt(i);
210                final char low = cs.charAt(i + 1);
211                if (chars[0] == high && chars[1] == low) {
212                    return i;
213                }
214            }
215        }
216        return NOT_FOUND;
217    }
218
219    static final int TO_STRING_LIMIT = 16;
220
221    /**
222     * Used by the lastIndexOf(CharSequence methods) as a green implementation of lastIndexOf
223     *
224     * @param cs the {@code CharSequence} to be processed
225     * @param searchChar the {@code CharSequence} to be searched for
226     * @param start the start index
227     * @return the index where the search sequence was found
228     */
229    static int lastIndexOf(final CharSequence cs, final CharSequence searchChar, int start) {
230        if (searchChar instanceof String) {
231            if (cs instanceof String) {
232                return ((String) cs).lastIndexOf((String) searchChar, start);
233            } else if (cs instanceof StringBuilder) {
234                return ((StringBuilder) cs).lastIndexOf((String) searchChar, start);
235            } else if (cs instanceof StringBuffer) {
236                return ((StringBuffer) cs).lastIndexOf((String) searchChar, start);
237            }
238        }
239
240        final int len1 = cs.length();
241        final int len2 = searchChar.length();
242
243        if (start > len1) {
244            start = len1;
245        }
246
247        if (start < 0 || len2 < 0 || len2 > len1) {
248            return -1;
249        }
250
251        if (len2 == 0) {
252            return start;
253        }
254
255        if (len2 <= TO_STRING_LIMIT) {
256            if (cs instanceof String) {
257                return ((String) cs).lastIndexOf(searchChar.toString(), start);
258            } else if (cs instanceof StringBuilder) {
259                return ((StringBuilder) cs).lastIndexOf(searchChar.toString(), start);
260            } else if (cs instanceof StringBuffer) {
261                return ((StringBuffer) cs).lastIndexOf(searchChar.toString(), start);
262            }
263        }
264
265        if (start + len2 > len1) {
266            start = len1 - len2;
267        }
268
269        final char char0 = searchChar.charAt(0);
270
271        int i = start;
272        while (true) {
273            while (cs.charAt(i) != char0) {
274                i--;
275                if (i < 0) {
276                    return -1;
277                }
278            }
279            if (checkLaterThan1(cs, searchChar, len2, i)) {
280                return i;
281            }
282            i--;
283            if (i < 0) {
284                return -1;
285            }
286        }
287    }
288
289    private static boolean checkLaterThan1(final CharSequence cs, final CharSequence searchChar, final int len2, final int start1) {
290        for (int i = 1, j = len2 - 1; i <= j; i++, j--) {
291            if (cs.charAt(start1 + i) != searchChar.charAt(i)
292                    ||
293                    cs.charAt(start1 + j) != searchChar.charAt(j)
294            ) {
295                return false;
296            }
297        }
298        return true;
299    }
300
301    /**
302     * Converts the given CharSequence to a char[].
303     *
304     * @param source the {@code CharSequence} to be processed.
305     * @return the resulting char array, never null.
306     * @since 3.11
307     */
308    public static char[] toCharArray(final CharSequence source) {
309        final int len = StringUtils.length(source);
310        if (len == 0) {
311            return ArrayUtils.EMPTY_CHAR_ARRAY;
312        }
313        if (source instanceof String) {
314            return ((String) source).toCharArray();
315        }
316        final char[] array = new char[len];
317        for (int i = 0; i < len; i++) {
318            array[i] = source.charAt(i);
319        }
320        return array;
321    }
322
323    /**
324     * Green implementation of regionMatches.
325     *
326     * @param cs the {@code CharSequence} to be processed
327     * @param ignoreCase whether or not to be case insensitive
328     * @param thisStart the index to start on the {@code cs} CharSequence
329     * @param substring the {@code CharSequence} to be looked for
330     * @param start the index to start on the {@code substring} CharSequence
331     * @param length character length of the region
332     * @return whether the region matched
333     */
334    static boolean regionMatches(final CharSequence cs, final boolean ignoreCase, final int thisStart,
335            final CharSequence substring, final int start, final int length)    {
336        if (cs instanceof String && substring instanceof String) {
337            return ((String) cs).regionMatches(ignoreCase, thisStart, (String) substring, start, length);
338        }
339        int index1 = thisStart;
340        int index2 = start;
341        int tmpLen = length;
342
343        // Extract these first so we detect NPEs the same as the java.lang.String version
344        final int srcLen = cs.length() - thisStart;
345        final int otherLen = substring.length() - start;
346
347        // Check for invalid parameters
348        if (thisStart < 0 || start < 0 || length < 0) {
349            return false;
350        }
351
352        // Check that the regions are long enough
353        if (srcLen < length || otherLen < length) {
354            return false;
355        }
356
357        while (tmpLen-- > 0) {
358            final char c1 = cs.charAt(index1++);
359            final char c2 = substring.charAt(index2++);
360
361            if (c1 == c2) {
362                continue;
363            }
364
365            if (!ignoreCase) {
366                return false;
367            }
368
369            // The real same check as in String.regionMatches():
370            final char u1 = Character.toUpperCase(c1);
371            final char u2 = Character.toUpperCase(c2);
372            if (u1 != u2 && Character.toLowerCase(u1) != Character.toLowerCase(u2)) {
373                return false;
374            }
375        }
376
377        return true;
378    }
379}