View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3;
18  
19  /**
20   * Operations on {@link CharSequence} that are
21   * {@code null} safe.
22   *
23   * @see CharSequence
24   * @since 3.0
25   */
26  public class CharSequenceUtils {
27  
28      private static final int NOT_FOUND = -1;
29  
30      static final int TO_STRING_LIMIT = 16;
31  
32      private static boolean checkLaterThan1(final CharSequence cs, final CharSequence searchChar, final int len2, final int start1) {
33          for (int i = 1, j = len2 - 1; i <= j; i++, j--) {
34              if (cs.charAt(start1 + i) != searchChar.charAt(i) || cs.charAt(start1 + j) != searchChar.charAt(j)) {
35                  return false;
36              }
37          }
38          return true;
39      }
40  
41      /**
42       * Used by the indexOf(CharSequence methods) as a green implementation of indexOf.
43       *
44       * @param cs the {@link CharSequence} to be processed
45       * @param searchChar the {@link CharSequence} to be searched for
46       * @param start the start index
47       * @return the index where the search sequence was found
48       */
49      static int indexOf(final CharSequence cs, final CharSequence searchChar, final int start) {
50          if (cs instanceof String) {
51              return ((String) cs).indexOf(searchChar.toString(), start);
52          }
53          if (cs instanceof StringBuilder) {
54              return ((StringBuilder) cs).indexOf(searchChar.toString(), start);
55          }
56          if (cs instanceof StringBuffer) {
57              return ((StringBuffer) cs).indexOf(searchChar.toString(), start);
58          }
59          return cs.toString().indexOf(searchChar.toString(), start);
60  //        if (cs instanceof String && searchChar instanceof String) {
61  //            // TODO: Do we assume searchChar is usually relatively small;
62  //            //       If so then calling toString() on it is better than reverting to
63  //            //       the green implementation in the else block
64  //            return ((String) cs).indexOf((String) searchChar, start);
65  //        } else {
66  //            // TODO: Implement rather than convert to String
67  //            return cs.toString().indexOf(searchChar.toString(), start);
68  //        }
69      }
70  
71      /**
72       * Returns the index within {@code cs} of the first occurrence of the
73       * specified character, starting the search at the specified index.
74       * <p>
75       * If a character with value {@code searchChar} occurs in the
76       * character sequence represented by the {@code cs}
77       * object at an index no smaller than {@code start}, then
78       * the index of the first such occurrence is returned. For values
79       * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
80       * this is the smallest value <i>k</i> such that:
81       * </p>
82       * <blockquote><pre>
83       * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
84       * </pre></blockquote>
85       * is true. For other values of {@code searchChar}, it is the
86       * smallest value <i>k</i> such that:
87       * <blockquote><pre>
88       * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
89       * </pre></blockquote>
90       * <p>
91       * is true. In either case, if no such character occurs inm {@code cs}
92       * at or after position {@code start}, then
93       * {@code -1} is returned.
94       * </p>
95       * <p>
96       * There is no restriction on the value of {@code start}. If it
97       * is negative, it has the same effect as if it were zero: the entire
98       * {@link CharSequence} may be searched. If it is greater than
99       * the length of {@code cs}, it has the same effect as if it were
100      * equal to the length of {@code cs}: {@code -1} is returned.
101      * </p>
102      * <p>All indices are specified in {@code char} values
103      * (Unicode code units).
104      * </p>
105      *
106      * @param cs  the {@link CharSequence} to be processed, not null
107      * @param searchChar  the char to be searched for
108      * @param start  the start index, negative starts at the string start
109      * @return the index where the search char was found, -1 if not found
110      * @since 3.6 updated to behave more like {@link String}
111      */
112     static int indexOf(final CharSequence cs, final int searchChar, int start) {
113         if (cs instanceof String) {
114             return ((String) cs).indexOf(searchChar, start);
115         }
116         final int sz = cs.length();
117         if (start < 0) {
118             start = 0;
119         }
120         if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
121             for (int i = start; i < sz; i++) {
122                 if (cs.charAt(i) == searchChar) {
123                     return i;
124                 }
125             }
126             return NOT_FOUND;
127         }
128         //supplementary characters (LANG1300)
129         if (searchChar <= Character.MAX_CODE_POINT) {
130             final char[] chars = Character.toChars(searchChar);
131             for (int i = start; i < sz - 1; i++) {
132                 final char high = cs.charAt(i);
133                 final char low = cs.charAt(i + 1);
134                 if (high == chars[0] && low == chars[1]) {
135                     return i;
136                 }
137             }
138         }
139         return NOT_FOUND;
140     }
141 
142     /**
143      * Used by the lastIndexOf(CharSequence methods) as a green implementation of lastIndexOf
144      *
145      * @param cs the {@link CharSequence} to be processed
146      * @param searchChar the {@link CharSequence} to find
147      * @param start the start index
148      * @return the index where the search sequence was found
149      */
150     static int lastIndexOf(final CharSequence cs, final CharSequence searchChar, int start) {
151         if (searchChar == null || cs == null) {
152             return NOT_FOUND;
153         }
154         if (searchChar instanceof String) {
155             if (cs instanceof String) {
156                 return ((String) cs).lastIndexOf((String) searchChar, start);
157             }
158             if (cs instanceof StringBuilder) {
159                 return ((StringBuilder) cs).lastIndexOf((String) searchChar, start);
160             }
161             if (cs instanceof StringBuffer) {
162                 return ((StringBuffer) cs).lastIndexOf((String) searchChar, start);
163             }
164         }
165 
166         final int len1 = cs.length();
167         final int len2 = searchChar.length();
168 
169         if (start > len1) {
170             start = len1;
171         }
172 
173         if (start < 0 || len2 > len1) {
174             return NOT_FOUND;
175         }
176 
177         if (len2 == 0) {
178             return start;
179         }
180 
181         if (len2 <= TO_STRING_LIMIT) {
182             if (cs instanceof String) {
183                 return ((String) cs).lastIndexOf(searchChar.toString(), start);
184             }
185             if (cs instanceof StringBuilder) {
186                 return ((StringBuilder) cs).lastIndexOf(searchChar.toString(), start);
187             }
188             if (cs instanceof StringBuffer) {
189                 return ((StringBuffer) cs).lastIndexOf(searchChar.toString(), start);
190             }
191         }
192 
193         if (start + len2 > len1) {
194             start = len1 - len2;
195         }
196 
197         final char char0 = searchChar.charAt(0);
198 
199         int i = start;
200         while (true) {
201             while (cs.charAt(i) != char0) {
202                 i--;
203                 if (i < 0) {
204                     return NOT_FOUND;
205                 }
206             }
207             if (checkLaterThan1(cs, searchChar, len2, i)) {
208                 return i;
209             }
210             i--;
211             if (i < 0) {
212                 return NOT_FOUND;
213             }
214         }
215     }
216 
217     /**
218      * Returns the index within {@code cs} of the last occurrence of
219      * the specified character, searching backward starting at the
220      * specified index. For values of {@code searchChar} in the range
221      * from 0 to 0xFFFF (inclusive), the index returned is the largest
222      * value <i>k</i> such that:
223      * <blockquote><pre>
224      * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &lt;= start)
225      * </pre></blockquote>
226      * is true. For other values of {@code searchChar}, it is the
227      * largest value <i>k</i> such that:
228      * <blockquote><pre>
229      * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &lt;= start)
230      * </pre></blockquote>
231      * is true. In either case, if no such character occurs in {@code cs}
232      * at or before position {@code start}, then {@code -1} is returned.
233      *
234      * <p>
235      * All indices are specified in {@code char} values
236      * (Unicode code units).
237      * </p>
238      *
239      * @param cs  the {@link CharSequence} to be processed
240      * @param searchChar  the char to be searched for
241      * @param start  the start index, negative returns -1, beyond length starts at end
242      * @return the index where the search char was found, -1 if not found
243      * @since 3.6 updated to behave more like {@link String}
244      */
245     static int lastIndexOf(final CharSequence cs, final int searchChar, int start) {
246         if (cs instanceof String) {
247             return ((String) cs).lastIndexOf(searchChar, start);
248         }
249         final int sz = cs.length();
250         if (start < 0) {
251             return NOT_FOUND;
252         }
253         if (start >= sz) {
254             start = sz - 1;
255         }
256         if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
257             for (int i = start; i >= 0; --i) {
258                 if (cs.charAt(i) == searchChar) {
259                     return i;
260                 }
261             }
262             return NOT_FOUND;
263         }
264         //supplementary characters (LANG1300)
265         //NOTE - we must do a forward traversal for this to avoid duplicating code points
266         if (searchChar <= Character.MAX_CODE_POINT) {
267             final char[] chars = Character.toChars(searchChar);
268             //make sure it's not the last index
269             if (start == sz - 1) {
270                 return NOT_FOUND;
271             }
272             for (int i = start; i >= 0; i--) {
273                 final char high = cs.charAt(i);
274                 final char low = cs.charAt(i + 1);
275                 if (chars[0] == high && chars[1] == low) {
276                     return i;
277                 }
278             }
279         }
280         return NOT_FOUND;
281     }
282 
283     /**
284      * Green implementation of regionMatches.
285      *
286      * @param cs the {@link CharSequence} to be processed
287      * @param ignoreCase whether or not to be case-insensitive
288      * @param thisStart the index to start on the {@code cs} CharSequence
289      * @param substring the {@link CharSequence} to be looked for
290      * @param start the index to start on the {@code substring} CharSequence
291      * @param length character length of the region
292      * @return whether the region matched
293      */
294     static boolean regionMatches(final CharSequence cs, final boolean ignoreCase, final int thisStart,
295             final CharSequence substring, final int start, final int length)    {
296         if (cs instanceof String && substring instanceof String) {
297             return ((String) cs).regionMatches(ignoreCase, thisStart, (String) substring, start, length);
298         }
299         int index1 = thisStart;
300         int index2 = start;
301         int tmpLen = length;
302 
303         // Extract these first so we detect NPEs the same as the java.lang.String version
304         final int srcLen = cs.length() - thisStart;
305         final int otherLen = substring.length() - start;
306 
307         // Check for invalid parameters
308         if (thisStart < 0 || start < 0 || length < 0) {
309             return false;
310         }
311 
312         // Check that the regions are long enough
313         if (srcLen < length || otherLen < length) {
314             return false;
315         }
316 
317         while (tmpLen-- > 0) {
318             final char c1 = cs.charAt(index1++);
319             final char c2 = substring.charAt(index2++);
320 
321             if (c1 == c2) {
322                 continue;
323             }
324 
325             if (!ignoreCase) {
326                 return false;
327             }
328 
329             // The real same check as in String.regionMatches():
330             final char u1 = Character.toUpperCase(c1);
331             final char u2 = Character.toUpperCase(c2);
332             if (u1 != u2 && Character.toLowerCase(u1) != Character.toLowerCase(u2)) {
333                 return false;
334             }
335         }
336 
337         return true;
338     }
339 
340     /**
341      * Returns a new {@link CharSequence} that is a subsequence of this
342      * sequence starting with the {@code char} value at the specified index.
343      *
344      * <p>This provides the {@link CharSequence} equivalent to {@link String#substring(int)}.
345      * The length (in {@code char}) of the returned sequence is {@code length() - start},
346      * so if {@code start == end} then an empty sequence is returned.</p>
347      *
348      * @param cs  the specified subsequence, null returns null
349      * @param start  the start index, inclusive, valid
350      * @return a new subsequence, may be null
351      * @throws IndexOutOfBoundsException if {@code start} is negative or if
352      *  {@code start} is greater than {@code length()}
353      */
354     public static CharSequence subSequence(final CharSequence cs, final int start) {
355         return cs == null ? null : cs.subSequence(start, cs.length());
356     }
357 
358     /**
359      * Converts the given CharSequence to a char[].
360      *
361      * @param source the {@link CharSequence} to be processed.
362      * @return the resulting char array, never null.
363      * @since 3.11
364      */
365     public static char[] toCharArray(final CharSequence source) {
366         final int len = StringUtils.length(source);
367         if (len == 0) {
368             return ArrayUtils.EMPTY_CHAR_ARRAY;
369         }
370         if (source instanceof String) {
371             return ((String) source).toCharArray();
372         }
373         final char[] array = new char[len];
374         for (int i = 0; i < len; i++) {
375             array[i] = source.charAt(i);
376         }
377         return array;
378     }
379 
380     /**
381      * {@link CharSequenceUtils} instances should NOT be constructed in
382      * standard programming.
383      *
384      * <p>This constructor is public to permit tools that require a JavaBean
385      * instance to operate.</p>
386      */
387     public CharSequenceUtils() {
388     }
389 }