1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.rng.examples.stress;
18
19 import java.io.Serializable;
20 import java.util.Comparator;
21
22 /**
23 * Provides number sensitive sorting for character sequences.
24 *
25 * <p>Extracts sub-sequences of either numeric ({@code [0, 9]}) or non-numeric characters
26 * and compares them numerically or lexicographically. Leading zeros are ignored from
27 * numbers. Negative numbers are not supported.
28 *
29 * <pre>
30 * Traditional AlphaNumeric
31 * z0200.html z2.html
32 * z100.html z100.html
33 * z2.html z0200.html
34 * </pre>
35 *
36 * <p>This is based on ideas in the Alphanum algorithm by David Koelle.</p>
37 *
38 * <p>This implementation supports:</p>
39 *
40 * <ul>
41 * <li>{@link CharSequence} comparison
42 * <li>Direct use of input sequences for minimal memory consumption
43 * <li>Numbers with leading zeros
44 * </ul>
45 *
46 * <p>Any null sequences are ordered before non-null sequences.</p>
47 *
48 * <p>Note: The comparator is thread-safe so can be used in a parallel sort.
49 *
50 * @see <a href="http://www.DaveKoelle.com">Alphanum Algorithm</a>
51 */
52 class AlphaNumericComparator implements Comparator<CharSequence>, Serializable {
53 /**
54 * An instance.
55 */
56 public static final AlphaNumericComparator INSTANCE = new AlphaNumericComparator();
57
58 /**
59 * The serial version ID.
60 * Note: Comparators are recommended to be Serializable to allow serialization of
61 * collections constructed with a Comparator.
62 */
63 private static final long serialVersionUID = 1L;
64
65 @Override
66 public int compare(CharSequence seq1, CharSequence seq2) {
67 // Null is less
68 if (seq1 == null) {
69 return -1;
70 }
71 if (seq2 == null) {
72 return 1;
73 }
74 if (seq1.equals(seq2)) {
75 return 0;
76 }
77
78 int pos1 = 0;
79 int pos2 = 0;
80 final int length1 = seq1.length();
81 final int length2 = seq2.length();
82
83 while (pos1 < length1 && pos2 < length2) {
84 final int end1 = nextSubSequenceEnd(seq1, pos1, length1);
85 final int end2 = nextSubSequenceEnd(seq2, pos2, length2);
86
87 // If both sub-sequences contain numeric characters, sort them numerically
88 int result = 0;
89 if (isDigit(seq1.charAt(pos1)) && isDigit(seq2.charAt(pos2))) {
90 result = compareNumerically(seq1, pos1, end1, seq2, pos2, end2);
91 } else {
92 result = compareLexicographically(seq1, pos1, end1, seq2, pos2, end2);
93 }
94
95 if (result != 0) {
96 return result;
97 }
98
99 pos1 = end1;
100 pos2 = end2;
101 }
102
103 return length1 - length2;
104 }
105
106 /**
107 * Get the end position of the next sub-sequence of either digits or non-digit
108 * characters starting from the start position.
109 *
110 * <p>The end position is exclusive such that the sub-sequence is the interval
111 * {@code [start, end)}.
112 *
113 * @param seq the character sequence
114 * @param start the start position
115 * @param length the sequence length
116 * @return the sub-sequence end position (exclusive)
117 */
118 private static int nextSubSequenceEnd(CharSequence seq, int start, int length) {
119 int pos = start;
120 // Set the sub-sequence type (digits or non-digits)
121 final boolean seqType = isDigit(seq.charAt(pos++));
122 while (pos < length && seqType == isDigit(seq.charAt(pos))) {
123 // Extend the sub-sequence
124 pos++;
125 }
126 return pos;
127 }
128
129 /**
130 * Checks if the character is a digit.
131 *
132 * @param ch the character
133 * @return true if a digit
134 */
135 private static boolean isDigit(char ch) {
136 return ch >= 48 && ch <= 57;
137 }
138
139 /**
140 * Compares two sub-sequences numerically. Ignores leading zeros. Assumes all
141 * characters are digits.
142 *
143 * @param seq1 the first sequence
144 * @param start1 the start of the first sub-sequence
145 * @param end1 the end of the first sub-sequence
146 * @param seq2 the second sequence
147 * @param start2 the start of the second sub-sequence
148 * @param end2 the end of the second sub-sequence sequence
149 * @return the value {@code 0} if the sub-sequences are equal; a value less than
150 * {@code 0} if sub-sequence 1 is numerically less than sub-sequence 2; and a value
151 * greater than {@code 0} if sub-sequence 1 is numerically greater than sub-sequence
152 * 2.
153 */
154 private static int compareNumerically(CharSequence seq1, int start1, int end1,
155 CharSequence seq2, int start2, int end2) {
156 // Ignore leading zeros in numbers
157 int pos1 = advancePastLeadingZeros(seq1, start1, end1);
158 int pos2 = advancePastLeadingZeros(seq2, start2, end2);
159
160 // Simple comparison by length
161 final int result = (end1 - pos1) - (end2 - pos2);
162 // If equal, the first different number counts.
163 if (result == 0) {
164 while (pos1 < end1) {
165 final char c1 = seq1.charAt(pos1++);
166 final char c2 = seq2.charAt(pos2++);
167 if (c1 != c2) {
168 return c1 - c2;
169 }
170 }
171 }
172 return result;
173 }
174
175 /**
176 * Advances past leading zeros in the sub-sequence. Returns the index of the start
177 * character of the number.
178 *
179 * @param seq the sequence
180 * @param start the start of the sub-sequence
181 * @param end the end of the sub-sequence
182 * @return the start index of the number
183 */
184 private static int advancePastLeadingZeros(CharSequence seq, int start, int end) {
185 int pos = start;
186 // Ignore zeros only when there are further characters
187 while (pos < end - 1 && seq.charAt(pos) == '0') {
188 pos++;
189 }
190 return pos;
191 }
192
193 /**
194 * Compares two sub-sequences lexicographically. This matches the compare function in
195 * {@link String} using extracted sub-sequences.
196 *
197 * @param seq1 the first sequence
198 * @param start1 the start of the first sub-sequence
199 * @param end1 the end of the first sub-sequence
200 * @param seq2 the second sequence
201 * @param start2 the start of the second sub-sequence
202 * @param end2 the end of the second sub-sequence sequence
203 * @return the value {@code 0} if the sub-sequences are equal; a value less than
204 * {@code 0} if sub-sequence 1 is lexicographically less than sub-sequence 2; and a
205 * value greater than {@code 0} if sub-sequence 1 is lexicographically greater than
206 * sub-sequence 2.
207 * @see String#compareTo(String)
208 */
209 private static int compareLexicographically(CharSequence seq1, int start1, int end1,
210 CharSequence seq2, int start2, int end2) {
211 final int len1 = end1 - start1;
212 final int len2 = end2 - start2;
213 final int limit = Math.min(len1, len2);
214
215 for (int i = 0; i < limit; i++) {
216 final char c1 = seq1.charAt(i + start1);
217 final char c2 = seq2.charAt(i + start2);
218 if (c1 != c2) {
219 return c1 - c2;
220 }
221 }
222 return len1 - len2;
223 }
224 }