001 /* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 022 import com.google.common.annotations.Beta; 023 import com.google.common.annotations.GwtCompatible; 024 import com.google.common.annotations.GwtIncompatible; 025 026 import java.util.Collections; 027 import java.util.Iterator; 028 import java.util.LinkedHashMap; 029 import java.util.Map; 030 import java.util.regex.Matcher; 031 import java.util.regex.Pattern; 032 033 import javax.annotation.CheckReturnValue; 034 035 /** 036 * An object that divides strings (or other instances of {@code CharSequence}) 037 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 038 * which can be expressed as a single character, literal string, regular 039 * expression, {@code CharMatcher}, or by using a fixed substring length. This 040 * class provides the complementary functionality to {@link Joiner}. 041 * 042 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 043 * 044 * Splitter.on(',').split("foo,bar")}</pre> 045 * 046 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 047 * and {@code "bar"}, in that order. 048 * 049 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 050 * 051 * Splitter.on(',').split("foo,,bar, quux")}</pre> 052 * 053 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 054 * Notice that the splitter does not assume that you want empty strings removed, 055 * or that you wish to trim whitespace. If you want features like these, simply 056 * ask for them: <pre> {@code 057 * 058 * private static final Splitter MY_SPLITTER = Splitter.on(',') 059 * .trimResults() 060 * .omitEmptyStrings();}</pre> 061 * 062 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 063 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 064 * the configuration methods are called is never significant; for instance, 065 * trimming is always applied first before checking for an empty result, 066 * regardless of the order in which the {@link #trimResults()} and 067 * {@link #omitEmptyStrings()} methods were invoked. 068 * 069 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 070 * method such as {@code omitEmptyStrings} has no effect on the instance it 071 * is invoked on! You must store and use the new splitter instance returned by 072 * the method. This makes splitters thread-safe, and safe to store as {@code 073 * static final} constants (as illustrated above). <pre> {@code 074 * 075 * // Bad! Do not do this! 076 * Splitter splitter = Splitter.on('/'); 077 * splitter.trimResults(); // does nothing! 078 * return splitter.split("wrong / wrong / wrong");}</pre> 079 * 080 * The separator recognized by the splitter does not have to be a single 081 * literal character as in the examples above. See the methods {@link 082 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 083 * of other ways to specify separators. 084 * 085 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 086 * similar JDK methods; for instance, it does not silently discard trailing 087 * separators, as does {@link String#split(String)}, nor does it have a default 088 * behavior of using five particular whitespace characters as separators, like 089 * {@link java.util.StringTokenizer}. 090 * 091 * @author Julien Silland 092 * @author Jesse Wilson 093 * @author Kevin Bourrillion 094 * @author Louis Wasserman 095 * @since 1.0 096 */ 097 @GwtCompatible(emulated = true) 098 public final class Splitter { 099 private final CharMatcher trimmer; 100 private final boolean omitEmptyStrings; 101 private final Strategy strategy; 102 private final int limit; 103 104 private Splitter(Strategy strategy) { 105 this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE); 106 } 107 108 private Splitter(Strategy strategy, boolean omitEmptyStrings, 109 CharMatcher trimmer, int limit) { 110 this.strategy = strategy; 111 this.omitEmptyStrings = omitEmptyStrings; 112 this.trimmer = trimmer; 113 this.limit = limit; 114 } 115 116 /** 117 * Returns a splitter that uses the given single-character separator. For 118 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 119 * containing {@code ["foo", "", "bar"]}. 120 * 121 * @param separator the character to recognize as a separator 122 * @return a splitter, with default settings, that recognizes that separator 123 */ 124 public static Splitter on(char separator) { 125 return on(CharMatcher.is(separator)); 126 } 127 128 /** 129 * Returns a splitter that considers any single character matched by the 130 * given {@code CharMatcher} to be a separator. For example, {@code 131 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 132 * iterable containing {@code ["foo", "", "bar", "quux"]}. 133 * 134 * @param separatorMatcher a {@link CharMatcher} that determines whether a 135 * character is a separator 136 * @return a splitter, with default settings, that uses this matcher 137 */ 138 public static Splitter on(final CharMatcher separatorMatcher) { 139 checkNotNull(separatorMatcher); 140 141 return new Splitter(new Strategy() { 142 @Override public SplittingIterator iterator( 143 Splitter splitter, final CharSequence toSplit) { 144 return new SplittingIterator(splitter, toSplit) { 145 @Override int separatorStart(int start) { 146 return separatorMatcher.indexIn(toSplit, start); 147 } 148 149 @Override int separatorEnd(int separatorPosition) { 150 return separatorPosition + 1; 151 } 152 }; 153 } 154 }); 155 } 156 157 /** 158 * Returns a splitter that uses the given fixed string as a separator. For 159 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 160 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 161 * 162 * @param separator the literal, nonempty string to recognize as a separator 163 * @return a splitter, with default settings, that recognizes that separator 164 */ 165 public static Splitter on(final String separator) { 166 checkArgument(separator.length() != 0, 167 "The separator may not be the empty string."); 168 169 return new Splitter(new Strategy() { 170 @Override public SplittingIterator iterator( 171 Splitter splitter, CharSequence toSplit) { 172 return new SplittingIterator(splitter, toSplit) { 173 @Override public int separatorStart(int start) { 174 int delimeterLength = separator.length(); 175 176 positions: 177 for (int p = start, last = toSplit.length() - delimeterLength; 178 p <= last; p++) { 179 for (int i = 0; i < delimeterLength; i++) { 180 if (toSplit.charAt(i + p) != separator.charAt(i)) { 181 continue positions; 182 } 183 } 184 return p; 185 } 186 return -1; 187 } 188 189 @Override public int separatorEnd(int separatorPosition) { 190 return separatorPosition + separator.length(); 191 } 192 }; 193 } 194 }); 195 } 196 197 /** 198 * Returns a splitter that considers any subsequence matching {@code 199 * pattern} to be a separator. For example, {@code 200 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 201 * into lines whether it uses DOS-style or UNIX-style line terminators. 202 * 203 * @param separatorPattern the pattern that determines whether a subsequence 204 * is a separator. This pattern may not match the empty string. 205 * @return a splitter, with default settings, that uses this pattern 206 * @throws IllegalArgumentException if {@code separatorPattern} matches the 207 * empty string 208 */ 209 @GwtIncompatible("java.util.regex") 210 public static Splitter on(final Pattern separatorPattern) { 211 checkNotNull(separatorPattern); 212 checkArgument(!separatorPattern.matcher("").matches(), 213 "The pattern may not match the empty string: %s", separatorPattern); 214 215 return new Splitter(new Strategy() { 216 @Override public SplittingIterator iterator( 217 final Splitter splitter, CharSequence toSplit) { 218 final Matcher matcher = separatorPattern.matcher(toSplit); 219 return new SplittingIterator(splitter, toSplit) { 220 @Override public int separatorStart(int start) { 221 return matcher.find(start) ? matcher.start() : -1; 222 } 223 224 @Override public int separatorEnd(int separatorPosition) { 225 return matcher.end(); 226 } 227 }; 228 } 229 }); 230 } 231 232 /** 233 * Returns a splitter that considers any subsequence matching a given 234 * pattern (regular expression) to be a separator. For example, {@code 235 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 236 * whether it uses DOS-style or UNIX-style line terminators. This is 237 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 238 * 239 * @param separatorPattern the pattern that determines whether a subsequence 240 * is a separator. This pattern may not match the empty string. 241 * @return a splitter, with default settings, that uses this pattern 242 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern} 243 * is a malformed expression 244 * @throws IllegalArgumentException if {@code separatorPattern} matches the 245 * empty string 246 */ 247 @GwtIncompatible("java.util.regex") 248 public static Splitter onPattern(String separatorPattern) { 249 return on(Pattern.compile(separatorPattern)); 250 } 251 252 /** 253 * Returns a splitter that divides strings into pieces of the given length. 254 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 255 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 256 * smaller than {@code length} but will never be empty. 257 * 258 * @param length the desired length of pieces after splitting 259 * @return a splitter, with default settings, that can split into fixed sized 260 * pieces 261 */ 262 public static Splitter fixedLength(final int length) { 263 checkArgument(length > 0, "The length may not be less than 1"); 264 265 return new Splitter(new Strategy() { 266 @Override public SplittingIterator iterator( 267 final Splitter splitter, CharSequence toSplit) { 268 return new SplittingIterator(splitter, toSplit) { 269 @Override public int separatorStart(int start) { 270 int nextChunkStart = start + length; 271 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 272 } 273 274 @Override public int separatorEnd(int separatorPosition) { 275 return separatorPosition; 276 } 277 }; 278 } 279 }); 280 } 281 282 /** 283 * Returns a splitter that behaves equivalently to {@code this} splitter, but 284 * automatically omits empty strings from the results. For example, {@code 285 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 286 * iterable containing only {@code ["a", "b", "c"]}. 287 * 288 * <p>If either {@code trimResults} option is also specified when creating a 289 * splitter, that splitter always trims results first before checking for 290 * emptiness. So, for example, {@code 291 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 292 * an empty iterable. 293 * 294 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 295 * to return an empty iterable, but when using this option, it can (if the 296 * input sequence consists of nothing but separators). 297 * 298 * @return a splitter with the desired configuration 299 */ 300 @CheckReturnValue 301 public Splitter omitEmptyStrings() { 302 return new Splitter(strategy, true, trimmer, limit); 303 } 304 305 /** 306 * Returns a splitter that behaves equivalently to {@code this} splitter but 307 * stops splitting after it reaches the limit. 308 * The limit defines the maximum number of items returned by the iterator. 309 * 310 * <p>For example, 311 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable 312 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the 313 * omitted strings do no count. Hence, 314 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")} 315 * returns an iterable containing {@code ["a", "b", "c,d"}. 316 * When trim is requested, all entries, including the last are trimmed. Hence 317 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")} 318 * results in @{code ["a", "b", "c , d"]}. 319 * 320 * @param limit the maximum number of items returns 321 * @return a splitter with the desired configuration 322 * @since 9.0 323 */ 324 @CheckReturnValue 325 public Splitter limit(int limit) { 326 checkArgument(limit > 0, "must be greater than zero: %s", limit); 327 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 328 } 329 330 /** 331 * Returns a splitter that behaves equivalently to {@code this} splitter, but 332 * automatically removes leading and trailing {@linkplain 333 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 334 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 335 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 336 * containing {@code ["a", "b", "c"]}. 337 * 338 * @return a splitter with the desired configuration 339 */ 340 @CheckReturnValue 341 public Splitter trimResults() { 342 return trimResults(CharMatcher.WHITESPACE); 343 } 344 345 /** 346 * Returns a splitter that behaves equivalently to {@code this} splitter, but 347 * removes all leading or trailing characters matching the given {@code 348 * CharMatcher} from each returned substring. For example, {@code 349 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 350 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 351 * 352 * @param trimmer a {@link CharMatcher} that determines whether a character 353 * should be removed from the beginning/end of a subsequence 354 * @return a splitter with the desired configuration 355 */ 356 // TODO(kevinb): throw if a trimmer was already specified! 357 @CheckReturnValue 358 public Splitter trimResults(CharMatcher trimmer) { 359 checkNotNull(trimmer); 360 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 361 } 362 363 /** 364 * Splits {@code sequence} into string components and makes them available 365 * through an {@link Iterator}, which may be lazily evaluated. 366 * 367 * @param sequence the sequence of characters to split 368 * @return an iteration over the segments split from the parameter. 369 */ 370 public Iterable<String> split(final CharSequence sequence) { 371 checkNotNull(sequence); 372 373 return new Iterable<String>() { 374 @Override public Iterator<String> iterator() { 375 return spliterator(sequence); 376 } 377 }; 378 } 379 380 private Iterator<String> spliterator(CharSequence sequence) { 381 return strategy.iterator(this, sequence); 382 } 383 384 /** 385 * Returns a {@code MapSplitter} which splits entries based on this splitter, 386 * and splits entries into keys and values using the specified separator. 387 * 388 * @since 10.0 389 */ 390 @CheckReturnValue 391 @Beta 392 public MapSplitter withKeyValueSeparator(String separator) { 393 return withKeyValueSeparator(on(separator)); 394 } 395 396 /** 397 * Returns a {@code MapSplitter} which splits entries based on this splitter, 398 * and splits entries into keys and values using the specified key-value 399 * splitter. 400 * 401 * @since 10.0 402 */ 403 @CheckReturnValue 404 @Beta 405 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) { 406 return new MapSplitter(this, keyValueSplitter); 407 } 408 409 /** 410 * An object that splits strings into maps as {@code Splitter} splits 411 * iterables and lists. Like {@code Splitter}, it is thread-safe and 412 * immutable. 413 * 414 * @since 10.0 415 */ 416 @Beta 417 public static final class MapSplitter { 418 private static final String INVALID_ENTRY_MESSAGE = 419 "Chunk [%s] is not a valid entry"; 420 private final Splitter outerSplitter; 421 private final Splitter entrySplitter; 422 423 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) { 424 this.outerSplitter = outerSplitter; // only "this" is passed 425 this.entrySplitter = checkNotNull(entrySplitter); 426 } 427 428 /** 429 * Splits {@code sequence} into substrings, splits each substring into 430 * an entry, and returns an unmodifiable map with each of the entries. For 431 * example, <code> 432 * Splitter.on(';').trimResults().withKeyValueSeparator("=>") 433 * .split("a=>b ; c=>b") 434 * </code> will return a mapping from {@code "a"} to {@code "b"} and 435 * {@code "c"} to {@code b}. 436 * 437 * <p>The returned map preserves the order of the entries from 438 * {@code sequence}. 439 * 440 * @throws IllegalArgumentException if the specified sequence does not split 441 * into valid map entries, or if there are duplicate keys 442 */ 443 public Map<String, String> split(CharSequence sequence) { 444 Map<String, String> map = new LinkedHashMap<String, String>(); 445 for (String entry : outerSplitter.split(sequence)) { 446 Iterator<String> entryFields = entrySplitter.spliterator(entry); 447 448 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 449 String key = entryFields.next(); 450 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key); 451 452 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 453 String value = entryFields.next(); 454 map.put(key, value); 455 456 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 457 } 458 return Collections.unmodifiableMap(map); 459 } 460 } 461 462 private interface Strategy { 463 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 464 } 465 466 private abstract static class SplittingIterator 467 extends AbstractIterator<String> { 468 final CharSequence toSplit; 469 final CharMatcher trimmer; 470 final boolean omitEmptyStrings; 471 472 /** 473 * Returns the first index in {@code toSplit} at or after {@code start} 474 * that contains the separator. 475 */ 476 abstract int separatorStart(int start); 477 478 /** 479 * Returns the first index in {@code toSplit} after {@code 480 * separatorPosition} that does not contain a separator. This method is only 481 * invoked after a call to {@code separatorStart}. 482 */ 483 abstract int separatorEnd(int separatorPosition); 484 485 int offset = 0; 486 int limit; 487 488 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 489 this.trimmer = splitter.trimmer; 490 this.omitEmptyStrings = splitter.omitEmptyStrings; 491 this.limit = splitter.limit; 492 this.toSplit = toSplit; 493 } 494 495 @Override protected String computeNext() { 496 while (offset != -1) { 497 int start = offset; 498 int end; 499 500 int separatorPosition = separatorStart(offset); 501 if (separatorPosition == -1) { 502 end = toSplit.length(); 503 offset = -1; 504 } else { 505 end = separatorPosition; 506 offset = separatorEnd(separatorPosition); 507 } 508 509 while (start < end && trimmer.matches(toSplit.charAt(start))) { 510 start++; 511 } 512 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 513 end--; 514 } 515 516 if (omitEmptyStrings && start == end) { 517 continue; 518 } 519 520 if (limit == 1) { 521 // The limit has been reached, return the rest of the string as the 522 // final item. This is tested after empty string removal so that 523 // empty strings do not count towards the limit. 524 end = toSplit.length(); 525 offset = -1; 526 // Since we may have changed the end, we need to trim it again. 527 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 528 end--; 529 } 530 } else { 531 limit--; 532 } 533 534 return toSplit.subSequence(start, end).toString(); 535 } 536 return endOfData(); 537 } 538 } 539 }