001 /* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 022 import com.google.common.annotations.Beta; 023 import com.google.common.annotations.GwtCompatible; 024 025 import java.util.ArrayList; 026 import java.util.Arrays; 027 import java.util.List; 028 029 import javax.annotation.CheckReturnValue; 030 031 /** 032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 033 * for any {@link Object}. Also offers basic text processing methods based on this function. 034 * Implementations are strongly encouraged to be side-effect-free and immutable. 035 * 036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 038 * 039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 042 * treats these just as two separate characters. 043 * 044 * <p>Example usages: <pre> 045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 047 * 048 * @author Kevin Bourrillion 049 * @since 1.0 050 */ 051 @Beta // Possibly change from chars to code points; decide constants vs. methods 052 @GwtCompatible 053 public abstract class CharMatcher implements Predicate<Character> { 054 // Constants 055 056 // Excludes 2000-2000a, which is handled as a range 057 private static final String BREAKING_WHITESPACE_CHARS = 058 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000"; 059 060 // Excludes 2007, which is handled as a gap in a pair of ranges 061 private static final String NON_BREAKING_WHITESPACE_CHARS = 062 "\u00a0\u180e\u202f"; 063 064 /** 065 * Determines whether a character is whitespace according to the latest Unicode standard, as 066 * illustrated 067 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 068 * This is not the same definition used by other Java APIs. (See a 069 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 070 * definitions of "whitespace"</a>.) 071 * 072 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 073 * to date. 074 */ 075 public static final CharMatcher WHITESPACE = 076 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS) 077 .or(inRange('\u2000', '\u200a')) 078 .precomputed(); 079 080 /** 081 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 082 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 083 * discussion of that term. 084 * 085 * @since 2.0 086 */ 087 public static final CharMatcher BREAKING_WHITESPACE = 088 anyOf(BREAKING_WHITESPACE_CHARS) 089 .or(inRange('\u2000', '\u2006')) 090 .or(inRange('\u2008', '\u200a')) 091 .precomputed(); 092 093 /** 094 * Determines whether a character is ASCII, meaning that its code point is less than 128. 095 */ 096 public static final CharMatcher ASCII = inRange('\0', '\u007f'); 097 098 /** 099 * Determines whether a character is a digit according to 100 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 101 */ 102 public static final CharMatcher DIGIT; 103 104 static { 105 CharMatcher digit = inRange('0', '9'); 106 String zeroes = 107 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 108 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 109 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 110 for (char base : zeroes.toCharArray()) { 111 digit = digit.or(inRange(base, (char) (base + 9))); 112 } 113 DIGIT = digit.precomputed(); 114 } 115 116 /** 117 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 118 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 119 */ 120 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 121 @Override public boolean matches(char c) { 122 return Character.isDigit(c); 123 } 124 }; 125 126 /** 127 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 128 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 129 * inRange('a', 'z').or(inRange('A', 'Z'))}. 130 */ 131 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 132 @Override public boolean matches(char c) { 133 return Character.isLetter(c); 134 } 135 }; 136 137 /** 138 * Determines whether a character is a letter or digit according to {@link 139 * Character#isLetterOrDigit(char) Java's definition}. 140 */ 141 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 142 @Override public boolean matches(char c) { 143 return Character.isLetterOrDigit(c); 144 } 145 }; 146 147 /** 148 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 149 * Java's definition}. 150 */ 151 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 152 @Override public boolean matches(char c) { 153 return Character.isUpperCase(c); 154 } 155 }; 156 157 /** 158 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 159 * Java's definition}. 160 */ 161 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 162 @Override public boolean matches(char c) { 163 return Character.isLowerCase(c); 164 } 165 }; 166 167 /** 168 * Determines whether a character is an ISO control character as specified by {@link 169 * Character#isISOControl(char)}. 170 */ 171 public static final CharMatcher JAVA_ISO_CONTROL = 172 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f')); 173 174 /** 175 * Determines whether a character is invisible; that is, if its Unicode category is any of 176 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 177 * PRIVATE_USE according to ICU4J. 178 */ 179 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 180 .or(inRange('\u007f', '\u00a0')) 181 .or(is('\u00ad')) 182 .or(inRange('\u0600', '\u0603')) 183 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 184 .or(inRange('\u2000', '\u200f')) 185 .or(inRange('\u2028', '\u202f')) 186 .or(inRange('\u205f', '\u2064')) 187 .or(inRange('\u206a', '\u206f')) 188 .or(is('\u3000')) 189 .or(inRange('\ud800', '\uf8ff')) 190 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")) 191 .precomputed(); 192 193 /** 194 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 195 * errs on the side of returning {@code false} (that is, it tends to assume a character is 196 * double-width). 197 * 198 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 199 * date. 200 */ 201 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 202 .or(is('\u05be')) 203 .or(inRange('\u05d0', '\u05ea')) 204 .or(is('\u05f3')) 205 .or(is('\u05f4')) 206 .or(inRange('\u0600', '\u06ff')) 207 .or(inRange('\u0750', '\u077f')) 208 .or(inRange('\u0e00', '\u0e7f')) 209 .or(inRange('\u1e00', '\u20af')) 210 .or(inRange('\u2100', '\u213a')) 211 .or(inRange('\ufb50', '\ufdff')) 212 .or(inRange('\ufe70', '\ufeff')) 213 .or(inRange('\uff61', '\uffdc')) 214 .precomputed(); 215 216 /** Matches any character. */ 217 public static final CharMatcher ANY = 218 new CharMatcher() { 219 @Override public boolean matches(char c) { 220 return true; 221 } 222 223 @Override public int indexIn(CharSequence sequence) { 224 return (sequence.length() == 0) ? -1 : 0; 225 } 226 227 @Override public int indexIn(CharSequence sequence, int start) { 228 int length = sequence.length(); 229 Preconditions.checkPositionIndex(start, length); 230 return (start == length) ? -1 : start; 231 } 232 233 @Override public int lastIndexIn(CharSequence sequence) { 234 return sequence.length() - 1; 235 } 236 237 @Override public boolean matchesAllOf(CharSequence sequence) { 238 checkNotNull(sequence); 239 return true; 240 } 241 242 @Override public boolean matchesNoneOf(CharSequence sequence) { 243 return sequence.length() == 0; 244 } 245 246 @Override public String removeFrom(CharSequence sequence) { 247 checkNotNull(sequence); 248 return ""; 249 } 250 251 @Override public String replaceFrom(CharSequence sequence, char replacement) { 252 char[] array = new char[sequence.length()]; 253 Arrays.fill(array, replacement); 254 return new String(array); 255 } 256 257 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 258 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 259 for (int i = 0; i < sequence.length(); i++) { 260 retval.append(replacement); 261 } 262 return retval.toString(); 263 } 264 265 @Override public String collapseFrom(CharSequence sequence, char replacement) { 266 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 267 } 268 269 @Override public String trimFrom(CharSequence sequence) { 270 checkNotNull(sequence); 271 return ""; 272 } 273 274 @Override public int countIn(CharSequence sequence) { 275 return sequence.length(); 276 } 277 278 @Override public CharMatcher and(CharMatcher other) { 279 return checkNotNull(other); 280 } 281 282 @Override public CharMatcher or(CharMatcher other) { 283 checkNotNull(other); 284 return this; 285 } 286 287 @Override public CharMatcher negate() { 288 return NONE; 289 } 290 291 @Override public CharMatcher precomputed() { 292 return this; 293 } 294 }; 295 296 /** Matches no characters. */ 297 public static final CharMatcher NONE = 298 new CharMatcher() { 299 @Override public boolean matches(char c) { 300 return false; 301 } 302 303 @Override public int indexIn(CharSequence sequence) { 304 checkNotNull(sequence); 305 return -1; 306 } 307 308 @Override public int indexIn(CharSequence sequence, int start) { 309 int length = sequence.length(); 310 Preconditions.checkPositionIndex(start, length); 311 return -1; 312 } 313 314 @Override public int lastIndexIn(CharSequence sequence) { 315 checkNotNull(sequence); 316 return -1; 317 } 318 319 @Override public boolean matchesAllOf(CharSequence sequence) { 320 return sequence.length() == 0; 321 } 322 323 @Override public boolean matchesNoneOf(CharSequence sequence) { 324 checkNotNull(sequence); 325 return true; 326 } 327 328 @Override public String removeFrom(CharSequence sequence) { 329 return sequence.toString(); 330 } 331 332 @Override public String replaceFrom(CharSequence sequence, char replacement) { 333 return sequence.toString(); 334 } 335 336 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 337 checkNotNull(replacement); 338 return sequence.toString(); 339 } 340 341 @Override public String collapseFrom(CharSequence sequence, char replacement) { 342 return sequence.toString(); 343 } 344 345 @Override public String trimFrom(CharSequence sequence) { 346 return sequence.toString(); 347 } 348 349 @Override public int countIn(CharSequence sequence) { 350 checkNotNull(sequence); 351 return 0; 352 } 353 354 @Override public CharMatcher and(CharMatcher other) { 355 checkNotNull(other); 356 return this; 357 } 358 359 @Override public CharMatcher or(CharMatcher other) { 360 return checkNotNull(other); 361 } 362 363 @Override public CharMatcher negate() { 364 return ANY; 365 } 366 367 @Override void setBits(LookupTable table) {} 368 369 @Override public CharMatcher precomputed() { 370 return this; 371 } 372 }; 373 374 // Static factories 375 376 /** 377 * Returns a {@code char} matcher that matches only one specified character. 378 */ 379 public static CharMatcher is(final char match) { 380 return new CharMatcher() { 381 @Override public boolean matches(char c) { 382 return c == match; 383 } 384 385 @Override public String replaceFrom(CharSequence sequence, char replacement) { 386 return sequence.toString().replace(match, replacement); 387 } 388 389 @Override public CharMatcher and(CharMatcher other) { 390 return other.matches(match) ? this : NONE; 391 } 392 393 @Override public CharMatcher or(CharMatcher other) { 394 return other.matches(match) ? other : super.or(other); 395 } 396 397 @Override public CharMatcher negate() { 398 return isNot(match); 399 } 400 401 @Override void setBits(LookupTable table) { 402 table.set(match); 403 } 404 405 @Override public CharMatcher precomputed() { 406 return this; 407 } 408 }; 409 } 410 411 /** 412 * Returns a {@code char} matcher that matches any character except the one specified. 413 * 414 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 415 */ 416 public static CharMatcher isNot(final char match) { 417 return new CharMatcher() { 418 @Override public boolean matches(char c) { 419 return c != match; 420 } 421 422 @Override public CharMatcher and(CharMatcher other) { 423 return other.matches(match) ? super.and(other) : other; 424 } 425 426 @Override public CharMatcher or(CharMatcher other) { 427 return other.matches(match) ? ANY : this; 428 } 429 430 @Override public CharMatcher negate() { 431 return is(match); 432 } 433 }; 434 } 435 436 /** 437 * Returns a {@code char} matcher that matches any character present in the given character 438 * sequence. 439 */ 440 public static CharMatcher anyOf(final CharSequence sequence) { 441 switch (sequence.length()) { 442 case 0: 443 return NONE; 444 case 1: 445 return is(sequence.charAt(0)); 446 case 2: 447 final char match1 = sequence.charAt(0); 448 final char match2 = sequence.charAt(1); 449 return new CharMatcher() { 450 @Override public boolean matches(char c) { 451 return c == match1 || c == match2; 452 } 453 454 @Override void setBits(LookupTable table) { 455 table.set(match1); 456 table.set(match2); 457 } 458 459 @Override public CharMatcher precomputed() { 460 return this; 461 } 462 }; 463 } 464 465 final char[] chars = sequence.toString().toCharArray(); 466 Arrays.sort(chars); // not worth collapsing duplicates 467 468 return new CharMatcher() { 469 @Override public boolean matches(char c) { 470 return Arrays.binarySearch(chars, c) >= 0; 471 } 472 473 @Override void setBits(LookupTable table) { 474 for (char c : chars) { 475 table.set(c); 476 } 477 } 478 }; 479 } 480 481 /** 482 * Returns a {@code char} matcher that matches any character not present in the given character 483 * sequence. 484 */ 485 public static CharMatcher noneOf(CharSequence sequence) { 486 return anyOf(sequence).negate(); 487 } 488 489 /** 490 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 491 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 492 * CharMatcher.inRange('a', 'z')}. 493 * 494 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 495 */ 496 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 497 checkArgument(endInclusive >= startInclusive); 498 return new CharMatcher() { 499 @Override public boolean matches(char c) { 500 return startInclusive <= c && c <= endInclusive; 501 } 502 503 @Override void setBits(LookupTable table) { 504 char c = startInclusive; 505 while (true) { 506 table.set(c); 507 if (c++ == endInclusive) { 508 break; 509 } 510 } 511 } 512 513 @Override public CharMatcher precomputed() { 514 return this; 515 } 516 }; 517 } 518 519 /** 520 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 521 * which operates on primitive {@code char} instances instead. 522 */ 523 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 524 checkNotNull(predicate); 525 if (predicate instanceof CharMatcher) { 526 return (CharMatcher) predicate; 527 } 528 return new CharMatcher() { 529 @Override public boolean matches(char c) { 530 return predicate.apply(c); 531 } 532 533 @Override public boolean apply(Character character) { 534 return predicate.apply(checkNotNull(character)); 535 } 536 }; 537 } 538 539 // Constructors 540 541 /** 542 * Constructor for use by subclasses. 543 */ 544 protected CharMatcher() {} 545 546 // Abstract methods 547 548 /** Determines a true or false value for the given character. */ 549 public abstract boolean matches(char c); 550 551 // Non-static factories 552 553 /** 554 * Returns a matcher that matches any character not matched by this matcher. 555 */ 556 public CharMatcher negate() { 557 final CharMatcher original = this; 558 return new CharMatcher() { 559 @Override public boolean matches(char c) { 560 return !original.matches(c); 561 } 562 563 @Override public boolean matchesAllOf(CharSequence sequence) { 564 return original.matchesNoneOf(sequence); 565 } 566 567 @Override public boolean matchesNoneOf(CharSequence sequence) { 568 return original.matchesAllOf(sequence); 569 } 570 571 @Override public int countIn(CharSequence sequence) { 572 return sequence.length() - original.countIn(sequence); 573 } 574 575 @Override public CharMatcher negate() { 576 return original; 577 } 578 }; 579 } 580 581 /** 582 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 583 */ 584 public CharMatcher and(CharMatcher other) { 585 return new And(Arrays.asList(this, checkNotNull(other))); 586 } 587 588 private static class And extends CharMatcher { 589 List<CharMatcher> components; 590 591 And(List<CharMatcher> components) { 592 this.components = components; // Skip defensive copy (private) 593 } 594 595 @Override public boolean matches(char c) { 596 for (CharMatcher matcher : components) { 597 if (!matcher.matches(c)) { 598 return false; 599 } 600 } 601 return true; 602 } 603 604 @Override public CharMatcher and(CharMatcher other) { 605 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 606 newComponents.add(checkNotNull(other)); 607 return new And(newComponents); 608 } 609 } 610 611 /** 612 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 613 */ 614 public CharMatcher or(CharMatcher other) { 615 return new Or(Arrays.asList(this, checkNotNull(other))); 616 } 617 618 private static class Or extends CharMatcher { 619 List<CharMatcher> components; 620 621 Or(List<CharMatcher> components) { 622 this.components = components; // Skip defensive copy (private) 623 } 624 625 @Override public boolean matches(char c) { 626 for (CharMatcher matcher : components) { 627 if (matcher.matches(c)) { 628 return true; 629 } 630 } 631 return false; 632 } 633 634 @Override public CharMatcher or(CharMatcher other) { 635 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 636 newComponents.add(checkNotNull(other)); 637 return new Or(newComponents); 638 } 639 640 @Override void setBits(LookupTable table) { 641 for (CharMatcher matcher : components) { 642 matcher.setBits(table); 643 } 644 } 645 } 646 647 /** 648 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 649 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 650 * worthwhile only if the precomputed matcher is queried many thousands of times. 651 * 652 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 653 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 654 * worthwhile tradeoff in a browser. 655 */ 656 public CharMatcher precomputed() { 657 return Platform.precomputeCharMatcher(this); 658 } 659 660 /** 661 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 662 * on {@link Platform} so that we can have different behavior in GWT. 663 * 664 * <p>The default precomputation is to cache the configuration of the original matcher in an 665 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query 666 * than the original. 667 * 668 * <p>The default implementation creates a new bit array and passes it to {@link 669 * #setBits(LookupTable)}. 670 */ 671 CharMatcher precomputedInternal() { 672 final LookupTable table = new LookupTable(); 673 setBits(table); 674 675 return new CharMatcher() { 676 @Override public boolean matches(char c) { 677 return table.get(c); 678 } 679 680 // TODO(kevinb): make methods like negate() smart? 681 682 @Override public CharMatcher precomputed() { 683 return this; 684 } 685 }; 686 } 687 688 /** 689 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal 690 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched. 691 * 692 * <p>The default implementation loops over every possible character value, invoking {@link 693 * #matches} for each one. 694 */ 695 void setBits(LookupTable table) { 696 char c = Character.MIN_VALUE; 697 while (true) { 698 if (matches(c)) { 699 table.set(c); 700 } 701 if (c++ == Character.MAX_VALUE) { 702 break; 703 } 704 } 705 } 706 707 /** 708 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}. 709 * 710 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a 711 * simpler java.util.BitSet. 712 */ 713 private static final class LookupTable { 714 int[] data = new int[2048]; 715 716 void set(char index) { 717 data[index >> 5] |= (1 << index); 718 } 719 720 boolean get(char index) { 721 return (data[index >> 5] & (1 << index)) != 0; 722 } 723 } 724 725 // Text processing routines 726 727 /** 728 * Returns {@code true} if a character sequence contains at least one matching character. 729 * Equivalent to {@code !matchesNoneOf(sequence)}. 730 * 731 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 732 * character, until this returns {@code true} or the end is reached. 733 * 734 * @param sequence the character sequence to examine, possibly empty 735 * @return {@code true} if this matcher matches at least one character in the sequence 736 * @since 8.0 737 */ 738 public boolean matchesAnyOf(CharSequence sequence) { 739 return !matchesNoneOf(sequence); 740 } 741 742 /** 743 * Returns {@code true} if a character sequence contains only matching characters. 744 * 745 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 746 * character, until this returns {@code false} or the end is reached. 747 * 748 * @param sequence the character sequence to examine, possibly empty 749 * @return {@code true} if this matcher matches every character in the sequence, including when 750 * the sequence is empty 751 */ 752 public boolean matchesAllOf(CharSequence sequence) { 753 for (int i = sequence.length() - 1; i >= 0; i--) { 754 if (!matches(sequence.charAt(i))) { 755 return false; 756 } 757 } 758 return true; 759 } 760 761 /** 762 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 763 * {@code !matchesAnyOf(sequence)}. 764 * 765 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 766 * character, until this returns {@code false} or the end is reached. 767 * 768 * @param sequence the character sequence to examine, possibly empty 769 * @return {@code true} if this matcher matches every character in the sequence, including when 770 * the sequence is empty 771 */ 772 public boolean matchesNoneOf(CharSequence sequence) { 773 return indexIn(sequence) == -1; 774 } 775 776 // TODO(kevinb): add matchesAnyOf() 777 778 /** 779 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 780 * matching character is present. 781 * 782 * <p>The default implementation iterates over the sequence in forward order calling {@link 783 * #matches} for each character. 784 * 785 * @param sequence the character sequence to examine from the beginning 786 * @return an index, or {@code -1} if no character matches 787 */ 788 public int indexIn(CharSequence sequence) { 789 int length = sequence.length(); 790 for (int i = 0; i < length; i++) { 791 if (matches(sequence.charAt(i))) { 792 return i; 793 } 794 } 795 return -1; 796 } 797 798 /** 799 * Returns the index of the first matching character in a character sequence, starting from a 800 * given position, or {@code -1} if no character matches after that position. 801 * 802 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 803 * start}, calling {@link #matches} for each character. 804 * 805 * @param sequence the character sequence to examine 806 * @param start the first index to examine; must be nonnegative and no greater than {@code 807 * sequence.length()} 808 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 809 * or {@code -1} if no character matches 810 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 811 * sequence.length()} 812 */ 813 public int indexIn(CharSequence sequence, int start) { 814 int length = sequence.length(); 815 Preconditions.checkPositionIndex(start, length); 816 for (int i = start; i < length; i++) { 817 if (matches(sequence.charAt(i))) { 818 return i; 819 } 820 } 821 return -1; 822 } 823 824 /** 825 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 826 * matching character is present. 827 * 828 * <p>The default implementation iterates over the sequence in reverse order calling {@link 829 * #matches} for each character. 830 * 831 * @param sequence the character sequence to examine from the end 832 * @return an index, or {@code -1} if no character matches 833 */ 834 public int lastIndexIn(CharSequence sequence) { 835 for (int i = sequence.length() - 1; i >= 0; i--) { 836 if (matches(sequence.charAt(i))) { 837 return i; 838 } 839 } 840 return -1; 841 } 842 843 /** 844 * Returns the number of matching characters found in a character sequence. 845 */ 846 public int countIn(CharSequence sequence) { 847 int count = 0; 848 for (int i = 0; i < sequence.length(); i++) { 849 if (matches(sequence.charAt(i))) { 850 count++; 851 } 852 } 853 return count; 854 } 855 856 /** 857 * Returns a string containing all non-matching characters of a character sequence, in order. For 858 * example: <pre> {@code 859 * 860 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 861 * 862 * ... returns {@code "bzr"}. 863 */ 864 @CheckReturnValue 865 public String removeFrom(CharSequence sequence) { 866 String string = sequence.toString(); 867 int pos = indexIn(string); 868 if (pos == -1) { 869 return string; 870 } 871 872 char[] chars = string.toCharArray(); 873 int spread = 1; 874 875 // This unusual loop comes from extensive benchmarking 876 OUT: while (true) { 877 pos++; 878 while (true) { 879 if (pos == chars.length) { 880 break OUT; 881 } 882 if (matches(chars[pos])) { 883 break; 884 } 885 chars[pos - spread] = chars[pos]; 886 pos++; 887 } 888 spread++; 889 } 890 return new String(chars, 0, pos - spread); 891 } 892 893 /** 894 * Returns a string containing all matching characters of a character sequence, in order. For 895 * example: <pre> {@code 896 * 897 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 898 * 899 * ... returns {@code "aaa"}. 900 */ 901 @CheckReturnValue 902 public String retainFrom(CharSequence sequence) { 903 return negate().removeFrom(sequence); 904 } 905 906 /** 907 * Returns a string copy of the input character sequence, with each character that matches this 908 * matcher replaced by a given replacement character. For example: <pre> {@code 909 * 910 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 911 * 912 * ... returns {@code "rodor"}. 913 * 914 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 915 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 916 * character. 917 * 918 * @param sequence the character sequence to replace matching characters in 919 * @param replacement the character to append to the result string in place of each matching 920 * character in {@code sequence} 921 * @return the new string 922 */ 923 @CheckReturnValue 924 public String replaceFrom(CharSequence sequence, char replacement) { 925 String string = sequence.toString(); 926 int pos = indexIn(string); 927 if (pos == -1) { 928 return string; 929 } 930 char[] chars = string.toCharArray(); 931 chars[pos] = replacement; 932 for (int i = pos + 1; i < chars.length; i++) { 933 if (matches(chars[i])) { 934 chars[i] = replacement; 935 } 936 } 937 return new String(chars); 938 } 939 940 /** 941 * Returns a string copy of the input character sequence, with each character that matches this 942 * matcher replaced by a given replacement sequence. For example: <pre> {@code 943 * 944 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 945 * 946 * ... returns {@code "yoohoo"}. 947 * 948 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 949 * off calling {@link #replaceFrom(CharSequence, char)} directly. 950 * 951 * @param sequence the character sequence to replace matching characters in 952 * @param replacement the characters to append to the result string in place of each matching 953 * character in {@code sequence} 954 * @return the new string 955 */ 956 @CheckReturnValue 957 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 958 int replacementLen = replacement.length(); 959 if (replacementLen == 0) { 960 return removeFrom(sequence); 961 } 962 if (replacementLen == 1) { 963 return replaceFrom(sequence, replacement.charAt(0)); 964 } 965 966 String string = sequence.toString(); 967 int pos = indexIn(string); 968 if (pos == -1) { 969 return string; 970 } 971 972 int len = string.length(); 973 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 974 975 int oldpos = 0; 976 do { 977 buf.append(string, oldpos, pos); 978 buf.append(replacement); 979 oldpos = pos + 1; 980 pos = indexIn(string, oldpos); 981 } while (pos != -1); 982 983 buf.append(string, oldpos, len); 984 return buf.toString(); 985 } 986 987 /** 988 * Returns a substring of the input character sequence that omits all characters this matcher 989 * matches from the beginning and from the end of the string. For example: <pre> {@code 990 * 991 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 992 * 993 * ... returns {@code "cat"}. 994 * 995 * <p>Note that: <pre> {@code 996 * 997 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 998 * 999 * ... is equivalent to {@link String#trim()}. 1000 */ 1001 @CheckReturnValue 1002 public String trimFrom(CharSequence sequence) { 1003 int len = sequence.length(); 1004 int first; 1005 int last; 1006 1007 for (first = 0; first < len; first++) { 1008 if (!matches(sequence.charAt(first))) { 1009 break; 1010 } 1011 } 1012 for (last = len - 1; last > first; last--) { 1013 if (!matches(sequence.charAt(last))) { 1014 break; 1015 } 1016 } 1017 1018 return sequence.subSequence(first, last + 1).toString(); 1019 } 1020 1021 /** 1022 * Returns a substring of the input character sequence that omits all characters this matcher 1023 * matches from the beginning of the string. For example: <pre> {@code 1024 * 1025 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1026 * 1027 * ... returns {@code "catbab"}. 1028 */ 1029 @CheckReturnValue 1030 public String trimLeadingFrom(CharSequence sequence) { 1031 int len = sequence.length(); 1032 int first; 1033 1034 for (first = 0; first < len; first++) { 1035 if (!matches(sequence.charAt(first))) { 1036 break; 1037 } 1038 } 1039 1040 return sequence.subSequence(first, len).toString(); 1041 } 1042 1043 /** 1044 * Returns a substring of the input character sequence that omits all characters this matcher 1045 * matches from the end of the string. For example: <pre> {@code 1046 * 1047 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1048 * 1049 * ... returns {@code "abacat"}. 1050 */ 1051 @CheckReturnValue 1052 public String trimTrailingFrom(CharSequence sequence) { 1053 int len = sequence.length(); 1054 int last; 1055 1056 for (last = len - 1; last >= 0; last--) { 1057 if (!matches(sequence.charAt(last))) { 1058 break; 1059 } 1060 } 1061 1062 return sequence.subSequence(0, last + 1).toString(); 1063 } 1064 1065 /** 1066 * Returns a string copy of the input character sequence, with each group of consecutive 1067 * characters that match this matcher replaced by a single replacement character. For example: 1068 * <pre> {@code 1069 * 1070 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1071 * 1072 * ... returns {@code "b-p-r"}. 1073 * 1074 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1075 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1076 * character. 1077 * 1078 * @param sequence the character sequence to replace matching groups of characters in 1079 * @param replacement the character to append to the result string in place of each group of 1080 * matching characters in {@code sequence} 1081 * @return the new string 1082 */ 1083 @CheckReturnValue 1084 public String collapseFrom(CharSequence sequence, char replacement) { 1085 int first = indexIn(sequence); 1086 if (first == -1) { 1087 return sequence.toString(); 1088 } 1089 1090 // TODO(kevinb): see if this implementation can be made faster 1091 StringBuilder builder = new StringBuilder(sequence.length()) 1092 .append(sequence.subSequence(0, first)) 1093 .append(replacement); 1094 boolean in = true; 1095 for (int i = first + 1; i < sequence.length(); i++) { 1096 char c = sequence.charAt(i); 1097 if (apply(c)) { 1098 if (!in) { 1099 builder.append(replacement); 1100 in = true; 1101 } 1102 } else { 1103 builder.append(c); 1104 in = false; 1105 } 1106 } 1107 return builder.toString(); 1108 } 1109 1110 /** 1111 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1112 * groups of matching characters at the start or end of the sequence are removed without 1113 * replacement. 1114 */ 1115 @CheckReturnValue 1116 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1117 int first = negate().indexIn(sequence); 1118 if (first == -1) { 1119 return ""; // everything matches. nothing's left. 1120 } 1121 StringBuilder builder = new StringBuilder(sequence.length()); 1122 boolean inMatchingGroup = false; 1123 for (int i = first; i < sequence.length(); i++) { 1124 char c = sequence.charAt(i); 1125 if (apply(c)) { 1126 inMatchingGroup = true; 1127 } else { 1128 if (inMatchingGroup) { 1129 builder.append(replacement); 1130 inMatchingGroup = false; 1131 } 1132 builder.append(c); 1133 } 1134 } 1135 return builder.toString(); 1136 } 1137 1138 // Predicate interface 1139 1140 /** 1141 * Returns {@code true} if this matcher matches the given character. 1142 * 1143 * @throws NullPointerException if {@code character} is null 1144 */ 1145 @Override public boolean apply(Character character) { 1146 return matches(character); 1147 } 1148 }