/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.Char;

import hunt.Nullable;
import hunt.Exceptions;
import hunt.text.Common;

/**
 * The {@code Char} class wraps a value of the primitive type {@code char}
 * in an object. It is a port of Java's {@code java.lang.Character}: besides
 * the wrapped value it offers constants describing Unicode general
 * categories, bidirectional classes and surrogate ranges, and static helper
 * methods that work on code units and code points.
 *
 * <h3>Unicode character representations</h3>
 *
 * <p>The range of legal <em>code point</em>s is U+0000 to U+10FFFF.
 * The set of characters from U+0000 to U+FFFF is sometimes referred to as
 * the <em>Basic Multilingual Plane (BMP)</em>. Characters whose code points
 * are greater than U+FFFF are called <em>supplementary character</em>s.
 * In UTF-16 a supplementary character is represented as a pair of code
 * units, the first from the <em>high-surrogates</em> range (0xD800-0xDBFF),
 * the second from the <em>low-surrogates</em> range (0xDC00-0xDFFF).
 *
 * <p>An {@code int} value represents all Unicode code points, including
 * supplementary code points. The lower (least significant) 21 bits are used
 * for the code point and the upper 11 bits must be zero.
 *
 * <p><b>Porting note:</b> in Java a {@code char} is a 16-bit UTF-16 code
 * unit, whereas a D {@code char} is an 8-bit code unit. Methods that accept
 * a {@code char} therefore only see values 0x00-0xFF and treat them as the
 * code point with the same number. Methods that return {@code char} or
 * {@code char[]} for UTF-16 data keep only the low 8 bits of each code
 * unit; see the notes on the individual methods.
 *
 * <p>For more information on Unicode terminology, refer to the
 * <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
 *
 * @author Lee Boynton
 * @author Guy Steele
 * @author Akira Tanaka
 * @author Martin Buchholz
 * @author Ulf Zibis
 */
class Char : Nullable!char {

    /**
     * The minimum radix available for conversion to and from strings,
     * i.e. the smallest value permitted for the radix argument of
     * radix-conversion methods.
     */
    enum int MIN_RADIX = 2;

    /**
     * The maximum radix available for conversion to and from strings,
     * i.e. the largest value permitted for the radix argument of
     * radix-conversion methods.
     */
    enum int MAX_RADIX = 36;

    /**
     * The smallest value of type {@code char}, {@code '\u0000'}.
     */
    enum char MIN_VALUE = '\u0000';

    /*
     * The Java counterpart of the largest value, '\uFFFF', is left disabled:
     * that literal does not fit into D's 8-bit char.
     */
    // enum char MAX_VALUE = '\uFFFF';

    /*
     * Java-only: the Class instance representing the primitive type char.
     */
    // enum Class<Character> TYPE = (Class<Character>) Class.getPrimitiveClass("char");

    /*
     * Normative general types
     */

    /*
     * General character types
     */

    /** General category "Cn" in the Unicode specification. */
    enum byte UNASSIGNED = 0;

    /** General category "Lu" in the Unicode specification. */
    enum byte UPPERCASE_LETTER = 1;

    /** General category "Ll" in the Unicode specification. */
    enum byte LOWERCASE_LETTER = 2;

    /** General category "Lt" in the Unicode specification. */
    enum byte TITLECASE_LETTER = 3;

    /** General category "Lm" in the Unicode specification. */
    enum byte MODIFIER_LETTER = 4;

    /** General category "Lo" in the Unicode specification. */
    enum byte OTHER_LETTER = 5;

    /** General category "Mn" in the Unicode specification. */
    enum byte NON_SPACING_MARK = 6;

    /** General category "Me" in the Unicode specification. */
    enum byte ENCLOSING_MARK = 7;

    /** General category "Mc" in the Unicode specification. */
    enum byte COMBINING_SPACING_MARK = 8;

    /** General category "Nd" in the Unicode specification. */
    enum byte DECIMAL_DIGIT_NUMBER = 9;

    /** General category "Nl" in the Unicode specification. */
    enum byte LETTER_NUMBER = 10;

    /** General category "No" in the Unicode specification. */
    enum byte OTHER_NUMBER = 11;

    /** General category "Zs" in the Unicode specification. */
    enum byte SPACE_SEPARATOR = 12;

    /** General category "Zl" in the Unicode specification. */
    enum byte LINE_SEPARATOR = 13;

    /** General category "Zp" in the Unicode specification. */
    enum byte PARAGRAPH_SEPARATOR = 14;

    /** General category "Cc" in the Unicode specification. */
    enum byte CONTROL = 15;

    /** General category "Cf" in the Unicode specification. */
    enum byte FORMAT = 16;

    // The value 17 is not assigned to any category constant.

    /** General category "Co" in the Unicode specification. */
    enum byte PRIVATE_USE = 18;

    /** General category "Cs" in the Unicode specification. */
    enum byte SURROGATE = 19;

    /** General category "Pd" in the Unicode specification. */
    enum byte DASH_PUNCTUATION = 20;

    /** General category "Ps" in the Unicode specification. */
    enum byte START_PUNCTUATION = 21;

    /** General category "Pe" in the Unicode specification. */
    enum byte END_PUNCTUATION = 22;

    /** General category "Pc" in the Unicode specification. */
    enum byte CONNECTOR_PUNCTUATION = 23;

    /** General category "Po" in the Unicode specification. */
    enum byte OTHER_PUNCTUATION = 24;

    /** General category "Sm" in the Unicode specification. */
    enum byte MATH_SYMBOL = 25;

    /** General category "Sc" in the Unicode specification. */
    enum byte CURRENCY_SYMBOL = 26;

    /** General category "Sk" in the Unicode specification. */
    enum byte MODIFIER_SYMBOL = 27;

    /** General category "So" in the Unicode specification. */
    enum byte OTHER_SYMBOL = 28;

    /** General category "Pi" in the Unicode specification. */
    enum byte INITIAL_QUOTE_PUNCTUATION = 29;

    /** General category "Pf" in the Unicode specification. */
    enum byte FINAL_QUOTE_PUNCTUATION = 30;

    /**
     * Error flag. Use int (code point) to avoid confusion with U+FFFF.
     */
    enum int ERROR = 0xFFFFFFFF;

    /**
     * Undefined bidirectional character type. Undefined {@code char}
     * values have undefined directionality in the Unicode specification.
     */
    enum byte DIRECTIONALITY_UNDEFINED = -1;

    /** Strong bidirectional character type "L" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;

    /** Strong bidirectional character type "R" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;

    /** Strong bidirectional character type "AL" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;

    /** Weak bidirectional character type "EN" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;

    /** Weak bidirectional character type "ES" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;

    /** Weak bidirectional character type "ET" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;

    /** Weak bidirectional character type "AN" in the Unicode specification. */
    enum byte DIRECTIONALITY_ARABIC_NUMBER = 6;

    /** Weak bidirectional character type "CS" in the Unicode specification. */
    enum byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;

    /** Weak bidirectional character type "NSM" in the Unicode specification. */
    enum byte DIRECTIONALITY_NONSPACING_MARK = 8;

    /** Weak bidirectional character type "BN" in the Unicode specification. */
    enum byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;

    /** Neutral bidirectional character type "B" in the Unicode specification. */
    enum byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;

    /** Neutral bidirectional character type "S" in the Unicode specification. */
    enum byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;

    /** Neutral bidirectional character type "WS" in the Unicode specification. */
    enum byte DIRECTIONALITY_WHITESPACE = 12;

    /** Neutral bidirectional character type "ON" in the Unicode specification. */
    enum byte DIRECTIONALITY_OTHER_NEUTRALS = 13;

    /** Strong bidirectional character type "LRE" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

    /** Strong bidirectional character type "LRO" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;

    /** Strong bidirectional character type "RLE" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;

    /** Strong bidirectional character type "RLO" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;

    /** Weak bidirectional character type "PDF" in the Unicode specification. */
    enum byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a> in the UTF-16 encoding, 0xD800.
     * A high-surrogate is also known as a <i>leading-surrogate</i>.
     */
    enum wchar MIN_HIGH_SURROGATE = 0xD800;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a> in the UTF-16 encoding, 0xDBFF.
     * A high-surrogate is also known as a <i>leading-surrogate</i>.
     */
    enum wchar MAX_HIGH_SURROGATE = 0xDBFF;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a> in the UTF-16 encoding, 0xDC00.
     * A low-surrogate is also known as a <i>trailing-surrogate</i>.
     */
    enum wchar MIN_LOW_SURROGATE = 0xDC00;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a> in the UTF-16 encoding, 0xDFFF.
     * A low-surrogate is also known as a <i>trailing-surrogate</i>.
     */
    enum wchar MAX_LOW_SURROGATE = 0xDFFF;

    /**
     * The minimum value of a Unicode surrogate code unit in the
     * UTF-16 encoding, 0xD800.
     */
    enum wchar MIN_SURROGATE = MIN_HIGH_SURROGATE;

    /**
     * The maximum value of a Unicode surrogate code unit in the
     * UTF-16 encoding, 0xDFFF.
     */
    enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#supplementary_code_point">
     * Unicode supplementary code point</a>, constant {@code U+10000}.
     */
    enum int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point</a>, constant {@code U+0000}.
     */
    enum int MIN_CODE_POINT = 0x000000;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point</a>, constant {@code U+10FFFF}.
     */
    enum int MAX_CODE_POINT = 0X10FFFF;

    /**
     * Creates a wrapper around the given value.
     *
     * @param value the {@code char} to wrap.
     */
    this(char value) {
        super(value);
    }

    /**
     * Returns a {@code Char} instance representing the specified
     * {@code char} value.
     *
     * Values in the range 0 to 127 inclusive are served from a table of
     * pre-built instances, so repeated calls return the same object; any
     * other value yields a freshly allocated instance.
     *
     * @param c a char value.
     * @return a {@code Char} instance representing {@code c}.
     */
    static Char valueOf(char c) {
        if (c <= 127) { // must cache
            return CharacterCache.cache[cast(int) c];
        }
        return new Char(c);
    }

    /**
     * Returns the value of this {@code Char} object.
     *
     * @return the primitive {@code char} value represented by this object.
     */
    char charValue() {
        return _value;
    }

    /**
     * Returns a hash code for this object: the numeric value of the
     * wrapped {@code char}.
     */
    override size_t toHash() @trusted nothrow {
        return _value;
    }

    /**
     * Determines the number of UTF-16 code units needed to represent the
     * specified character (Unicode code point). If the specified character
     * is equal to or greater than 0x10000, the method returns 2.
     * Otherwise, the method returns 1.
     *
     * <p>This method doesn't validate the specified character to be a
     * valid Unicode code point. The caller must validate the character
     * value using {@link #isValidCodePoint(int) isValidCodePoint}
     * if necessary.
     *
     * @param codePoint the character (Unicode code point) to be tested.
     * @return 2 if {@code codePoint >= 0x10000}; 1 otherwise.
     */
    static int charCount(int codePoint) {
        return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
    }

    /**
     * Converts the specified surrogate pair to its supplementary code
     * point value. This method does not validate the specified
     * surrogate pair.
     *
     * <p>The parameters are {@code wchar} so that real UTF-16 surrogate
     * code units (0xD800-0xDFFF) can be passed; an 8-bit {@code char}
     * cannot hold them. {@code char} arguments are still accepted through
     * implicit widening and give the same result as before.
     *
     * @param high the high-surrogate code unit
     * @param low the low-surrogate code unit
     * @return the supplementary code point composed from the
     *         specified surrogate pair.
     */
    static int toCodePoint(wchar high, wchar low) {
        // Optimized form of:
        // return ((high - MIN_HIGH_SURROGATE) << 10)
        //         + (low - MIN_LOW_SURROGATE)
        //         + MIN_SUPPLEMENTARY_CODE_POINT;
        return ((high << 10) + low) + (MIN_SUPPLEMENTARY_CODE_POINT
                                       - (MIN_HIGH_SURROGATE << 10)
                                       - MIN_LOW_SURROGATE);
    }

    /**
     * Determines if the specified character is an ISO control
     * character. A character is considered to be an ISO control
     * character if its code is in the range 0x00 through 0x1F or in the
     * range 0x7F through 0x9F.
     *
     * <p>The {@code char} is interpreted as the code point with the same
     * numeric value and forwarded to {@link #isISOControl(int)}.
     *
     * @param ch the character to be tested.
     * @return {@code true} if the character is an ISO control character;
     *         {@code false} otherwise.
     */
    public static bool isISOControl(char ch) {
        return isISOControl(cast(int) ch);
    }

    /**
     * Determines if the referenced character (Unicode code point) is an
     * ISO control character. A character is considered to be an ISO
     * control character if its code is in the range 0x00 through 0x1F or
     * in the range 0x7F through 0x9F.
     *
     * @param codePoint the character (Unicode code point) to be tested.
     * @return {@code true} if the character is an ISO control character;
     *         {@code false} otherwise.
     */
    public static bool isISOControl(int codePoint) {
        // Optimized form of:
        //     (codePoint >= 0x00 && codePoint <= 0x1F) ||
        //     (codePoint >= 0x7F && codePoint <= 0x9F);
        // The unsigned shift makes negative inputs fail the "< 0x20" test.
        return codePoint <= 0x9F &&
            (codePoint >= 0x7F || (codePoint >>> 5 == 0));
    }

    /**
     * Converts the specified character (Unicode code point) to a
     * {@code char} array. A BMP code point yields a one-element array, a
     * supplementary code point yields a two-element array filled by
     * {@link #toSurrogates}.
     *
     * <p>NOTE(review): every element is produced with {@code cast(char)},
     * and a D {@code char} is 8 bits wide. Code points above 0xFF and all
     * surrogate code units are therefore reduced to their low 8 bits; the
     * result is neither UTF-16 nor UTF-8 for such inputs. The return type
     * is kept for compatibility with existing callers.
     *
     * @param codePoint a Unicode code point
     * @return a {@code char} array derived from {@code codePoint} as
     *         described above.
     * @throws IllegalArgumentException if the specified
     *         {@code codePoint} is not a valid Unicode code point.
     */
    public static char[] toChars(int codePoint) {
        if (isBmpCodePoint(codePoint)) {
            return [ cast(char) codePoint ];
        } else if (isValidCodePoint(codePoint)) {
            char[] result = new char[2];
            toSurrogates(codePoint, result, 0);
            return result;
        } else {
            import std.string;
            throw new IllegalArgumentException(
                format("Not a valid Unicode code point: 0x%X", codePoint));
        }
    }

    /**
     * Determines whether the specified character (Unicode code point)
     * is in the Basic Multilingual Plane (BMP).
     *
     * @param codePoint the character (Unicode code point) to be tested
     * @return {@code true} if the specified code point is between
     *         0x0000 and 0xFFFF inclusive; {@code false} otherwise.
     */
    public static bool isBmpCodePoint(int codePoint) {
        // Optimized form of:
        //     codePoint >= 0x0000 && codePoint <= 0xFFFF
        // The unsigned shift (>>>) also rejects negative values.
        return codePoint >>> 16 == 0;
    }

    /**
     * Determines whether the specified code point is a valid
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point value</a>.
     *
     * @param codePoint the Unicode code point to be tested
     * @return {@code true} if the specified code point value is between
     *         {@link #MIN_CODE_POINT} and {@link #MAX_CODE_POINT}
     *         inclusive; {@code false} otherwise.
     */
    public static bool isValidCodePoint(int codePoint) {
        // Optimized form of:
        //     codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT
        int plane = codePoint >>> 16;
        return plane < ((MAX_CODE_POINT + 1) >>> 16);
    }

    /**
     * Stores the high and low surrogate of {@code codePoint} at
     * {@code dst[index]} and {@code dst[index + 1]}.
     *
     * @param codePoint a supplementary character (Unicode code point)
     * @param dst the destination array; must have room for two elements
     *            starting at {@code index}
     * @param index the position of the first element to write
     */
    static void toSurrogates(int codePoint, char[] dst, int index) {
        // We write elements "backwards" to guarantee all-or-nothing:
        // if index + 1 is out of range nothing has been stored yet.
        dst[index+1] = lowSurrogate(codePoint);
        dst[index] = highSurrogate(codePoint);
    }

    /**
     * Returns the trailing surrogate (a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * low surrogate code unit</a>) of the surrogate pair representing the
     * specified supplementary character in the UTF-16 encoding.
     *
     * <p>NOTE(review): the computed code unit lies in 0xDC00-0xDFFF but is
     * cast to an 8-bit {@code char}, so only its low 8 bits are returned.
     * The return type is kept for compatibility with existing callers.
     *
     * @param codePoint a supplementary character (Unicode code point)
     * @return the trailing surrogate code unit, truncated to {@code char}
     */
    public static char lowSurrogate(int codePoint) {
        return cast(char) ((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
    }

    /**
     * Returns the leading surrogate (a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * high surrogate code unit</a>) of the surrogate pair representing the
     * specified supplementary character in the UTF-16 encoding.
     *
     * <p>NOTE(review): for a supplementary code point the computed code
     * unit lies in 0xD800-0xDBFF but is cast to an 8-bit {@code char}, so
     * only its low 8 bits are returned. The return type is kept for
     * compatibility with existing callers.
     *
     * @param codePoint a supplementary character (Unicode code point)
     * @return the leading surrogate code unit, truncated to {@code char}
     */
    public static char highSurrogate(int codePoint) {
        return cast(char) ((codePoint >>> 10)
            + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
    }

    /**
     * Returns the code unit at the given index of {@code seq}, or the
     * supplementary code point formed with the following unit when the
     * two are a high/low surrogate pair.
     *
     * <p>NOTE(review): the unit read from {@code seq} is stored in an
     * 8-bit {@code char}, which can never fall into the surrogate range,
     * so in practice the value at {@code index} is returned unchanged.
     * Multi-byte UTF-8 sequences are not decoded.
     *
     * @param seq a sequence of {@code char} values
     * @param index the index of the {@code char} in {@code seq} to be
     *              converted
     * @return the value at the given index, widened to {@code int}
     */
    public static int codePointAt(string seq, int index) {
        // charAt is not defined in this module; presumably it comes from
        // hunt.text.Common and performs the bounds handling.
        char c1 = seq.charAt(index);
        if (isHighSurrogate(c1) && ++index < seq.length) {
            char c2 = seq.charAt(index);
            if (isLowSurrogate(c2)) {
                return toCodePoint(c1, c2);
            }
        }
        return c1;
    }

    /**
     * Determines if the given code unit is a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a>
     * (also known as <i>leading-surrogate code unit</i>).
     *
     * <p>The parameter is a {@code wchar} because an 8-bit {@code char}
     * cannot hold a value in the surrogate range; {@code char} arguments
     * are still accepted through implicit widening and always yield
     * {@code false}.
     *
     * @param ch the code unit to be tested.
     * @return {@code true} if the value is between
     *         {@link #MIN_HIGH_SURROGATE} and
     *         {@link #MAX_HIGH_SURROGATE} inclusive;
     *         {@code false} otherwise.
     */
    public static bool isHighSurrogate(wchar ch) {
        // MAX_HIGH_SURROGATE + 1 == MIN_LOW_SURROGATE
        return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1);
    }

    /**
     * Determines if the given code unit is a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a>
     * (also known as <i>trailing-surrogate code unit</i>).
     *
     * <p>The parameter is a {@code wchar} because an 8-bit {@code char}
     * cannot hold a value in the surrogate range; {@code char} arguments
     * are still accepted through implicit widening and always yield
     * {@code false}.
     *
     * @param ch the code unit to be tested.
     * @return {@code true} if the value is between
     *         {@link #MIN_LOW_SURROGATE} and
     *         {@link #MAX_LOW_SURROGATE} inclusive;
     *         {@code false} otherwise.
     */
    public static bool isLowSurrogate(wchar ch) {
        return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1);
    }

    /**
     * Determines if the specified character is a letter.
     *
     * <p>The {@code char} is interpreted as the code point with the same
     * numeric value (0x00-0xFF) and forwarded to {@link #isLetter(int)}.
     * Supplementary characters cannot be tested through this overload.
     *
     * @param ch the character to be tested.
     * @return {@code true} if the character is a letter;
     *         {@code false} otherwise.
     */
    public static bool isLetter(char ch) {
        return isLetter(cast(int) ch);
    }

    /**
     * Determines if the specified character (Unicode code point) is a
     * letter.
     * <p>
     * A character is considered to be a letter if its Unicode general
     * category is any of the following:
     * <ul>
     * <li> {@code UPPERCASE_LETTER} (Lu)
     * <li> {@code LOWERCASE_LETTER} (Ll)
     * <li> {@code TITLECASE_LETTER} (Lt)
     * <li> {@code MODIFIER_LETTER} (Lm)
     * <li> {@code OTHER_LETTER} (Lo)
     * </ul>
     *
     * Not all letters have case. Many characters are
     * letters but are neither uppercase nor lowercase nor titlecase.
     *
     * <p>The classification is taken from the Unicode tables shipped with
     * the standard library ({@code std.uni}).
     *
     * @param codePoint the character (Unicode code point) to be tested.
     * @return {@code true} if the character is a letter; {@code false}
     *         otherwise, including when {@code codePoint} is not a valid
     *         Unicode code point.
     */
    public static bool isLetter(int codePoint) {
        import std.uni : CodepointSet, unicode;

        if (!isValidCodePoint(codePoint)) {
            return false;
        }

        // Per-thread lazily built set for general category "L"
        // (Lu | Ll | Lt | Lm | Lo); building it is not free, so it is
        // created once and reused on later calls.
        static CodepointSet letters;
        if (letters.empty) {
            letters = unicode.L;
        }
        return letters[cast(uint) codePoint];
    }
}

/**
 * Holds the pre-built {@code Char} instances for the values 0 to 127 that
 * {@code Char.valueOf} hands out.
 */
private class CharacterCache {
    // Not meant to be instantiated; only the static table is used.
    private this() {
    }

    /** Table indexed by character value; filled once at program start. */
    __gshared Char[] cache;

    shared static this() {
        cache = new Char[127 + 1];
        for (int i = 0; i < cast(int) cache.length; i++) {
            cache[i] = new Char(cast(char) i);
        }
    }
}