hunt.Char source code

1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.Char;
13 
14 import hunt.Nullable;
15 import hunt.Exceptions;
16 import hunt.text.Common;
17 /**
18  * The {@code Character} class wraps a value of the primitive
19  * type {@code char} in an object. An object of type
20  * {@code Character} contains a single field whose type is
21  * {@code char}.
22  * <p>
23  * In addition, this class provides several methods for determining
24  * a character's category (lowercase letter, digit, etc.) and for converting
25  * characters from uppercase to lowercase and vice versa.
26  * <p>
27  * Character information is based on the Unicode Standard, version 8.0.0.
28  * <p>
29  * The methods and data of class {@code Character} are defined by
30  * the information in the <i>UnicodeData</i> file that is part of the
31  * Unicode Character Database maintained by the Unicode
32  * Consortium. This file specifies various properties including name
33  * and general category for every defined Unicode code point or
34  * character range.
35  * <p>
36  * The file and its description are available from the Unicode Consortium at:
37  * <ul>
38  * <li><a href="http://www.unicode.org">http://www.unicode.org</a>
39  * </ul>
40  *
41  * <h3><a id="unicode">Unicode Character Representations</a></h3>
42  *
43  * <p>The {@code char} data type (and therefore the value that a
44  * {@code Character} object encapsulates) are based on the
45  * original Unicode specification, which defined characters as
46  * fixed-width 16-bit entities. The Unicode Standard has since been
47  * changed to allow for characters whose representation requires more
48  * than 16 bits.  The range of legal <em>code point</em>s is now
49  * U+0000 to U+10FFFF, known as <em>Unicode scalar value</em>.
50  * (Refer to the <a
51  * href="http://www.unicode.org/reports/tr27/#notation"><i>
52  * definition</i></a> of the U+<i>n</i> notation in the Unicode
53  * Standard.)
54  *
55  * <p><a id="BMP">The set of characters from U+0000 to U+FFFF</a> is
56  * sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>.
57  * <a id="supplementary">Characters</a> whose code points are greater
58  * than U+FFFF are called <em>supplementary character</em>s.  The Java
59  * platform uses the UTF-16 representation in {@code char} arrays and
60  * in the {@code string} and {@code StringBuffer} classes. In
61  * this representation, supplementary characters are represented as a pair
62  * of {@code char} values, the first from the <em>high-surrogates</em>
63  * range, (&#92;uD800-&#92;uDBFF), the second from the
64  * <em>low-surrogates</em> range (&#92;uDC00-&#92;uDFFF).
65  *
66  * <p>A {@code char} value, therefore, represents Basic
67  * Multilingual Plane (BMP) code points, including the surrogate
68  * code points, or code units of the UTF-16 encoding. An
69  * {@code int} value represents all Unicode code points,
70  * including supplementary code points. The lower (least significant)
71  * 21 bits of {@code int} are used to represent Unicode code
72  * points and the upper (most significant) 11 bits must be zero.
73  * Unless otherwise specified, the behavior with respect to
74  * supplementary characters and surrogate {@code char} values is
75  * as follows:
76  *
77  * <ul>
78  * <li>The methods that only accept a {@code char} value cannot support
79  * supplementary characters. They treat {@code char} values from the
80  * surrogate ranges as undefined characters. For example,
81  * {@code Character.isLetter('\u005CuD840')} returns {@code false}, even though
82  * this specific value if followed by any low-surrogate value in a string
83  * would represent a letter.
84  *
85  * <li>The methods that accept an {@code int} value support all
86  * Unicode characters, including supplementary characters. For
87  * example, {@code Character.isLetter(0x2F81A)} returns
88  * {@code true} because the code point value represents a letter
89  * (a CJK ideograph).
90  * </ul>
91  *
92  * <p>In the Java SE API documentation, <em>Unicode code point</em> is
93  * used for character values in the range between U+0000 and U+10FFFF,
94  * and <em>Unicode code unit</em> is used for 16-bit
95  * {@code char} values that are code units of the <em>UTF-16</em>
96  * encoding. For more information on Unicode terminology, refer to the
97  * <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
98  *
99  * @author  Lee Boynton
100  * @author  Guy Steele
101  * @author  Akira Tanaka
102  * @author  Martin Buchholz
103  * @author  Ulf Zibis
104  */
105 class Char : Nullable!char {
106     /**
107      * The minimum radix available for conversion to and from strings.
108      * The constant value of this field is the smallest value permitted
109      * for the radix argument in radix-conversion methods such as the
110      * {@code digit} method, the {@code forDigit} method, and the
111      * {@code toString} method of class {@code Integer}.
112      *
113      * @see     Character#digit(char, int)
114      * @see     Character#forDigit(int, int)
115      * @see     Integer#toString(int, int)
116      * @see     Integer#valueOf(string)
117      */
118     enum int MIN_RADIX = 2;
119 
120     /**
121      * The maximum radix available for conversion to and from strings.
122      * The constant value of this field is the largest value permitted
123      * for the radix argument in radix-conversion methods such as the
124      * {@code digit} method, the {@code forDigit} method, and the
125      * {@code toString} method of class {@code Integer}.
126      *
127      * @see     Character#digit(char, int)
128      * @see     Character#forDigit(int, int)
129      * @see     Integer#toString(int, int)
130      * @see     Integer#valueOf(string)
131      */
132     enum int MAX_RADIX = 36;
133 
134     /**
135      * The constant value of this field is the smallest value of type
136      * {@code char}, {@code '\u005Cu0000'}.
137      *
138      */
139     enum char MIN_VALUE = '\u0000';
140 
141     /**
142      * The constant value of this field is the largest value of type
143      * {@code char}, {@code '\u005CuFFFF'}.
144      *
145      */
146     // enum char MAX_VALUE = '\uFFFF';
147 
148     /**
149      * The {@code Class} instance representing the primitive type
150      * {@code char}.
151      *
152      */
153     // 
154     // enum Class<Character> TYPE = (Class<Character>) Class.getPrimitiveClass("char");
155 
156     /*
157      * Normative general types
158      */
159 
160     /*
161      * General character types
162      */
163 
164     /**
165      * General category "Cn" in the Unicode specification.
166      */
167     enum byte UNASSIGNED = 0;
168 
169     /**
170      * General category "Lu" in the Unicode specification.
171      */
172     enum byte UPPERCASE_LETTER = 1;
173 
174     /**
175      * General category "Ll" in the Unicode specification.
176      */
177     enum byte LOWERCASE_LETTER = 2;
178 
179     /**
180      * General category "Lt" in the Unicode specification.
181      */
182     enum byte TITLECASE_LETTER = 3;
183 
184     /**
185      * General category "Lm" in the Unicode specification.
186      */
187     enum byte MODIFIER_LETTER = 4;
188 
189     /**
190      * General category "Lo" in the Unicode specification.
191      */
192     enum byte OTHER_LETTER = 5;
193 
194     /**
195      * General category "Mn" in the Unicode specification.
196      */
197     enum byte NON_SPACING_MARK = 6;
198 
199     /**
200      * General category "Me" in the Unicode specification.
201      */
202     enum byte ENCLOSING_MARK = 7;
203 
204     /**
205      * General category "Mc" in the Unicode specification.
206      */
207     enum byte COMBINING_SPACING_MARK = 8;
208 
209     /**
210      * General category "Nd" in the Unicode specification.
211      */
212     enum byte DECIMAL_DIGIT_NUMBER = 9;
213 
214     /**
215      * General category "Nl" in the Unicode specification.
216      */
217     enum byte LETTER_NUMBER = 10;
218 
219     /**
220      * General category "No" in the Unicode specification.
221      */
222     enum byte OTHER_NUMBER = 11;
223 
224     /**
225      * General category "Zs" in the Unicode specification.
226      */
227     enum byte SPACE_SEPARATOR = 12;
228 
229     /**
230      * General category "Zl" in the Unicode specification.
231      */
232     enum byte LINE_SEPARATOR = 13;
233 
234     /**
235      * General category "Zp" in the Unicode specification.
236      */
237     enum byte PARAGRAPH_SEPARATOR = 14;
238 
239     /**
240      * General category "Cc" in the Unicode specification.
241      */
242     enum byte CONTROL = 15;
243 
244     /**
245      * General category "Cf" in the Unicode specification.
246      */
247     enum byte FORMAT = 16;
248 
249     /**
250      * General category "Co" in the Unicode specification.
251      */
252     enum byte PRIVATE_USE = 18;
253 
254     /**
255      * General category "Cs" in the Unicode specification.
256      */
257     enum byte SURROGATE = 19;
258 
259     /**
260      * General category "Pd" in the Unicode specification.
261      */
262     enum byte DASH_PUNCTUATION = 20;
263 
264     /**
265      * General category "Ps" in the Unicode specification.
266      */
267     enum byte START_PUNCTUATION = 21;
268 
269     /**
270      * General category "Pe" in the Unicode specification.
271      */
272     enum byte END_PUNCTUATION = 22;
273 
274     /**
275      * General category "Pc" in the Unicode specification.
276      */
277     enum byte CONNECTOR_PUNCTUATION = 23;
278 
279     /**
280      * General category "Po" in the Unicode specification.
281      */
282     enum byte OTHER_PUNCTUATION = 24;
283 
284     /**
285      * General category "Sm" in the Unicode specification.
286      */
287     enum byte MATH_SYMBOL = 25;
288 
289     /**
290      * General category "Sc" in the Unicode specification.
291      */
292     enum byte CURRENCY_SYMBOL = 26;
293 
294     /**
295      * General category "Sk" in the Unicode specification.
296      */
297     enum byte MODIFIER_SYMBOL = 27;
298 
299     /**
300      * General category "So" in the Unicode specification.
301      */
302     enum byte OTHER_SYMBOL = 28;
303 
304     /**
305      * General category "Pi" in the Unicode specification.
306      */
307     enum byte INITIAL_QUOTE_PUNCTUATION = 29;
308 
309     /**
310      * General category "Pf" in the Unicode specification.
311      */
312     enum byte FINAL_QUOTE_PUNCTUATION = 30;
313 
314     /**
315      * Error flag. Use int (code point) to avoid confusion with U+FFFF.
316      */
317     enum int ERROR = 0xFFFFFFFF;
318 
319     /**
320      * Undefined bidirectional character type. Undefined {@code char}
321      * values have undefined directionality in the Unicode specification.
322      */
323     enum byte DIRECTIONALITY_UNDEFINED = -1;
324 
325     /**
326      * Strong bidirectional character type "L" in the Unicode specification.
327      */
328     enum byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
329 
330     /**
331      * Strong bidirectional character type "R" in the Unicode specification.
332      */
333     enum byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
334 
335     /**
336     * Strong bidirectional character type "AL" in the Unicode specification.
337      */
338     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
339 
340     /**
341      * Weak bidirectional character type "EN" in the Unicode specification.
342      */
343     enum byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
344 
345     /**
346      * Weak bidirectional character type "ES" in the Unicode specification.
347      */
348     enum byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
349 
350     /**
351      * Weak bidirectional character type "ET" in the Unicode specification.
352      */
353     enum byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
354 
355     /**
356      * Weak bidirectional character type "AN" in the Unicode specification.
357      */
358     enum byte DIRECTIONALITY_ARABIC_NUMBER = 6;
359 
360     /**
361      * Weak bidirectional character type "CS" in the Unicode specification.
362      */
363     enum byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
364 
365     /**
366      * Weak bidirectional character type "NSM" in the Unicode specification.
367      */
368     enum byte DIRECTIONALITY_NONSPACING_MARK = 8;
369 
370     /**
371      * Weak bidirectional character type "BN" in the Unicode specification.
372      */
373     enum byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
374 
375     /**
376      * Neutral bidirectional character type "B" in the Unicode specification.
377      */
378     enum byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
379 
380     /**
381      * Neutral bidirectional character type "S" in the Unicode specification.
382      */
383     enum byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
384 
385     /**
386      * Neutral bidirectional character type "WS" in the Unicode specification.
387      */
388     enum byte DIRECTIONALITY_WHITESPACE = 12;
389 
390     /**
391      * Neutral bidirectional character type "ON" in the Unicode specification.
392      */
393     enum byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
394 
395     /**
396      * Strong bidirectional character type "LRE" in the Unicode specification.
397      */
398     enum byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
399 
400     /**
401      * Strong bidirectional character type "LRO" in the Unicode specification.
402      */
403     enum byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
404 
405     /**
406      * Strong bidirectional character type "RLE" in the Unicode specification.
407      */
408     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
409 
410     /**
411      * Strong bidirectional character type "RLO" in the Unicode specification.
412      */
413     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
414 
415     /**
416      * Weak bidirectional character type "PDF" in the Unicode specification.
417      */
418     enum byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
419 
420     /**
421      * The minimum value of a
422      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
423      * Unicode high-surrogate code unit</a>
424      * in the UTF-16 encoding, constant {@code '\u005CuD800'}.
425      * A high-surrogate is also known as a <i>leading-surrogate</i>.
426      *
427      */
428     enum wchar MIN_HIGH_SURROGATE = 0xD800;
429 
430     /**
431      * The maximum value of a
432      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
433      * Unicode high-surrogate code unit</a>
434      * in the UTF-16 encoding, constant {@code '\u005CuDBFF'}.
435      * A high-surrogate is also known as a <i>leading-surrogate</i>.
436      *
437      */
438     enum wchar MAX_HIGH_SURROGATE = 0xDBFF;
439 
440     /**
441      * The minimum value of a
442      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
443      * Unicode low-surrogate code unit</a>
444      * in the UTF-16 encoding, constant {@code '\u005CuDC00'}.
445      * A low-surrogate is also known as a <i>trailing-surrogate</i>.
446      *
447      */
448     enum wchar MIN_LOW_SURROGATE  = 0xDC00;
449 
450     /**
451      * The maximum value of a
452      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
453      * Unicode low-surrogate code unit</a>
454      * in the UTF-16 encoding, constant {@code '\u005CuDFFF'}.
455      * A low-surrogate is also known as a <i>trailing-surrogate</i>.
456      *
457      */
458     enum wchar MAX_LOW_SURROGATE  = 0xDFFF;
459 
460     /**
461      * The minimum value of a Unicode surrogate code unit in the
462      * UTF-16 encoding, constant {@code '\u005CuD800'}.
463      *
464      */
465     enum wchar MIN_SURROGATE = MIN_HIGH_SURROGATE;
466 
467     /**
468      * The maximum value of a Unicode surrogate code unit in the
469      * UTF-16 encoding, constant {@code '\u005CuDFFF'}.
470      *
471      */
472     enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;
473 
474     /**
475      * The maximum value of a Unicode surrogate code unit in the
476      * UTF-16 encoding, constant {@code '\u005CuDFFF'}.
477      *
478      */
479     // enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;
480 
481     /**
482      * The minimum value of a
483      * <a href="http://www.unicode.org/glossary/#supplementary_code_point">
484      * Unicode supplementary code point</a>, constant {@code U+10000}.
485      *
486      */
487     enum int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;
488 
489     /**
490      * The minimum value of a
491      * <a href="http://www.unicode.org/glossary/#code_point">
492      * Unicode code point</a>, constant {@code U+0000}.
493      *
494      */
495     enum int MIN_CODE_POINT = 0x000000;
496 
497     /**
498      * The maximum value of a
499      * <a href="http://www.unicode.org/glossary/#code_point">
500      * Unicode code point</a>, constant {@code U+10FFFF}.
501      *
502      */
503     enum int MAX_CODE_POINT = 0X10FFFF;
504 
    /**
     * Constructs a {@code Char} for the given value.
     *
     * @param value the {@code char} to wrap; it is handed unchanged to the
     *              base-class ({@code Nullable!char}) constructor.
     */
    this(char value) {
        super(value);
    }
508 
509     /**
510      * Returns a {@code Character} instance representing the specified
511      * {@code char} value.
512      * If a new {@code Character} instance is not required, this method
513      * should generally be used in preference to the constructor
514      * {@link #Character(char)}, as this method is likely to yield
515      * significantly better space and time performance by caching
516      * frequently requested values.
517      *
518      * This method will always cache values in the range {@code
519      * '\u005Cu0000'} to {@code '\u005Cu007F'}, inclusive, and may
520      * cache other values outside of this range.
521      *
522      * @param  c a char value.
523      * @return a {@code Character} instance representing {@code c}.
524      */
525     static Char valueOf(char c) {
526         if (c <= 127) { // must cache
527             return CharacterCache.cache[cast(int)c];
528         }
529         return new Char(c);
530     }
531 
    /**
     * Returns the value of this {@code Char} object.
     *
     * @return  the {@code _value} field as a primitive {@code char}.
     *          NOTE(review): {@code _value} is not declared in this class;
     *          presumably it is inherited from {@code Nullable!char} — confirm.
     */
    char charValue() {
        return _value;
    }
540 
    /**
     * Returns a hash code for this object.
     *
     * @return the wrapped {@code _value}, implicitly widened to {@code size_t}.
     */
    override size_t toHash() @trusted nothrow {
        return _value;
    }
544 
545     /**
546      * Determines the number of {@code char} values needed to
547      * represent the specified character (Unicode code point). If the
548      * specified character is equal to or greater than 0x10000, then
549      * the method returns 2. Otherwise, the method returns 1.
550      *
551      * <p>This method doesn't validate the specified character to be a
552      * valid Unicode code point. The caller must validate the
553      * character value using {@link #isValidCodePoint(int) isValidCodePoint}
554      * if necessary.
555      *
556      * @param   codePoint the character (Unicode code point) to be tested.
557      * @return  2 if the character is a valid supplementary character; 1 otherwise.
558      * @see     Character#isSupplementaryCodePoint(int)
559      */
560     static int charCount(int codePoint) {
561         return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
562     }
563 
564     /**
565      * Converts the specified surrogate pair to its supplementary code
566      * point value. This method does not validate the specified
567      * surrogate pair. The caller must validate it using {@link
568      * #isSurrogatePair(char, char) isSurrogatePair} if necessary.
569      *
570      * @param  high the high-surrogate code unit
571      * @param  low the low-surrogate code unit
572      * @return the supplementary code point composed from the
573      *         specified surrogate pair.
574      */
575     static int toCodePoint(char high, char low) {
576         // Optimized form of:
577         // return ((high - MIN_HIGH_SURROGATE) << 10)
578         //         + (low - MIN_LOW_SURROGATE)
579         //         + MIN_SUPPLEMENTARY_CODE_POINT;
580         return ((high << 10) + low) + (MIN_SUPPLEMENTARY_CODE_POINT
581                                        - (MIN_HIGH_SURROGATE << 10)
582                                        - MIN_LOW_SURROGATE);
583     }
584 
585      /**
586      * Determines if the specified character is an ISO control
587      * character.  A character is considered to be an ISO control
588      * character if its code is in the range {@code '\u005Cu0000'}
589      * through {@code '\u005Cu001F'} or in the range
590      * {@code '\u005Cu007F'} through {@code '\u005Cu009F'}.
591      *
592      * <p><b>Note:</b> This method cannot handle <a
593      * href="#supplementary"> supplementary characters</a>. To support
594      * all Unicode characters, including supplementary characters, use
595      * the {@link #isISOControl(int)} method.
596      *
597      * @param   ch      the character to be tested.
598      * @return  {@code true} if the character is an ISO control character;
599      *          {@code false} otherwise.
600      *
601      * @see     Character#isSpaceChar(char)
602      * @see     Character#isWhitespace(char)
603      */
604     public static bool isISOControl(char ch) {
605         return isISOControl(cast(int)ch);
606     }
607 
608     /**
609      * Determines if the referenced character (Unicode code point) is an ISO control
610      * character.  A character is considered to be an ISO control
611      * character if its code is in the range {@code '\u005Cu0000'}
612      * through {@code '\u005Cu001F'} or in the range
613      * {@code '\u005Cu007F'} through {@code '\u005Cu009F'}.
614      *
615      * @param   codePoint the character (Unicode code point) to be tested.
616      * @return  {@code true} if the character is an ISO control character;
617      *          {@code false} otherwise.
618      * @see     Character#isSpaceChar(int)
619      * @see     Character#isWhitespace(int)
620      */
621     public static bool isISOControl(int codePoint) {
622         // Optimized form of:
623         //     (codePoint >= 0x00 && codePoint <= 0x1F) ||
624         //     (codePoint >= 0x7F && codePoint <= 0x9F);
625         return codePoint <= 0x9F &&
626             (codePoint >= 0x7F || (codePoint >>> 5 == 0));
627     }
628 
629     /**
630      * Converts the specified character (Unicode code point) to its
631      * UTF-16 representation stored in a {@code char} array. If
632      * the specified code point is a BMP (Basic Multilingual Plane or
633      * Plane 0) value, the resulting {@code char} array has
634      * the same value as {@code codePoint}. If the specified code
635      * point is a supplementary code point, the resulting
636      * {@code char} array has the corresponding surrogate pair.
637      *
638      * @param  codePoint a Unicode code point
639      * @return a {@code char} array having
640      *         {@code codePoint}'s UTF-16 representation.
641      * @throws IllegalArgumentException if the specified
642      * {@code codePoint} is not a valid Unicode code point.
643      */
644     public static char[] toChars(int codePoint) {
645         if (isBmpCodePoint(codePoint)) {
646             return [ cast(char) codePoint ];
647         } else if (isValidCodePoint(codePoint)) {
648             char[] result = new char[2];
649             toSurrogates(codePoint, result, 0);
650             return result;
651         } else {
652             import std.string;
653             throw new IllegalArgumentException(
654                 format("Not a valid Unicode code point: 0x%X", codePoint));
655         }
656     }
657 
658     /**
659      * Determines whether the specified character (Unicode code point)
660      * is in the <a href="#BMP">Basic Multilingual Plane (BMP)</a>.
661      * Such code points can be represented using a single {@code char}.
662      *
663      * @param  codePoint the character (Unicode code point) to be tested
664      * @return {@code true} if the specified code point is between
665      *         {@link #MIN_VALUE} and {@link #MAX_VALUE} inclusive;
666      *         {@code false} otherwise.
667      */
668     public static bool isBmpCodePoint(int codePoint) {
669         return codePoint >>> 16 == 0;
670         // Optimized form of:
671         //     codePoint >= MIN_VALUE && codePoint <= MAX_VALUE
672         // We consistently use logical shift (>>>) to facilitate
673         // additional runtime optimizations.
674     }
675 
676     /**
677      * Determines whether the specified code point is a valid
678      * <a href="http://www.unicode.org/glossary/#code_point">
679      * Unicode code point value</a>.
680      *
681      * @param  codePoint the Unicode code point to be tested
682      * @return {@code true} if the specified code point value is between
683      *         {@link #MIN_CODE_POINT} and
684      *         {@link #MAX_CODE_POINT} inclusive;
685      *         {@code false} otherwise.
686      */
687     public static bool isValidCodePoint(int codePoint) {
688         // Optimized form of:
689         //     codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT
690         int plane = codePoint >>> 16;
691         return plane < ((MAX_CODE_POINT + 1) >>> 16);
692     }
693 
694     static void toSurrogates(int codePoint, char[] dst, int index) {
695         // We write elements "backwards" to guarantee all-or-nothing
696         dst[index+1] = lowSurrogate(codePoint);
697         dst[index] = highSurrogate(codePoint);
698     }
699 
700     /**
701      * Returns the trailing surrogate (a
702      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
703      * low surrogate code unit</a>) of the
704      * <a href="http://www.unicode.org/glossary/#surrogate_pair">
705      * surrogate pair</a>
706      * representing the specified supplementary character (Unicode
707      * code point) in the UTF-16 encoding.  If the specified character
708      * is not a
709      * <a href="Character.html#supplementary">supplementary character</a>,
710      * an unspecified {@code char} is returned.
711      *
712      * <p>If
713      * {@link #isSupplementaryCodePoint isSupplementaryCodePoint(x)}
714      * is {@code true}, then
715      * {@link #isLowSurrogate isLowSurrogate}{@code (lowSurrogate(x))} and
716      * {@link #toCodePoint toCodePoint}{@code (}{@link #highSurrogate highSurrogate}{@code (x), lowSurrogate(x)) == x}
717      * are also always {@code true}.
718      *
719      * @param   codePoint a supplementary character (Unicode code point)
720      * @return  the trailing surrogate code unit used to represent the
721      *          character in the UTF-16 encoding
722      */
723     public static char lowSurrogate(int codePoint) {
724         return cast(char) ((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
725     }
726 
727 
728     /**
729      * Returns the leading surrogate (a
730      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
731      * high surrogate code unit</a>) of the
732      * <a href="http://www.unicode.org/glossary/#surrogate_pair">
733      * surrogate pair</a>
734      * representing the specified supplementary character (Unicode
735      * code point) in the UTF-16 encoding.  If the specified character
736      * is not a
737      * <a href="Character.html#supplementary">supplementary character</a>,
738      * an unspecified {@code char} is returned.
739      *
740      * <p>If
741      * {@link #isSupplementaryCodePoint isSupplementaryCodePoint(x)}
742      * is {@code true}, then
743      * {@link #isHighSurrogate isHighSurrogate}{@code (highSurrogate(x))} and
744      * {@link #toCodePoint toCodePoint}{@code (highSurrogate(x), }{@link #lowSurrogate lowSurrogate}{@code (x)) == x}
745      * are also always {@code true}.
746      *
747      * @param   codePoint a supplementary character (Unicode code point)
748      * @return  the leading surrogate code unit used to represent the
749      *          character in the UTF-16 encoding
750      */
751     public static char highSurrogate(int codePoint) {
752         return cast(char) ((codePoint >>> 10)
753             + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
754     }
755 
756     /**
757      * Returns the code point at the given index of the
758      * {@code CharSequence}. If the {@code char} value at
759      * the given index in the {@code CharSequence} is in the
760      * high-surrogate range, the following index is less than the
761      * length of the {@code CharSequence}, and the
762      * {@code char} value at the following index is in the
763      * low-surrogate range, then the supplementary code point
764      * corresponding to this surrogate pair is returned. Otherwise,
765      * the {@code char} value at the given index is returned.
766      *
767      * @param seq a sequence of {@code char} values (Unicode code
768      * units)
769      * @param index the index to the {@code char} values (Unicode
770      * code units) in {@code seq} to be converted
771      * @return the Unicode code point at the given index
772      * @throws NullPointerException if {@code seq} is null.
773      * @throws IndexOutOfBoundsException if the value
774      * {@code index} is negative or not less than
775      * {@link CharSequence#length() seq.length()}.
776      */
777     public static int codePointAt(string seq, int index) {
778         char c1 = seq.charAt(index);
779         if (isHighSurrogate(c1) && ++index < seq.length) {
780             char c2 = seq.charAt(index);
781             if (isLowSurrogate(c2)) {
782                 return toCodePoint(c1, c2);
783             }
784         }
785         return c1;
786     }
787 
788     /**
789      * Determines if the given {@code char} value is a
790      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
791      * Unicode high-surrogate code unit</a>
792      * (also known as <i>leading-surrogate code unit</i>).
793      *
794      * <p>Such values do not represent characters by themselves,
795      * but are used in the representation of
796      * <a href="#supplementary">supplementary characters</a>
797      * in the UTF-16 encoding.
798      *
799      * @param  ch the {@code char} value to be tested.
800      * @return {@code true} if the {@code char} value is between
801      *         {@link #MIN_HIGH_SURROGATE} and
802      *         {@link #MAX_HIGH_SURROGATE} inclusive;
803      *         {@code false} otherwise.
804      * @see    Character#isLowSurrogate(char)
805      * @see    Character.UnicodeBlock#of(int)
806      */
807     public static bool isHighSurrogate(char ch) {
808         // Help VM constant-fold; MAX_HIGH_SURROGATE + 1 == MIN_LOW_SURROGATE
809         return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1);
810     }
811 
812     /**
813      * Determines if the given {@code char} value is a
814      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
815      * Unicode low-surrogate code unit</a>
816      * (also known as <i>trailing-surrogate code unit</i>).
817      *
818      * <p>Such values do not represent characters by themselves,
819      * but are used in the representation of
820      * <a href="#supplementary">supplementary characters</a>
821      * in the UTF-16 encoding.
822      *
823      * @param  ch the {@code char} value to be tested.
824      * @return {@code true} if the {@code char} value is between
825      *         {@link #MIN_LOW_SURROGATE} and
826      *         {@link #MAX_LOW_SURROGATE} inclusive;
827      *         {@code false} otherwise.
828      * @see    Character#isHighSurrogate(char)
829      */
830     public static bool isLowSurrogate(char ch) {
831         return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1);
832     }
833 
834      /**
835      * Determines if the specified character is a letter.
836      * <p>
837      * A character is considered to be a letter if its general
838      * category type, provided by {@code Character.getType(ch)},
839      * is any of the following:
840      * <ul>
841      * <li> {@code UPPERCASE_LETTER}
842      * <li> {@code LOWERCASE_LETTER}
843      * <li> {@code TITLECASE_LETTER}
844      * <li> {@code MODIFIER_LETTER}
845      * <li> {@code OTHER_LETTER}
846      * </ul>
847      *
848      * Not all letters have case. Many characters are
849      * letters but are neither uppercase nor lowercase nor titlecase.
850      *
851      * <p><b>Note:</b> This method cannot handle <a
852      * href="#supplementary"> supplementary characters</a>. To support
853      * all Unicode characters, including supplementary characters, use
854      * the {@link #isLetter(int)} method.
855      *
856      * @param   ch   the character to be tested.
857      * @return  {@code true} if the character is a letter;
858      *          {@code false} otherwise.
859      * @see     Character#isDigit(char)
860      * @see     Character#isJavaIdentifierStart(char)
861      * @see     Character#isJavaLetter(char)
862      * @see     Character#isJavaLetterOrDigit(char)
863      * @see     Character#isLetterOrDigit(char)
864      * @see     Character#isLowerCase(char)
865      * @see     Character#isTitleCase(char)
866      * @see     Character#isUnicodeIdentifierStart(char)
867      * @see     Character#isUpperCase(char)
868      */
869     public static bool isLetter(char ch) {
870         return isLetter(cast(int)ch);
871     }
872 
873     /**
874      * Determines if the specified character (Unicode code point) is a letter.
875      * <p>
876      * A character is considered to be a letter if its general
877      * category type, provided by {@link Character#getType(int) getType(codePoint)},
878      * is any of the following:
879      * <ul>
880      * <li> {@code UPPERCASE_LETTER}
881      * <li> {@code LOWERCASE_LETTER}
882      * <li> {@code TITLECASE_LETTER}
883      * <li> {@code MODIFIER_LETTER}
884      * <li> {@code OTHER_LETTER}
885      * </ul>
886      *
887      * Not all letters have case. Many characters are
888      * letters but are neither uppercase nor lowercase nor titlecase.
889      *
890      * @param   codePoint the character (Unicode code point) to be tested.
891      * @return  {@code true} if the character is a letter;
892      *          {@code false} otherwise.
893      * @see     Character#isDigit(int)
894      * @see     Character#isJavaIdentifierStart(int)
895      * @see     Character#isLetterOrDigit(int)
896      * @see     Character#isLowerCase(int)
897      * @see     Character#isTitleCase(int)
898      * @see     Character#isUnicodeIdentifierStart(int)
899      * @see     Character#isUpperCase(int)
900      */
901     // public static bool isLetter(int codePoint) {
902     //     return ((((1 << Char.UPPERCASE_LETTER) |
903     //         (1 << Char.LOWERCASE_LETTER) |
904     //         (1 << Char.TITLECASE_LETTER) |
905     //         (1 << Char.MODIFIER_LETTER) |
906     //         (1 << Char.OTHER_LETTER)) >> getType(codePoint)) & 1)
907     //         != 0;
908     // }
909 }
910 
/**
 * Holder for the table of preallocated {@code Char} objects covering the
 * values 0 through 127. The table is filled once by the shared module
 * constructor below; the class itself is never instantiated.
 */
private class CharacterCache {
    /// Preallocated instances; element {@code i} wraps {@code cast(char) i}.
    __gshared Char[] cache;

    shared static this() {
        enum size_t entryCount = 127 + 1;
        cache = new Char[entryCount];
        foreach (i, ref slot; cache) {
            slot = new Char(cast(char) i);
        }
    }

    // Not meant to be constructed.
    private this() {
    }
}