hunt.text.StringTokenizer source code

1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.StringTokenizer;
13 
14 
15 import std.string;
16 
17 import hunt.util.Common;
18 import hunt.Char;
19 import hunt.Exceptions;
20 import hunt.text.Common;
21 
/**
 * Breaks a string into tokens separated by delimiter characters.
 *
 * This is a D port of Java's <code>java.util.StringTokenizer</code>. It does
 * not distinguish among identifiers, numbers, and quoted strings, nor does it
 * recognize and skip comments.
 * <p>
 * The set of delimiters (the characters that separate tokens) may be
 * specified either at creation time or on a per-token basis.
 * <p>
 * An instance behaves in one of two ways, depending on the
 * <code>returnDelims</code> flag it was created with:
 * <ul>
 * <li>If the flag is <code>false</code>, delimiter characters serve to
 *     separate tokens. A token is a maximal sequence of consecutive
 *     characters that are not delimiters.
 * <li>If the flag is <code>true</code>, delimiter characters are themselves
 *     considered to be tokens. A token is thus either one delimiter
 *     character, or a maximal sequence of consecutive characters that are
 *     not delimiters.
 * </ul><p>
 * The tokenizer keeps a current position (an index in UTF-8 code units)
 * within the string; <code>nextToken</code> advances it past the characters
 * processed. Every token is a slice of the string that was passed to the
 * constructor.
 * <p>
 * Delimiters are matched per Unicode code point. When every delimiter is
 * ASCII the string is scanned one code unit at a time, which is safe because
 * every code unit of a multi-byte UTF-8 sequence is greater than 0x7F. When
 * at least one delimiter is non-ASCII the string is decoded code point by
 * code point, and malformed UTF-8 in the tokenized string then raises
 * <code>std.utf.UTFException</code>.
 * <p>
 * Example:
 * <blockquote><pre>
 *     auto st = new StringTokenizer("this is a test");
 *     while (st.hasMoreTokens()) {
 *         writeln(st.nextToken());
 *     }
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 * <p>
 * The class also implements <code>opApply</code>, so the same loop can be
 * written as <code>foreach (string token; st) writeln(token);</code>.
 */

class StringTokenizer : Iterable!string {
    /// Index (in code units) where the next scan starts.
    private int currentPosition;
    /// Position pre-computed by hasMoreTokens(), or -1 when none is cached.
    private int newPosition;
    /// Length of str in code units.
    private int maxPosition;
    private string str;
    private string delimiters;
    private bool retDelims;
    /// Set by nextToken(string) so a position cached under the old delimiter set is discarded.
    private bool delimsChanged;

    /**
     * Highest code point found in the delimiter set. Any character above
     * this value cannot be a delimiter, which lets the scanners reject most
     * characters without searching the delimiter set.
     */
    private int maxDelimCodePoint;

    /**
     * True when at least one delimiter lies outside ASCII. In that case the
     * tokenized string has to be decoded code point by code point, because
     * comparing single UTF-8 code units against code points would both miss
     * multi-byte delimiters and match the lead byte of unrelated characters.
     */
    private bool hasNonAsciiDelims = false;

    /**
     * The delimiter set decoded to code points. Consulted by
     * isDelimiter(int) when hasNonAsciiDelims is true.
     */
    private int[] delimiterCodePoints;

    /**
     * Decodes the current delimiter set and refreshes maxDelimCodePoint,
     * hasNonAsciiDelims and delimiterCodePoints accordingly. A null
     * delimiter set leaves all three at their empty values.
     *
     * Throws std.utf.UTFException if the delimiter string is not valid UTF-8.
     */
    private void setMaxDelimCodePoint() {
        import std.utf : decode;

        // Reset everything first: this also runs when nextToken(string)
        // installs a new delimiter set over an old one.
        maxDelimCodePoint = 0;
        hasNonAsciiDelims = false;
        delimiterCodePoints = null;

        if (delimiters is null)
            return;

        int highest = 0;
        size_t index = 0;
        while (index < delimiters.length) {
            // decode() advances index past the code point it returns.
            immutable int codePoint = cast(int) decode(delimiters, index);
            if (codePoint > 0x7F)
                hasNonAsciiDelims = true;
            if (highest < codePoint)
                highest = codePoint;
            delimiterCodePoints ~= codePoint;
        }
        maxDelimCodePoint = highest;
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then
     * the delimiter characters are also returned as tokens, one character
     * (one code point) per token. If the flag is <code>false</code>, the
     * delimiter characters are skipped and only serve as separators
     * between tokens.
     * <p>
     * A <code>null</code> <tt>str</tt> is treated as an empty string (its
     * length is 0). A <code>null</code> <tt>delim</tt> is accepted here, but
     * <code>hasMoreTokens</code>, <code>nextToken</code> and
     * <code>countTokens</code> then throw <tt>NullPointerException</tt>
     * as soon as they need to skip delimiters.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     */
    this(string str, string delim, bool returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = cast(int)str.length;
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     */
    this(string str, string delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set <code>" \t\n\r\f"</code>:
     * the space character, the tab character, the newline character, the
     * carriage-return character, and the form-feed character. Delimiter
     * characters themselves will not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     */
    this(string str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Tests whether the character starting at <code>position</code> in the
     * tokenized string is a delimiter.
     *
     * @param   position   index of the first code unit of the character;
     *                     must be less than maxPosition.
     * @param   width      receives the number of code units the character
     *                     occupies, whether or not it is a delimiter.
     * @return  true if that character belongs to the delimiter set.
     */
    private bool delimiterAt(int position, out int width) {
        if (!hasNonAsciiDelims) {
            // Every delimiter is ASCII, so a code-unit comparison is exact:
            // code units above 0x7F fail the maxDelimCodePoint test.
            immutable char unit = str[position];
            width = 1;
            return (unit <= maxDelimCodePoint) && (delimiters.indexOf(unit) >= 0);
        }

        import std.utf : decode;

        size_t next = position;
        immutable int codePoint = cast(int) decode(str, next);
        width = cast(int)(next - position);
        return (codePoint <= maxDelimCodePoint) && isDelimiter(codePoint);
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * Throws NullPointerException if the delimiter set is null.
     */
    private int skipDelimiters(int startPos) {
        if (delimiters is null)
            throw new NullPointerException();

        // Delimiters are tokens in their own right: nothing to skip.
        if (retDelims)
            return startPos;

        int position = startPos;
        while (position < maxPosition) {
            int width;
            if (!delimiterAt(position, width))
                break;
            position += width;
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * When retDelims is true and a delimiter sits exactly at startPos, the
     * index just past that single delimiter is returned instead, so the
     * delimiter becomes a token of its own.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        int width;

        while (position < maxPosition && !delimiterAt(position, width))
            position += width;

        // The bounds check keeps an exhausted tokenizer from reading past
        // the end of the string.
        if (retDelims && (startPos == position) && (position < maxPosition)) {
            if (delimiterAt(position, width))
                position += width;
        }
        return position;
    }

    /**
     * Linear search of the decoded delimiter set.
     *
     * @return  true if codePoint is one of the current delimiters.
     */
    private bool isDelimiter(int codePoint) {
        foreach (int candidate; delimiterCodePoints) {
            if (candidate == codePoint)
                return true;
        }
        return false;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns <tt>true</tt>, then a subsequent call to
     * <tt>nextToken</tt> with no argument will successfully return a token.
     *
     * @return  <code>true</code> if and only if there is at least one token
     *          in the string after the current position; <code>false</code>
     *          otherwise.
     * @exception NullPointerException if the delimiter set is <CODE>null</CODE>
     */
    bool hasMoreTokens() {
        /*
         * Cache the position reached after skipping delimiters. nextToken()
         * reuses it unless the delimiters are changed by that invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception  NullPointerException if the delimiter set is <CODE>null</CODE>
     */
    string nextToken() {
        /*
         * If the next position was already computed by hasMoreTokens() and
         * the delimiters have NOT changed since then, reuse the computed
         * value; otherwise skip delimiters again from the current position.
         */
        if (newPosition >= 0 && !delimsChanged)
            currentPosition = newPosition;
        else
            currentPosition = skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition)
            throw new NoSuchElementException("");

        immutable int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str[start .. currentPosition];
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <tt>StringTokenizer</tt> object is changed to be the characters in
     * the string <tt>delim</tt>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token. The new delimiter set
     * remains in effect after this call.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception NullPointerException if delim is <CODE>null</CODE>
     */
    string nextToken(string delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Enables <code>foreach</code> over the remaining tokens. Iteration
     * consumes the tokenizer and stops early when the loop body breaks
     * (the delegate returns a non-zero value).
     */
    int opApply(scope int delegate(ref string) dg)
    {
        int result = 0;
        while (hasMoreTokens() && result == 0) {
            string token = nextToken();
            result = dg(token);
        }
        return result;
    }

    /**
     * Calculates the number of times that this tokenizer's
     * <code>nextToken</code> method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @exception NullPointerException if the delimiter set is
     *            <CODE>null</CODE> and the string is not exhausted.
     */
    int countTokens() {
        int count = 0;
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition)
                break;
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}