hunt.text.StringUtils source code

1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.StringUtils;
13 
14 import std.array;
15 import std.ascii;
16 import std.container.array;
17 import std.conv;
18 import std.range;
19 import std.string;
20 import std.uni;
21 
22 import hunt.collection.ArrayTrie;
23 import hunt.collection.Trie;
24 import hunt.text.Common;
25 
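/**
 * Static string helpers: charset-name normalization, splitting,
 * joining into delimited lists, character deletion and JSON escaping.
 */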
28 class StringUtils {
29     private enum string FOLDER_SEPARATOR = "/";
30     private enum string WINDOWS_FOLDER_SEPARATOR = "\\";
31     private enum string TOP_PATH = "..";
32     private enum string CURRENT_PATH = ".";
33     private enum char EXTENSION_SEPARATOR = '.';
34 
35     enum string EMPTY = "";
36     enum string[] EMPTY_STRING_ARRAY = [];
37 
38     enum char[] lowercases = ['\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010',
39             '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025',
40             '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', '\040', '\041', '\042',
41             '\043', '\044', '\045', '\046', '\047', '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
42             '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', '\070', '\071', '\072', '\073', '\074',
43             '\075', '\076', '\077', '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151',
44             '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166',
45             '\167', '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', '\140', '\141', '\142', '\143',
46             '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160',
47             '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\173', '\174', '\175',
48             '\176', '\177'];
49 
50     enum string __ISO_8859_1 = "iso-8859-1";
51     enum string __UTF8 = "utf-8";
52     enum string __UTF16 = "utf-16";
53     
54     // private enum string[string] CHARSETS = ["utf-8":__UTF8, "utf8":__UTF8, 
55     //     "utf-16":__UTF16, "utf-8":__UTF16, 
56     //     "iso-8859-1":__ISO_8859_1, "iso_8859_1":__ISO_8859_1];
57 
58     private __gshared Trie!string CHARSETS;
59 
60     shared static this() {
61         CHARSETS = new ArrayTrie!string(256);
62 
63         CHARSETS.put("utf-8", __UTF8);
64         CHARSETS.put("utf8", __UTF8);
65         CHARSETS.put("utf-16", __UTF16);
66         CHARSETS.put("utf16", __UTF16);
67         CHARSETS.put("iso-8859-1", __ISO_8859_1);
68         CHARSETS.put("iso_8859_1", __ISO_8859_1);
69     }
70 
71     
72     /**
73      * Convert alternate charset names (eg utf8) to normalized name (eg UTF-8).
74      *
75      * @param s the charset to normalize
76      * @return the normalized charset (or null if normalized version not found)
77      */
78     static string normalizeCharset(string s) {
79         string n = CHARSETS.get(s);
80         return (n is null) ? s : n;
81     }
82 
83     /**
84      * Convert alternate charset names (eg utf8) to normalized name (eg UTF-8).
85      *
86      * @param s      the charset to normalize
87      * @param offset the offset in the charset
88      * @param length the length of the charset in the input param
89      * @return the normalized charset (or null if not found)
90      */
91     static string normalizeCharset(string s, int offset, int length) {
92         return normalizeCharset(s[offset .. offset+length]);
93     }
94 
95     static string asciiToLowerCase(string s) {
96         return toLower(s);
97     }
98 
99     static int toInt(string str, int from) {
100         return to!int(str[from..$]);
101     }
102 
103     static byte[] getBytes(string s) {
104         return cast(byte[])s.dup;
105     }
106 
107     static byte[] getBytes(string s, string charset) {
108         return cast(byte[])s.dup;
109     }
110 
111     static string randomId(size_t n = 10, string str = letters) {
112         import std.random : randomSample;
113         import std.utf : byCodeUnit;
114         return str.byCodeUnit.randomSample(n).to!string;
115     }
116 
117     // Splitting
118     // -----------------------------------------------------------------------
119 
120     /**
121      * <p>
122      * Splits the provided text into an array, using whitespace as the
123      * separator. Whitespace is defined by {@link Character#isWhitespace(char)}.
124      * </p>
125      * <p>
126      * <p>
127      * The separator is not included in the returned string array. Adjacent
128      * separators are treated as one separator. For more control over the split
129      * use the StrTokenizer class.
130      * </p>
131      * <p>
132      * <p>
133      * A <code>null</code> input string returns <code>null</code>.
134      * </p>
135      * <p>
136      * <pre>
137      * StringUtils.split(null)       = null
138      * StringUtils.split("")         = []
139      * StringUtils.split("abc def")  = ["abc", "def"]
140      * StringUtils.split("abc  def") = ["abc", "def"]
141      * StringUtils.split(" abc ")    = ["abc"]
142      * </pre>
143      *
144      * @param str the string to parse, may be null
145      * @return an array of parsed Strings, <code>null</code> if null string
146      * input
147      */
148     static string[] split(string str) {
149         return split(str, null, -1);
150     }
151 
152     /**
153      * <p>
154      * Splits the provided text into an array, separators specified. This is an
155      * alternative to using StringTokenizer.
156      * </p>
157      * <p>
158      * <p>
159      * The separator is not included in the returned string array. Adjacent
160      * separators are treated as one separator. For more control over the split
161      * use the StrTokenizer class.
162      * </p>
163      * <p>
164      * <p>
165      * A <code>null</code> input string returns <code>null</code>. A
166      * <code>null</code> separatorChars splits on whitespace.
167      * </p>
168      * <p>
169      * <pre>
170      * StringUtils.split(null, *)         = null
171      * StringUtils.split("", *)           = []
172      * StringUtils.split("abc def", null) = ["abc", "def"]
173      * StringUtils.split("abc def", " ")  = ["abc", "def"]
174      * StringUtils.split("abc  def", " ") = ["abc", "def"]
175      * StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
176      * </pre>
177      *
178      * @param str            the string to parse, may be null
179      * @param separatorChars the characters used as the delimiters, <code>null</code>
180      *                       splits on whitespace
181      * @return an array of parsed Strings, <code>null</code> if null string
182      * input
183      */
184     static string[] split(string str, string separatorChars) {
185         return splitWorker(str, separatorChars, -1, false);
186     }
187 
188     /**
189      * <p>
190      * Splits the provided text into an array, separator specified. This is an
191      * alternative to using StringTokenizer.
192      * </p>
193      * <p>
194      * <p>
195      * The separator is not included in the returned string array. Adjacent
196      * separators are treated as one separator. For more control over the split
197      * use the StrTokenizer class.
198      * </p>
199      * <p>
200      * <p>
201      * A <code>null</code> input string returns <code>null</code>.
202      * </p>
203      * <p>
204      * <pre>
205      * StringUtils.split(null, *)         = null
206      * StringUtils.split("", *)           = []
207      * StringUtils.split("a.b.c", '.')    = ["a", "b", "c"]
208      * StringUtils.split("a..b.c", '.')   = ["a", "b", "c"]
209      * StringUtils.split("a:b:c", '.')    = ["a:b:c"]
210      * StringUtils.split("a b c", ' ')    = ["a", "b", "c"]
211      * </pre>
212      *
213      * @param str           the string to parse, may be null
214      * @param separatorChar the character used as the delimiter
215      * @return an array of parsed Strings, <code>null</code> if null string
216      * input
217      */
218     static string[] split(string str, char separatorChar) {
219         return splitWorker(str, separatorChar, false);
220     }
221 
222     /**
223      * <p>
224      * Splits the provided text into an array with a maximum length, separators
225      * specified.
226      * </p>
227      * <p>
228      * <p>
229      * The separator is not included in the returned string array. Adjacent
230      * separators are treated as one separator.
231      * </p>
232      * <p>
233      * <p>
234      * A <code>null</code> input string returns <code>null</code>. A
235      * <code>null</code> separatorChars splits on whitespace.
236      * </p>
237      * <p>
238      * <p>
239      * If more than <code>max</code> delimited substrings are found, the last
240      * returned string includes all characters after the first
241      * <code>max - 1</code> returned strings (including separator characters).
242      * </p>
243      * <p>
244      * <pre>
245      * StringUtils.split(null, *, *)            = null
246      * StringUtils.split("", *, *)              = []
247      * StringUtils.split("ab de fg", null, 0)   = ["ab", "cd", "ef"]
248      * StringUtils.split("ab   de fg", null, 0) = ["ab", "cd", "ef"]
249      * StringUtils.split("ab:cd:ef", ":", 0)    = ["ab", "cd", "ef"]
250      * StringUtils.split("ab:cd:ef", ":", 2)    = ["ab", "cd:ef"]
251      * </pre>
252      *
253      * @param str            the string to parse, may be null
254      * @param separatorChars the characters used as the delimiters, <code>null</code>
255      *                       splits on whitespace
256      * @param max            the maximum number of elements to include in the array. A zero
257      *                       or negative value implies no limit
258      * @return an array of parsed Strings, <code>null</code> if null string
259      * input
260      */
261     static string[] split(string str, string separatorChars, int max) {
262         return splitWorker(str, separatorChars, max, false);
263     }
264 
265     /**
266      * Performs the logic for the <code>split</code> and
267      * <code>splitPreserveAllTokens</code> methods that return a maximum array
268      * length.
269      *
270      * @param str               the string to parse, may be <code>null</code>
271      * @param separatorChars    the separate character
272      * @param max               the maximum number of elements to include in the array. A zero
273      *                          or negative value implies no limit.
274      * @param preserveAllTokens if <code>true</code>, adjacent separators are treated as empty
275      *                          token separators; if <code>false</code>, adjacent separators
276      *                          are treated as one separator.
277      * @return an array of parsed Strings, <code>null</code> if null string
278      * input
279      */
280     private static string[] splitWorker(string str, string separatorChars, int max, bool preserveAllTokens) {
281         // Performance tuned for 2.0 (JDK1.4)
282         // Direct code is quicker than StringTokenizer.
283         // Also, StringTokenizer uses isSpace() not isWhitespace()
284 
285         if (str is null) {
286             return null;
287         }
288         int len = cast(int)str.length;
289         if (len == 0) {
290             return EMPTY_STRING_ARRAY;
291         }
292 
293         string[] list; // = new ArrayList!(string)();
294         int sizePlus1 = 1;
295         int i = 0, start = 0;
296         bool match = false;
297         bool lastMatch = false;
298         if (separatorChars is null) {
299             // Null separator means use whitespace
300             while (i < len) {                
301                 if (std.ascii.isWhite(str[i])) {
302                     if (match || preserveAllTokens) {
303                         lastMatch = true;
304                         if (sizePlus1++ == max) {
305                             i = len;
306                             lastMatch = false;
307                         }
308                         list ~= (str.substring(start, i));
309                         match = false;
310                     }
311                     start = ++i;
312                     continue;
313                 }
314                 lastMatch = false;
315                 match = true;
316                 i++;
317             }
318         } else if (separatorChars.length == 1) {
319             // Optimise 1 character case
320             char sep = separatorChars[0];
321             while (i < len) {
322                 if (str[i] == sep) {
323                     if (match || preserveAllTokens) {
324                         lastMatch = true;
325                         if (sizePlus1++ == max) {
326                             i = len;
327                             lastMatch = false;
328                         }
329                         list  ~= (str.substring(start, i));
330                         match = false;
331                     }
332                     start = ++i;
333                     continue;
334                 }
335                 lastMatch = false;
336                 match = true;
337                 i++;
338             }
339         } else {
340             // standard case
341             while (i < len) {
342                 if (separatorChars.indexOf(str[i]) >= 0) {
343                     if (match || preserveAllTokens) {
344                         lastMatch = true;
345                         if (sizePlus1++ == max) {
346                             i = len;
347                             lastMatch = false;
348                         }
349                         list ~= (str.substring(start, i));
350                         match = false;
351                     }
352                     start = ++i;
353                     continue;
354                 }
355                 lastMatch = false;
356                 match = true;
357                 i++;
358             }
359         }
360         if (match || (preserveAllTokens && lastMatch)) {
361             list ~= (str.substring(start, i));
362         }
363         return list; //.toArray(EMPTY_STRING_ARRAY);
364     }
365 
366     /**
367      * Performs the logic for the <code>split</code> and
368      * <code>splitPreserveAllTokens</code> methods that do not return a maximum
369      * array length.
370      *
371      * @param str               the string to parse, may be <code>null</code>
372      * @param separatorChar     the separate character
373      * @param preserveAllTokens if <code>true</code>, adjacent separators are treated as empty
374      *                          token separators; if <code>false</code>, adjacent separators
375      *                          are treated as one separator.
376      * @return an array of parsed Strings, <code>null</code> if null string
377      * input
378      */
379     private static string[] splitWorker(string str, char separatorChar, bool preserveAllTokens) {
380         // Performance tuned for 2.0 (JDK1.4)
381 
382         if (str is null) {
383             return null;
384         }
385         int len = cast(int)str.length;
386         if (len == 0) {
387             return EMPTY_STRING_ARRAY;
388         }
389         string[] list; // = new ArrayList!(string)();
390         int i = 0, start = 0;
391         bool match = false;
392         bool lastMatch = false;
393         while (i < len) {
394             if (str[i] == separatorChar) {
395                 if (match || preserveAllTokens) {
396                     list ~= (str.substring(start, i));
397                     match = false;
398                     lastMatch = true;
399                 }
400                 start = ++i;
401                 continue;
402             }
403             lastMatch = false;
404             match = true;
405             i++;
406         }
407         if (match || (preserveAllTokens && lastMatch)) {
408             list ~= (str.substring(start, i));
409         }
410         return list;
411     }
412 
413 
414 
415 	/**
416 	 * Copy the given Enumeration into a {@code string} array.
417 	 * The Enumeration must contain {@code string} elements only.
418 	 * @param enumeration the Enumeration to copy
419 	 * @return the {@code string} array
420 	 */
421 	static string[] toStringArray(InputRange!string range) {
422         // Array!string buffer;
423         // foreach(string s; range) {
424         //     buffer.insertBack(s);
425         // }
426 		return range.array;
427 	}
428 
429 
430 	/**
431 	 * Convert a {@code string} array into a delimited {@code string} (e.g. CSV).
432 	 * <p>Useful for {@code toString()} implementations.
433 	 * @param arr the array to display (potentially {@code null} or empty)
434 	 * @param delim the delimiter to use (typically a ",")
435 	 * @return the delimited {@code string}
436 	 */
437 	static string toDelimitedString(string[] arr, string delim) {
438 		if (arr.length == 0) {
439 			return "";
440 		}
441 		if (arr.length == 1) {
442 			return arr[0];
443 		}
444 
445         Appender!string sb;
446 		for (size_t i = 0; i < arr.length; i++) {
447 			if (i > 0) {
448 				sb.put(delim);
449 			}
450 			sb.put(arr[i]);
451 		}
452 		return sb.data;
453 	}
454 
455 
456 	/**
457 	 * Convert a {@code string} array into a comma delimited {@code string}
458 	 * (i.e., CSV).
459 	 * <p>Useful for {@code toString()} implementations.
460 	 * @param arr the array to display (potentially {@code null} or empty)
461 	 * @return the delimited {@code string}
462 	 */
463 	static string toCommaDelimitedString(string[] arr) {
464 		return toDelimitedString(arr, ",");
465 	}
466 
467 	static string toDelimitedString(Object[] arr, string delim) {
468 		if (arr.length == 0) {
469 			return "";
470 		}
471 		if (arr.length == 1) {
472 			return arr[0].toString();
473 		}
474 
475         Appender!string sb;
476 		for (size_t i = 0; i < arr.length; i++) {
477 			if (i > 0) {
478 				sb.put(delim);
479 			}
480 			sb.put(arr[i].toString());
481 		}
482 		return sb.data;
483     }
484 
485 	static string toCommaDelimitedString(Object[] arr) {
486 		return toDelimitedString(arr, ",");
487 	}
488 
489 	/**
490 	 * Convert a comma delimited list (e.g., a row from a CSV file) into an
491 	 * array of strings.
492 	 * @param str the input {@code string} (potentially {@code null} or empty)
493 	 * @return an array of strings, or the empty array in case of empty input
494 	 */
495 	static string[] commaDelimitedListToStringArray(string str) {
496 		return delimitedListToStringArray(str, ",");
497 	}
498 
499 
500 	/**
501 	 * Take a {@code string} that is a delimited list and convert it into a
502 	 * {@code string} array.
503 	 * <p>A single {@code delimiter} may consist of more than one character,
504 	 * but it will still be considered as a single delimiter string, rather
505 	 * than as bunch of potential delimiter characters, in contrast to
506 	 * {@link #tokenizeToStringArray}.
507 	 * @param str the input {@code string} (potentially {@code null} or empty)
508 	 * @param delimiter the delimiter between elements (this is a single delimiter,
509 	 * rather than a bunch individual delimiter characters)
510 	 * @return an array of the tokens in the list
511 	 * @see #tokenizeToStringArray
512 	 */
513 	static string[] delimitedListToStringArray(string str, string delimiter) {
514 		return delimitedListToStringArray(str, delimiter, null);
515 	}
516 
517 	/**
518 	 * Take a {@code string} that is a delimited list and convert it into
519 	 * a {@code string} array.
520 	 * <p>A single {@code delimiter} may consist of more than one character,
521 	 * but it will still be considered as a single delimiter string, rather
522 	 * than as bunch of potential delimiter characters, in contrast to
523 	 * {@link #tokenizeToStringArray}.
524 	 * @param str the input {@code string} (potentially {@code null} or empty)
525 	 * @param delimiter the delimiter between elements (this is a single delimiter,
526 	 * rather than a bunch individual delimiter characters)
527 	 * @param charsToDelete a set of characters to delete; useful for deleting unwanted
528 	 * line breaks: e.g. "\r\n\f" will delete all new lines and line feeds in a {@code string}
529 	 * @return an array of the tokens in the list
530 	 * @see #tokenizeToStringArray
531 	 */
532 	static string[] delimitedListToStringArray(string str, 
533         string delimiter, string charsToDelete) {
534 
535 		if (str.empty()) {
536 			return [];
537 		}
538 		if (delimiter is null) {
539 			return [str];
540 		}
541 
542 		Array!string result;
543 		if ("" == delimiter) {
544 			for (size_t i = 0; i < str.length; i++) {
545 				result.insertBack(deleteAny(str[i .. i + 1], charsToDelete));
546 			}
547 		}
548 		else {
549 			size_t pos = 0;
550 			ptrdiff_t delPos;
551 			while ((delPos = str.indexOf(delimiter, pos)) != -1) {
552 				result.insertBack(deleteAny(str[pos .. delPos], charsToDelete));
553 				pos = delPos + delimiter.length;
554 			}
555 			if (str.length > 0 && pos <= str.length) {
556 				// Add rest of string, but not in case of empty input.
557 				result.insertBack(deleteAny(str[pos .. $], charsToDelete));
558 			}
559 		}
560 		return result.array;
561 	}
562 
563 
564 	/**
565 	 * Delete any character in a given {@code string}.
566 	 * @param inString the original {@code string}
567 	 * @param charsToDelete a set of characters to delete.
568 	 * E.g. "az\n" will delete 'a's, 'z's and new lines.
569 	 * @return the resulting {@code string}
570 	 */
571 	static string deleteAny(string inString, string charsToDelete) {
572 		if (inString.empty() || charsToDelete.empty()) {
573 			return inString;
574 		}
575 
576         Appender!string sb;
577 		for (size_t i = 0; i < inString.length; i++) {
578 			char c = inString[i];
579 			if (charsToDelete.indexOf(c) == -1) {
580 				sb.put(c);
581 			}
582 		}
583 		return sb.data;
584 	}
585 
586     /**
587      * Escape a json value string
588      * 
589      * Examples:
590      *  The "abc\n123" will be escaped as "abc\\n123"
591      * 
592      * See_also:
593      *  https://tools.ietf.org/html/rfc7159#page-8
594      *  https://stackoverflow.com/questions/3020094/how-should-i-escape-strings-in-json?noredirect=1&lq=1
595      */
596     string escapeJson(string value) {
597         if (value.empty) {
598             return "";
599         }
600 
601         char c = 0;
602         size_t i;
603         size_t len = value.length;
604         Appender!string sb;
605         string t;
606 
607         // sb.put('"');
608         for (i = 0; i < len; i += 1) {
609             c = value[i];
610             switch (c) {
611             case '\\':
612             case '"':
613                 sb.put('\\');
614                 sb.put(c);
615                 break;
616             case '/':
617                 //                if (b == '<') {
618                 sb.put('\\');
619                 //                }
620                 sb.put(c);
621                 break;
622             case '\b':
623                 sb.put("\\b");
624                 break;
625             case '\t':
626                 sb.put("\\t");
627                 break;
628             case '\n':
629                 sb.put("\\n");
630                 break;
631             case '\f':
632                 sb.put("\\f");
633                 break;
634             case '\r':
635                 sb.put("\\r");
636                 break;
637             default:
638                 if (c < ' ') {
639                     t = "000" ~ format("%X", c);
640                     sb.put("\\u" ~ t[$ - 4 .. $]);
641                 } else {
642                     sb.put(c);
643                 }
644             }
645         }
646         //  sb.put('"');
647         return sb.data();
648     }    
649 
650 }