/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.text.StringUtils;

import std.array;
import std.ascii;
import std.container.array;
import std.conv;
import std.range;
import std.string;
import std.uni;

import hunt.collection.ArrayTrie;
import hunt.collection.Trie;
import hunt.text.Common;

/**
 * Collection of static string helpers: charset-name normalization, splitting,
 * joining, delimiter-based tokenizing, character deletion and JSON escaping.
 *
 * All members are static; the class is used purely as a namespace.
 */
class StringUtils {
    private enum string FOLDER_SEPARATOR = "/";
    private enum string WINDOWS_FOLDER_SEPARATOR = "\\";
    private enum string TOP_PATH = "..";
    private enum string CURRENT_PATH = ".";
    private enum char EXTENSION_SEPARATOR = '.';

    enum string EMPTY = "";
    enum string[] EMPTY_STRING_ARRAY = [];

    /**
     * Lookup table over the 128 ASCII code points: entries for 'A'..'Z'
     * (octal 101..132) hold the matching lower-case letter (octal 141..172),
     * every other entry maps to itself.
     */
    enum char[] lowercases = ['\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010',
            '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025',
            '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', '\040', '\041', '\042',
            '\043', '\044', '\045', '\046', '\047', '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
            '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', '\070', '\071', '\072', '\073', '\074',
            '\075', '\076', '\077', '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151',
            '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166',
            '\167', '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', '\140', '\141', '\142', '\143',
            '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160',
            '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\173', '\174', '\175',
            '\176', '\177'];

    enum string __ISO_8859_1 = "iso-8859-1";
    enum string __UTF8 = "utf-8";
    enum string __UTF16 = "utf-16";

    /// Maps accepted charset spellings to their normalized name; filled once at startup.
    private __gshared Trie!string CHARSETS;

    shared static this() {
        CHARSETS = new ArrayTrie!string(256);

        CHARSETS.put("utf-8", __UTF8);
        CHARSETS.put("utf8", __UTF8);
        CHARSETS.put("utf-16", __UTF16);
        CHARSETS.put("utf16", __UTF16);
        CHARSETS.put("iso-8859-1", __ISO_8859_1);
        CHARSETS.put("iso_8859_1", __ISO_8859_1);
    }

    /**
     * Convert alternate charset names (eg utf8) to the normalized name (eg utf-8).
     *
     * @param s the charset to normalize
     * @return the normalized charset, or {@code s} itself (unchanged) when no
     *         normalized version is registered
     */
    static string normalizeCharset(string s) {
        string n = CHARSETS.get(s);
        return (n is null) ? s : n;
    }

    /**
     * Convert alternate charset names (eg utf8) to the normalized name (eg utf-8),
     * looking only at a slice of the input.
     *
     * @param s the string containing the charset name
     * @param offset the offset of the charset name inside {@code s}
     * @param length the length of the charset name inside {@code s}
     * @return the normalized charset, or the selected slice itself when no
     *         normalized version is registered
     */
    static string normalizeCharset(string s, int offset, int length) {
        return normalizeCharset(s[offset .. offset + length]);
    }

    /**
     * Lower-case a string.
     *
     * NOTE(review): this calls the string overload of {@code toLower} pulled in
     * by the module imports, which is presumably the Unicode-aware one rather
     * than an ASCII-only conversion - confirm callers do not rely on non-ASCII
     * characters being left untouched.
     *
     * @param s the string to convert
     * @return the lower-cased string
     */
    static string asciiToLowerCase(string s) {
        return toLower(s);
    }

    /**
     * Parse the tail of a string as an int.
     *
     * @param str the string holding the number
     * @param from index of the first character to parse; everything from here
     *        to the end of {@code str} is handed to {@code to!int}
     * @return the parsed value
     */
    static int toInt(string str, int from) {
        return to!int(str[from .. $]);
    }

    /**
     * Return a mutable copy of the string's code units as bytes.
     *
     * @param s the source string
     * @return a freshly allocated byte array
     */
    static byte[] getBytes(string s) {
        return cast(byte[]) s.dup;
    }

    /**
     * Return a mutable copy of the string's code units as bytes.
     *
     * The {@code charset} argument is currently ignored: no transcoding is
     * performed and the result is identical to {@code getBytes(s)}.
     *
     * @param s the source string
     * @param charset unused
     * @return a freshly allocated byte array
     */
    static byte[] getBytes(string s, string charset) {
        return cast(byte[]) s.dup;
    }

    /**
     * Build a random identifier of {@code n} characters drawn from {@code str}.
     *
     * Every position is drawn independently and uniformly from the alphabet, so
     * characters may repeat and {@code n} may exceed the alphabet size. (The
     * previous implementation used {@code randomSample}, which picks without
     * replacement and keeps the source order, so every id was a sorted,
     * duplicate-free subsequence of the alphabet.)
     *
     * The alphabet is indexed by code unit, so it should contain ASCII
     * characters only.
     *
     * @param n the number of characters to generate
     * @param str the alphabet to draw from; must not be empty when {@code n > 0}
     * @return the generated identifier, or "" when {@code n} is 0
     * @throws Exception if {@code n > 0} and the alphabet is empty
     */
    static string randomId(size_t n = 10, string str = letters) {
        import std.exception : assumeUnique, enforce;
        import std.random : uniform;

        if (n == 0) {
            return "";
        }
        enforce(str.length > 0, "randomId: the alphabet must not be empty");

        char[] id = new char[n];
        foreach (ref c; id) {
            c = str[uniform(0, str.length)];
        }
        // id is not referenced anywhere else, so handing it out as immutable is safe.
        return assumeUnique(id);
    }

    // Splitting
    // -----------------------------------------------------------------------

    /**
     * Splits the provided text into an array, using ASCII whitespace as the
     * separator.
     *
     * The separator is not included in the returned array. Adjacent separators
     * are treated as one separator. A <code>null</code> input string returns
     * <code>null</code>.
     *
     * <pre>
     * StringUtils.split(null)       = null
     * StringUtils.split("")         = []
     * StringUtils.split("abc def")  = ["abc", "def"]
     * StringUtils.split("abc  def") = ["abc", "def"]
     * StringUtils.split(" abc ")    = ["abc"]
     * </pre>
     *
     * @param str the string to parse, may be null
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    static string[] split(string str) {
        return split(str, null, -1);
    }

    /**
     * Splits the provided text into an array, separators specified.
     *
     * The separator is not included in the returned array. Adjacent separators
     * are treated as one separator. A <code>null</code> input string returns
     * <code>null</code>. A <code>null</code> separatorChars splits on whitespace.
     *
     * <pre>
     * StringUtils.split(null, *)         = null
     * StringUtils.split("", *)           = []
     * StringUtils.split("abc def", null) = ["abc", "def"]
     * StringUtils.split("abc def", " ")  = ["abc", "def"]
     * StringUtils.split("abc  def", " ") = ["abc", "def"]
     * StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
     * </pre>
     *
     * @param str the string to parse, may be null
     * @param separatorChars the characters used as the delimiters,
     *        <code>null</code> splits on whitespace
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    static string[] split(string str, string separatorChars) {
        return splitWorker(str, separatorChars, -1, false);
    }

    /**
     * Splits the provided text into an array, single separator character
     * specified.
     *
     * The separator is not included in the returned array. Adjacent separators
     * are treated as one separator. A <code>null</code> input string returns
     * <code>null</code>.
     *
     * <pre>
     * StringUtils.split(null, *)         = null
     * StringUtils.split("", *)           = []
     * StringUtils.split("a.b.c", '.')    = ["a", "b", "c"]
     * StringUtils.split("a..b.c", '.')   = ["a", "b", "c"]
     * StringUtils.split("a:b:c", '.')    = ["a:b:c"]
     * StringUtils.split("a b c", ' ')    = ["a", "b", "c"]
     * </pre>
     *
     * @param str the string to parse, may be null
     * @param separatorChar the character used as the delimiter
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    static string[] split(string str, char separatorChar) {
        return splitWorker(str, separatorChar, false);
    }

    /**
     * Splits the provided text into an array with a maximum length, separators
     * specified.
     *
     * The separator is not included in the returned array. Adjacent separators
     * are treated as one separator. A <code>null</code> input string returns
     * <code>null</code>. A <code>null</code> separatorChars splits on whitespace.
     *
     * If more than <code>max</code> delimited substrings are found, the last
     * returned string includes all characters after the first
     * <code>max - 1</code> returned strings (including separator characters).
     *
     * <pre>
     * StringUtils.split(null, *, *)            = null
     * StringUtils.split("", *, *)              = []
     * StringUtils.split("ab cd ef", null, 0)   = ["ab", "cd", "ef"]
     * StringUtils.split("ab   cd ef", null, 0) = ["ab", "cd", "ef"]
     * StringUtils.split("ab:cd:ef", ":", 0)    = ["ab", "cd", "ef"]
     * StringUtils.split("ab:cd:ef", ":", 2)    = ["ab", "cd:ef"]
     * </pre>
     *
     * @param str the string to parse, may be null
     * @param separatorChars the characters used as the delimiters,
     *        <code>null</code> splits on whitespace
     * @param max the maximum number of elements to include in the array. A zero
     *        or negative value implies no limit
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    static string[] split(string str, string separatorChars, int max) {
        return splitWorker(str, separatorChars, max, false);
    }

    /**
     * Performs the logic for the <code>split</code> methods that accept a set
     * of separator characters and a maximum array length.
     *
     * @param str the string to parse, may be <code>null</code>
     * @param separatorChars the separator characters; <code>null</code> means
     *        ASCII whitespace
     * @param max the maximum number of elements to include in the array. A zero
     *        or negative value implies no limit.
     * @param preserveAllTokens if <code>true</code>, adjacent separators are
     *        treated as empty token separators; if <code>false</code>, adjacent
     *        separators are treated as one separator.
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    private static string[] splitWorker(string str, string separatorChars, int max, bool preserveAllTokens) {
        if (str is null) {
            return null;
        }
        int len = cast(int) str.length;
        if (len == 0) {
            return EMPTY_STRING_ARRAY;
        }

        // The only thing that varies with separatorChars is the separator test.
        bool isSeparator(char c) {
            if (separatorChars is null) {
                // Null separator means use whitespace.
                return std.ascii.isWhite(c);
            }
            if (separatorChars.length == 1) {
                // Cheap path for the common single-character case.
                return c == separatorChars[0];
            }
            return separatorChars.indexOf(c) >= 0;
        }

        string[] list;
        int sizePlus1 = 1; // number of tokens emitted so far, plus one
        int i = 0, start = 0;
        bool match = false; // a non-separator character was seen since the last token
        bool lastMatch = false; // the previous character closed a token
        while (i < len) {
            if (isSeparator(str[i])) {
                if (match || preserveAllTokens) {
                    lastMatch = true;
                    if (sizePlus1++ == max) {
                        // Limit reached: the final token swallows the rest of the input.
                        i = len;
                        lastMatch = false;
                    }
                    list ~= (str.substring(start, i));
                    match = false;
                }
                start = ++i;
                continue;
            }
            lastMatch = false;
            match = true;
            i++;
        }
        if (match || (preserveAllTokens && lastMatch)) {
            list ~= (str.substring(start, i));
        }
        return list;
    }

    /**
     * Performs the logic for the <code>split</code> methods that take a single
     * separator character and no maximum array length.
     *
     * @param str the string to parse, may be <code>null</code>
     * @param separatorChar the separator character
     * @param preserveAllTokens if <code>true</code>, adjacent separators are
     *        treated as empty token separators; if <code>false</code>, adjacent
     *        separators are treated as one separator.
     * @return an array of parsed strings, <code>null</code> if null string input
     */
    private static string[] splitWorker(string str, char separatorChar, bool preserveAllTokens) {
        if (str is null) {
            return null;
        }
        int len = cast(int) str.length;
        if (len == 0) {
            return EMPTY_STRING_ARRAY;
        }
        string[] list;
        int i = 0, start = 0;
        bool match = false; // a non-separator character was seen since the last token
        bool lastMatch = false; // the previous character closed a token
        while (i < len) {
            if (str[i] == separatorChar) {
                if (match || preserveAllTokens) {
                    list ~= (str.substring(start, i));
                    match = false;
                    lastMatch = true;
                }
                start = ++i;
                continue;
            }
            lastMatch = false;
            match = true;
            i++;
        }
        if (match || (preserveAllTokens && lastMatch)) {
            list ~= (str.substring(start, i));
        }
        return list;
    }

    /**
     * Copy the given input range into a {@code string} array.
     *
     * @param range the range to drain
     * @return the {@code string} array
     */
    static string[] toStringArray(InputRange!string range) {
        return range.array;
    }

    /**
     * Convert a {@code string} array into a delimited {@code string} (e.g. CSV).
     * <p>Useful for {@code toString()} implementations.
     *
     * @param arr the array to display (potentially {@code null} or empty)
     * @param delim the delimiter to use (typically a ",")
     * @return the delimited {@code string}; "" for an empty array
     */
    static string toDelimitedString(string[] arr, string delim) {
        return arr.join(delim);
    }

    /**
     * Convert a {@code string} array into a comma delimited {@code string}
     * (i.e., CSV).
     * <p>Useful for {@code toString()} implementations.
     *
     * @param arr the array to display (potentially {@code null} or empty)
     * @return the delimited {@code string}
     */
    static string toCommaDelimitedString(string[] arr) {
        return toDelimitedString(arr, ",");
    }

    /**
     * Convert an {@code Object} array into a delimited {@code string}, using
     * each element's {@code toString()}.
     *
     * @param arr the array to display (potentially {@code null} or empty)
     * @param delim the delimiter to use (typically a ",")
     * @return the delimited {@code string}; "" for an empty array. A null
     *         element is rendered as "null" instead of being dereferenced.
     */
    static string toDelimitedString(Object[] arr, string delim) {
        if (arr.length == 0) {
            return "";
        }

        Appender!string sb;
        foreach (i, item; arr) {
            if (i > 0) {
                sb.put(delim);
            }
            sb.put(item is null ? "null" : item.toString());
        }
        return sb.data;
    }

    /**
     * Convert an {@code Object} array into a comma delimited {@code string}.
     *
     * @param arr the array to display (potentially {@code null} or empty)
     * @return the delimited {@code string}
     */
    static string toCommaDelimitedString(Object[] arr) {
        return toDelimitedString(arr, ",");
    }

    /**
     * Convert a comma delimited list (e.g., a row from a CSV file) into an
     * array of strings.
     *
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @return an array of strings, or the empty array in case of empty input
     */
    static string[] commaDelimitedListToStringArray(string str) {
        return delimitedListToStringArray(str, ",");
    }

    /**
     * Take a {@code string} that is a delimited list and convert it into a
     * {@code string} array.
     * <p>A single {@code delimiter} may consist of more than one character,
     * but it will still be considered as a single delimiter string, rather
     * than as a bunch of potential delimiter characters.
     *
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @param delimiter the delimiter between elements (this is a single delimiter,
     *        rather than a bunch of individual delimiter characters)
     * @return an array of the tokens in the list
     */
    static string[] delimitedListToStringArray(string str, string delimiter) {
        return delimitedListToStringArray(str, delimiter, null);
    }

    /**
     * Take a {@code string} that is a delimited list and convert it into
     * a {@code string} array.
     * <p>A single {@code delimiter} may consist of more than one character,
     * but it will still be considered as a single delimiter string, rather
     * than as a bunch of potential delimiter characters.
     * <p>Empty tokens are kept: leading, trailing and adjacent delimiters each
     * produce an empty string in the result.
     *
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @param delimiter the delimiter between elements; {@code null} returns the
     *        input as a single token, "" splits the input into single code units
     * @param charsToDelete a set of characters to delete from every token;
     *        useful for deleting unwanted line breaks: e.g. "\r\n\f" will
     *        delete all new lines and line feeds in a {@code string}
     * @return an array of the tokens in the list; the empty array for empty input
     */
    static string[] delimitedListToStringArray(string str,
            string delimiter, string charsToDelete) {

        if (str.empty()) {
            return [];
        }
        if (delimiter is null) {
            return [str];
        }

        string[] result;
        if (delimiter.length == 0) {
            // Empty delimiter: every code unit becomes its own token.
            foreach (i; 0 .. str.length) {
                result ~= deleteAny(str[i .. i + 1], charsToDelete);
            }
        } else {
            size_t pos = 0;
            ptrdiff_t delPos;
            while ((delPos = str.indexOf(delimiter, pos)) != -1) {
                result ~= deleteAny(str[pos .. delPos], charsToDelete);
                pos = delPos + delimiter.length;
            }
            // Remainder after the last delimiter (possibly empty); str is known
            // to be non-empty here and pos never exceeds its length.
            result ~= deleteAny(str[pos .. $], charsToDelete);
        }
        return result;
    }

    /**
     * Delete any character in a given {@code string}.
     *
     * @param inString the original {@code string}
     * @param charsToDelete a set of characters to delete.
     *        E.g. "az\n" will delete 'a's, 'z's and new lines.
     * @return the resulting {@code string}; {@code inString} itself when either
     *         argument is empty
     */
    static string deleteAny(string inString, string charsToDelete) {
        if (inString.empty() || charsToDelete.empty()) {
            return inString;
        }

        Appender!string sb;
        foreach (char c; inString) {
            if (charsToDelete.indexOf(c) == -1) {
                sb.put(c);
            }
        }
        return sb.data;
    }

    /**
     * Escape a json value string. The result does not include surrounding
     * quotes.
     *
     * Backslash, double quote and forward slash are prefixed with a backslash;
     * backspace, tab, newline, form feed and carriage return use their short
     * escapes; any other character below the space character is written as a
     * four-digit upper-case \u escape. Everything else is copied unchanged.
     *
     * Examples:
     *  The "abc\n123" will be escaped as "abc\\n123"
     *
     * See_also:
     *  https://tools.ietf.org/html/rfc7159#page-8
     *  https://stackoverflow.com/questions/3020094/how-should-i-escape-strings-in-json?noredirect=1&lq=1
     *
     * @param value the raw string, may be null or empty
     * @return the escaped string; "" for null or empty input
     */
    static string escapeJson(string value) {
        if (value.empty) {
            return "";
        }

        Appender!string sb;
        foreach (char c; value) {
            switch (c) {
            case '\\':
            case '"':
            case '/':
                sb.put('\\');
                sb.put(c);
                break;
            case '\b':
                sb.put("\\b");
                break;
            case '\t':
                sb.put("\\t");
                break;
            case '\n':
                sb.put("\\n");
                break;
            case '\f':
                sb.put("\\f");
                break;
            case '\r':
                sb.put("\\r");
                break;
            default:
                if (c < ' ') {
                    // Remaining control characters have no short escape.
                    sb.put(format("\\u%04X", cast(uint) c));
                } else {
                    sb.put(c);
                }
                break;
            }
        }
        return sb.data();
    }

}