1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.QuotedStringTokenizer;
13 
14 import std.conv;
15 import std.ascii;
16 import std.string;
17 
18 // import hunt.collection.StringBuilder;
19 import hunt.text.StringTokenizer;
20 import hunt.text.Common;
21 import hunt.util.StringBuilder;
22 import hunt.Exceptions;
23 import hunt.util.Appendable;
24 import hunt.util.Common;
25 import hunt.util.ConverterUtils;
26 
27 
/**
 * StringTokenizer with quoting support.
 *
 * This class mirrors the java.util.StringTokenizer API, except that single
 * and double quoted string values are recognised: delimiters inside quotes
 * are not treated as delimiters, and a backslash inside a quoted section
 * escapes the character that follows it.
 *
 * The class also offers static helpers to quote and unquote strings.
 *
 * @see java.util.StringTokenizer
 */
class QuotedStringTokenizer : StringTokenizer {
	/// Delimiters used when the caller passes a null delimiter string.
	private enum string __delim = "\t\n\r";

	/// Marker stored in the escape table for control characters that have no
	/// short escape and must be written as \u00XX.
	private enum char NO_ESCAPE = cast(char) 0xFF;

	/// Scanner states used by hasMoreTokens().
	private enum ScanState {
		start,  // between tokens, nothing collected yet
		token,  // inside an unquoted part of a token
		quoted  // inside a quoted part of a token
	}

	private string _string;                  // text being tokenized
	private string _delim = __delim;         // current delimiter set
	private bool _returnQuotes = false;      // keep quote (and escape) characters in tokens
	private bool _returnDelimiters = false;  // report each delimiter as its own token
	private StringBuilder _token;            // token collected so far
	private bool _hasToken = false;          // true once _token holds a complete token
	private int _i = 0;                      // read position in _string
	private int _lastStart = 0;              // read position where the last scan started
	private bool _double = true;             // treat '"' as a quote character
	private bool _single = true;             // treat '\'' as a quote character

	/**
	 * Create a tokenizer.
	 *
	 * @param str
	 *            The string to tokenize.
	 * @param delim
	 *            The delimiter characters; null selects the default set
	 *            (tab, newline, carriage return).
	 * @param returnDelimiters
	 *            if true, each delimiter is returned as a token of its own
	 * @param returnQuotes
	 *            if true, quote characters (and escaping backslashes) are
	 *            kept in the returned tokens
	 * @throws Error
	 *             if the delimiter set contains a single or double quote
	 */
	this(string str, string delim, bool returnDelimiters, bool returnQuotes) {
		super("");
		_string = str;
		if (delim !is null)
			_delim = delim;
		_returnDelimiters = returnDelimiters;
		_returnQuotes = returnQuotes;

		immutable bool quoteInDelims = _delim.indexOf('\'') >= 0 || _delim.indexOf('"') >= 0;
		if (quoteInDelims)
			throw new Error("Can't use quotes as delimiters: " ~ _delim);

		// Initial capacity: half of the input, capped at 512 for long inputs.
		_token = new StringBuilder(_string.length > 1024 ? 512 : _string.length / 2);
	}

	/// Same as the four argument constructor with returnQuotes = false.
	this(string str, string delim, bool returnDelimiters) {
		this(str, delim, returnDelimiters, false);
	}

	/// Same as the four argument constructor with both flags false.
	this(string str, string delim) {
		this(str, delim, false, false);
	}

	/// Tokenize with the default delimiters and both flags false.
	this(string str) {
		this(str, null, false, false);
	}

	/**
	 * @return true if c starts a quoted section, i.e. it is a quote character
	 *         whose handling is currently enabled
	 */
	private bool opensQuote(char c) {
		return (c == '\'' && _single) || (c == '"' && _double);
	}

	/**
	 * Scan forward until a complete token has been collected.
	 *
	 * The collected token stays buffered until nextToken() is called, so
	 * calling this method repeatedly does not skip tokens.
	 *
	 * @return true if a token is available
	 */
	override
	bool hasMoreTokens() {
		// Already found a token
		if (_hasToken)
			return true;

		_lastStart = _i;

		ScanState state = ScanState.start;
		char openQuote = '\0'; // quote character that opened the current quoted section
		bool escape = false;   // previous character was an escaping backslash

		while (_i < _string.length) {
			immutable char c = _string[_i++];

			final switch (state) {
			case ScanState.start:
				if (_delim.indexOf(c) >= 0) {
					// Leading delimiters are skipped unless they are reported.
					if (_returnDelimiters) {
						_token.append(c);
						_hasToken = true;
						return true;
					}
				} else if (opensQuote(c)) {
					if (_returnQuotes)
						_token.append(c);
					openQuote = c;
					state = ScanState.quoted;
				} else {
					_token.append(c);
					_hasToken = true;
					state = ScanState.token;
				}
				break;

			case ScanState.token:
				_hasToken = true;
				if (_delim.indexOf(c) >= 0) {
					// Push the delimiter back so the next scan reports it.
					if (_returnDelimiters)
						_i--;
					return _hasToken;
				} else if (opensQuote(c)) {
					if (_returnQuotes)
						_token.append(c);
					openQuote = c;
					state = ScanState.quoted;
				} else {
					_token.append(c);
				}
				break;

			case ScanState.quoted:
				_hasToken = true;
				if (escape) {
					escape = false;
					_token.append(c);
				} else if (c == openQuote) {
					if (_returnQuotes)
						_token.append(c);
					state = ScanState.token;
				} else if (c == '\\') {
					if (_returnQuotes)
						_token.append(c);
					escape = true;
				} else {
					_token.append(c);
				}
				break;
			}
		}

		return _hasToken;
	}

	/**
	 * Return the next token and clear the token buffer.
	 *
	 * @return the next token
	 * @throws NoSuchElementException
	 *             if there are no more tokens
	 */
	override
	string nextToken() {
		if (!hasMoreTokens() || _token is null)
			throw new NoSuchElementException("");
		string result = _token.toString();
		_token.setLength(0);
		_hasToken = false;
		return result;
	}

	/**
	 * Switch to a new delimiter set and return the next token.
	 *
	 * Any token that was already buffered is discarded and scanning restarts
	 * from the position where the last scan began.
	 *
	 * @param delim
	 *            the new delimiter characters
	 * @return the next token
	 * @throws NoSuchElementException
	 *             if there are no more tokens
	 */
	override
	string nextToken(string delim) {
		_delim = delim;
		_i = _lastStart;
		_token.setLength(0);
		_hasToken = false;
		return nextToken();
	}


	/**
	 * Not implemented.
	 *
	 * @return always -1
	 */
	override
	int countTokens() {
		return -1;
	}

	/**
	 * Quote a string. The string is quoted only if quoting is required due to
	 * embedded delimiters, whitespace, quote characters, backslashes or the
	 * empty string.
	 *
	 * @param s
	 *            The string to quote.
	 * @param delim
	 *            the delimiter characters that force quoting
	 * @return the quoted string, s itself when no quoting is needed, or null
	 *         if s is null
	 */
	static string quoteIfNeeded(string s, string delim) {
		if (s is null)
			return null;
		if (s.length == 0)
			return "\"\"";

		foreach (char c; s) {
			immutable bool special = c == '\\' || c == '"' || c == '\''
				|| std.ascii.isWhite(c) || delim.indexOf(c) >= 0;
			if (special)
				return quote(s);
		}

		return s;
	}

	/**
	 * Quote a string unconditionally. The result is always wrapped in double
	 * quotes; see quote(Appendable, string) for the escaping rules.
	 *
	 * @param s
	 *            The string to quote.
	 * @return the quoted string, or null if s is null
	 */
	static string quote(string s) {
		if (s is null)
			return null;
		if (s.length == 0)
			return "\"\"";

		StringBuilder b = new StringBuilder(s.length + 8);
		quote(b, s);
		return b.toString();
	}

	/**
	 * Build the lookup table that maps a control character (0..31) to the
	 * letter of its short escape, or to NO_ESCAPE when it has none.
	 */
	private static char[32] buildEscapeTable() {
		char[32] table = NO_ESCAPE;
		table['\b'] = 'b';
		table['\t'] = 't';
		table['\n'] = 'n';
		table['\f'] = 'f';
		table['\r'] = 'r';
		return table;
	}

	/// Short escapes for control characters, computed at compile time.
	private static immutable char[32] escapes = buildEscapeTable();

	/**
	 * Quote a string into an Appendable. Only quotes and backslash are escaped.
	 *
	 * @param buffer
	 *            The Appendable
	 * @param input
	 *            The string to quote; nothing is written when it is null.
	 */
	static void quoteOnly(Appendable buffer, string input) {
		if (input is null)
			return;

		try {
			buffer.append('"');
			foreach (char c; input) {
				if (c == '"' || c == '\\')
					buffer.append('\\');
				buffer.append(c);
			}
			buffer.append('"');
		} catch (IOException x) {
			throw new RuntimeException(x);
		}
	}

	/**
	 * Quote a string into an Appendable. The characters ", \, \n, \r, \t, \f
	 * and \b are escaped with a backslash; every other character below 32 is
	 * written as a \u00XX escape.
	 *
	 * @param buffer
	 *            The Appendable
	 * @param input
	 *            The string to quote; nothing is written when it is null.
	 */
	static void quote(Appendable buffer, string input) {
		if (input is null)
			return;

		try {
			buffer.append('"');
			foreach (char c; input) {
				if (c >= 32) {
					if (c == '"' || c == '\\')
						buffer.append('\\');
					buffer.append(c);
					continue;
				}

				immutable char mapped = escapes[c];
				if (mapped == NO_ESCAPE) {
					// No short form: emit \u00XX, zero padded to four hex digits.
					buffer.append("\\u00");
					if (c < 0x10)
						buffer.append('0');
					buffer.append(to!string(cast(int) c, 16));
				} else {
					buffer.append('\\').append(mapped);
				}
			}
			buffer.append('"');
		} catch (IOException x) {
			throw new RuntimeException(x);
		}
	}

	/// Same as unquoteOnly(s, false).
	static string unquoteOnly(string s) {
		return unquoteOnly(s, false);
	}

	/**
	 * Unquote a string, NOT converting escape sequences: a backslash is
	 * dropped and the character after it is kept as is.
	 *
	 * @param s
	 *            The string to unquote.
	 * @param lenient
	 *            if true, will leave in backslashes that aren't valid escapes
	 * @return the unquoted string; s itself when it is not wrapped in a
	 *         matching pair of quotes; null if s is null
	 */
	static string unquoteOnly(string s, bool lenient) {
		if (s is null)
			return null;
		if (s.length < 2)
			return s;

		immutable char first = s[0];
		immutable char last = s[$ - 1];
		if (first != last || (first != '"' && first != '\''))
			return s;

		StringBuilder b = new StringBuilder(cast(int) s.length - 2);
		bool escape = false;
		foreach (char c; s[1 .. $ - 1]) {
			if (escape) {
				escape = false;
				if (lenient && !isValidEscaping(c))
					b.append('\\');
				b.append(c);
			} else if (c == '\\') {
				escape = true;
			} else {
				b.append(c);
			}
		}

		return b.toString();
	}

	/// Same as unquote(s, false).
	static string unquote(string s) {
		return unquote(s, false);
	}

	/**
	 * Read four hex digits of s starting at index start and return their value.
	 *
	 * Digits are converted one at a time with ConverterUtils.convertHexDigit,
	 * the same call the previous implementation relied on to reject
	 * characters that are not hex digits.
	 */
	private static uint readHexQuad(string s, int start) {
		uint value = 0;
		foreach (offset; 0 .. 4) {
			value = (value << 4) | ConverterUtils.convertHexDigit(cast(byte) s[start + offset]);
		}
		return value;
	}

	/**
	 * Unquote a string, converting the escape sequences \n, \r, \t, \f, \b,
	 * \\, \/, \" and \uXXXX. A \uXXXX escape is appended as UTF-8; a
	 * high/low surrogate pair written as two consecutive \u escapes is
	 * combined into a single code point, and a lone surrogate is replaced by
	 * U+FFFD.
	 *
	 * @param s
	 *            The string to unquote.
	 * @param lenient
	 *            true if unquoting should be lenient to escaped content,
	 *            leaving backslashes in front of unknown escapes alone
	 * @return the unquoted string; s itself when it is not wrapped in a
	 *         matching pair of quotes; null if s is null
	 */
	static string unquote(string s, bool lenient) {
		import std.utf : encode, isValidDchar;

		if (s is null)
			return null;
		if (s.length < 2)
			return s;

		immutable char first = s[0];
		immutable char last = s[$ - 1];
		if (first != last || (first != '"' && first != '\''))
			return s;

		// Index of the closing quote; the content lies strictly between the quotes.
		immutable int end = cast(int) s.length - 1;
		StringBuilder b = new StringBuilder(cast(int) s.length - 2);
		bool escape = false;
		for (int i = 1; i < end; i++) {
			immutable char c = s[i];

			if (!escape) {
				if (c == '\\')
					escape = true;
				else
					b.append(c);
				continue;
			}

			escape = false;
			switch (c) {
			case 'n':
				b.append('\n');
				break;
			case 'r':
				b.append('\r');
				break;
			case 't':
				b.append('\t');
				break;
			case 'f':
				b.append('\f');
				break;
			case 'b':
				b.append('\b');
				break;
			case '\\':
				b.append('\\');
				break;
			case '/':
				b.append('/');
				break;
			case '"':
				b.append('"');
				break;
			case 'u':
				// The four hex digits follow the 'u' at i + 1 .. i + 4.
				dchar codePoint = cast(dchar) readHexQuad(s, i + 1);
				i += 4;

				// A high surrogate directly followed by an escaped low
				// surrogate (all inside the quotes) forms one code point.
				immutable bool isHigh = codePoint >= 0xD800 && codePoint <= 0xDBFF;
				if (isHigh && i + 6 < end && s[i + 1] == '\\' && s[i + 2] == 'u') {
					immutable uint low = readHexQuad(s, i + 3);
					if (low >= 0xDC00 && low <= 0xDFFF) {
						codePoint = cast(dchar) (0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00));
						i += 6;
					}
				}

				// Lone surrogates cannot be encoded as UTF-8.
				if (!isValidDchar(codePoint))
					codePoint = cast(dchar) 0xFFFD;

				char[4] utf8;
				immutable size_t len = encode(utf8, codePoint);
				b.append(utf8[0 .. len].idup);
				break;
			default:
				if (lenient && !isValidEscaping(c))
					b.append('\\');
				b.append(c);
				break;
			}
		}

		return b.toString();
	}

	/**
	 * Check that char c (which is preceded by a backslash) is a valid escape
	 * sequence.
	 *
	 * @param c
	 *            the character following the backslash
	 * @return true if c is one of n, r, t, f, b, \, /, " or u
	 */
	private static bool isValidEscaping(char c) {
		switch (c) {
		case 'n', 'r', 't', 'f', 'b', '\\', '/', '"', 'u':
			return true;
		default:
			return false;
		}
	}

	/**
	 * @param s
	 *            the string to test
	 * @return true if s starts and ends with a double quote; a string
	 *         consisting of a single quote character is not considered quoted
	 */
	static bool isQuoted(string s) {
		return s !is null && s.length >= 2 && s[0] == '"' && s[$ - 1] == '"';
	}

	/**
	 * @return handle double quotes if true
	 */
	bool getDouble() {
		return _double;
	}

	/**
	 * @param d
	 *            handle double quotes if true
	 */
	void setDouble(bool d) {
		_double = d;
	}

	/**
	 * @return handle single quotes if true
	 */
	bool getSingle() {
		return _single;
	}

	/**
	 * @param single
	 *            handle single quotes if true
	 */
	void setSingle(bool single) {
		_single = single;
	}
}