1 /*
2 * Hunt - A refined core library for D programming language.
3 *
4 * Copyright (C) 2018-2019 HuntLabs
5 *
6 * Website: https://www.huntlabs.net/
7 *
8 * Licensed under the Apache-2.0 License.
9 *
10 */
11
12 module hunt.collection.ArrayTrie;
13
14 import hunt.collection.AbstractTrie;
15 import hunt.util.Common;
16 import hunt.io.ByteBuffer;
17 import hunt.collection.HashSet;
18 import hunt.collection.Set;
19 import hunt.collection.Trie;
20
21 import hunt.Exceptions;
22 import hunt.util.Appendable;
23 import hunt.util.StringBuilder;
24
25 import hunt.logging;
26 import std.conv;
27
28 /**
29 * <p>A Trie string lookup data structure using a fixed size array.</p>
30 * <p>This implementation is always case insensitive and is optimal for
31 * a small number of fixed strings with few special characters. The
32 * Trie is stored in an array of lookup tables, each indexed by the
33 * next character of the key. Frequently used characters are directly
34 * indexed in each lookup table, whilst infrequently used characters
35 * must use a big character table.
36 * </p>
37 * <p>This Trie is very space efficient if the key characters are
38 * from ' ', '+', '-', ':', ';', '.', 'A' to 'Z' or 'a' to 'z'.
39 * Other ISO-8859-1 characters can be used by the key, but less space
40 * efficiently.
41 * </p>
42 * <p>This Trie is not Threadsafe and contains no mutual exclusion
43 * or deliberate memory barriers. It is intended for an ArrayTrie to be
44 * built by a single thread and then used concurrently by multiple threads
45 * and not mutated during that access. If concurrent mutations of the
46 * Trie is required external locks need to be applied.
47 * </p>
48 *
49 * @param (V) the element of entry
50 */
51 class ArrayTrie(V) : AbstractTrie!(V) {
52 /**
53 * The Size of a Trie row is how many characters can be looked
54 * up directly without going to a big index. This is set at
55 * 32 to cover case insensitive alphabet and a few other common
56 * characters.
57 */
58 private enum int ROW_SIZE = 32;
59
60 /**
61 * The index lookup table, this maps a character as a byte
62 * (ISO-8859-1 or UTF8) to an index within a Trie row
63 */
64 private enum int[] __lookup =
65 [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F
66 /*0*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
67 /*1*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
68 /*2*/31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, 27, 30, -1,
69 /*3*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28, 29, -1, -1, -1, -1,
70 /*4*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
71 /*5*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
72 /*6*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
73 /*7*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
74 ];
75
76 /**
77 * The Trie rows in a single array which allows a lookup of row,character
78 * to the next row in the Trie. This is actually a 2 dimensional
79 * array that has been flattened to achieve locality of reference.
80 * The first ROW_SIZE entries are for row 0, then next ROW_SIZE
81 * entries are for row 1 etc. So in general instead of using
82 * _rows[row][index], we use _rows[row*ROW_SIZE+index] to look up
83 * the next row for a given character.
84 * <p>
85 * The array is of characters rather than integers to save space.
86 */
87 private byte[] _rowIndex;
88
89 /**
90 * The key (if any) for a Trie row.
91 * A row may be a leaf, a node or both in the Trie tree.
92 */
93 private string[] _key;
94
95 /**
96 * The value (if any) for a Trie row.
97 * A row may be a leaf, a node or both in the Trie tree.
98 */
99 private V[] _value;
100
101 /**
102 * A big index for each row.
103 * If a character outside of the lookup map is needed,
104 * then a big index will be created for the row, with
105 * 256 entries, one for each possible byte.
106 */
107 private byte[][] _bigIndex;
108
109 /**
110 * The number of rows allocated
111 */
112 private byte _rows;
113
114 this() {
115 this(128);
116 }
117
118 /* ------------------------------------------------------------ */
119
120 /**
121 * @param capacity The capacity of the trie, which at the worst case
122 * is the total number of characters of all keys stored in the Trie.
123 * The capacity needed is dependent of the shared prefixes of the keys.
124 * For example, a capacity of 6 nodes is required to store keys "foo"
125 * and "bar", but a capacity of only 4 is required to
126 * store "bar" and "bat".
127 */
128 this(int capacity) {
129 super(true);
130 _value = new V[capacity];
131 _rowIndex = new byte[capacity * 32];
132 _key = new string[capacity];
133 }
134
135 /* ------------------------------------------------------------ */
136 override
137 void clear() {
138 _rows = 0;
139 _value[] = V.init;
140 _rowIndex[] = 0;
141 _key[] = null;
142 }
143
144 /* ------------------------------------------------------------ */
145 override
146 bool put(string s, V v) {
147 int t = 0;
148 size_t k;
149 size_t limit = s.length;
150 for (k = 0; k < limit; k++) {
151 byte c = s[k];
152
153 int index = __lookup[c & 0x7f];
154 if (index >= 0) {
155 int idx = t * ROW_SIZE + index;
156 t = _rowIndex[idx];
157 if (t == 0) {
158 if (++_rows >= _value.length)
159 return false;
160 t = _rowIndex[idx] = _rows;
161 }
162 } else if (c > 127)
163 throw new IllegalArgumentException("non ascii character");
164 else {
165 if (_bigIndex is null)
166 _bigIndex = new byte[][_value.length];
167 if (t >= _bigIndex.length)
168 return false;
169 byte[] big = _bigIndex[t];
170 if (big is null)
171 big = _bigIndex[t] = new byte[128];
172 t = big[c];
173 if (t == 0) {
174 if (_rows == _value.length)
175 return false;
176 t = big[c] = ++_rows;
177 }
178 }
179 }
180
181 if (t >= _key.length) {
182 _rows = cast(byte) _key.length;
183 return false;
184 }
185
186 static if(is(V == class)) {
187 _key[t] = v is null ? null : s;
188 } else {
189 _key[t] = v == V.init ? null : s;
190 }
191 _value[t] = v;
192 return true;
193 }
194
195 /* ------------------------------------------------------------ */
196 override
197 V get(string s, int offset, int len) {
198 int t = 0;
199 for (int i = 0; i < len; i++) {
200 byte c = s[offset + i];
201 int index = __lookup[c & 0x7f];
202 if (index >= 0) {
203 int idx = t * ROW_SIZE + index;
204 t = _rowIndex[idx];
205 if (t == 0)
206 return V.init;
207 } else {
208 byte[] big = _bigIndex is null ? null : _bigIndex[t];
209 if (big is null)
210 return V.init;
211 t = big[c];
212 if (t == 0)
213 return V.init;
214 }
215 }
216 return _value[t];
217 }
218
219 /* ------------------------------------------------------------ */
220 override
221 V get(ByteBuffer b, int offset, int len) {
222 int t = 0;
223 for (int i = 0; i < len; i++) {
224 byte c = b.get(offset + i);
225 int index = __lookup[c & 0x7f];
226 if (index >= 0) {
227 int idx = t * ROW_SIZE + index;
228 t = _rowIndex[idx];
229 if (t == 0)
230 return V.init;
231 } else {
232 byte[] big = _bigIndex is null ? null : _bigIndex[t];
233 if (big is null)
234 return V.init;
235 t = big[c];
236 if (t == 0)
237 return V.init;
238 }
239 }
240 return _value[t];
241 }
242
243 /* ------------------------------------------------------------ */
244 override
245 V getBest(byte[] b, int offset, int len) {
246 return getBest(0, b, offset, len);
247 }
248
249 /* ------------------------------------------------------------ */
250 override
251 V getBest(ByteBuffer b, int offset, int len) {
252 if (b.hasArray())
253 return getBest(0, b.array(), b.arrayOffset() + b.position() + offset, len);
254 return getBest(0, b, offset, len);
255 }
256
257 /* ------------------------------------------------------------ */
258 override
259 V getBest(string s, int offset, int len) {
260 return getBest(0, s, offset, len);
261 }
262
263 /* ------------------------------------------------------------ */
264 private V getBest(int t, string s, int offset, int len) {
265 int pos = offset;
266 for (int i = 0; i < len; i++) {
267 byte c = s[pos++];
268 int index = __lookup[c & 0x7f];
269 if (index >= 0) {
270 int idx = t * ROW_SIZE + index;
271 int nt = _rowIndex[idx];
272 if (nt == 0)
273 break;
274 t = nt;
275 } else {
276 byte[] big = _bigIndex is null ? null : _bigIndex[t];
277 if (big is null)
278 return V.init;
279 int nt = big[c];
280 if (nt == 0)
281 break;
282 t = nt;
283 }
284
285 // Is the next Trie is a match
286 if (_key[t] !is null) {
287 // Recurse so we can remember this possibility
288 V best = getBest(t, s, offset + i + 1, len - i - 1);
289
290 static if(is(V == class)) {
291 if (best !is null)
292 return best;
293 } else {
294 if (best != V.init)
295 return best;
296 }
297
298 return _value[t];
299 }
300 }
301 return _value[t];
302 }
303
304 /* ------------------------------------------------------------ */
305 private V getBest(int t, byte[] b, int offset, int len) {
306 for (int i = 0; i < len; i++) {
307 byte c = b[offset + i];
308 int index = __lookup[c & 0x7f];
309 if (index >= 0) {
310 int idx = t * ROW_SIZE + index;
311 int nt = _rowIndex[idx];
312 if (nt == 0)
313 break;
314 t = nt;
315 } else {
316 byte[] big = _bigIndex is null ? null : _bigIndex[t];
317 if (big is null)
318 return V.init;
319 int nt = big[c];
320 if (nt == 0)
321 break;
322 t = nt;
323 }
324
325 // Is the next Trie is a match
326 if (_key[t] !is null) {
327 // Recurse so we can remember this possibility
328 V best = getBest(t, b, offset + i + 1, len - i - 1);
329 static if(is(V == class)) {
330 if (best !is null)
331 return best;
332 } else {
333 if (best != V.init)
334 return best;
335 }
336
337 break;
338 }
339 }
340 return _value[t];
341 }
342
343 private V getBest(int t, ByteBuffer b, int offset, int len) {
344 int pos = b.position() + offset;
345 for (int i = 0; i < len; i++) {
346 byte c = b.get(pos++);
347 int index = __lookup[c & 0x7f];
348 if (index >= 0) {
349 int idx = t * ROW_SIZE + index;
350 int nt = _rowIndex[idx];
351 if (nt == 0)
352 break;
353 t = nt;
354 } else {
355 byte[] big = _bigIndex is null ? null : _bigIndex[t];
356 if (big is null)
357 return V.init;
358 int nt = big[c];
359 if (nt == 0)
360 break;
361 t = nt;
362 }
363
364 // Is the next Trie is a match
365 if (_key[t] !is null) {
366 // Recurse so we can remember this possibility
367 V best = getBest(t, b, offset + i + 1, len - i - 1);
368
369 static if(is(V == class)) {
370 if (best !is null)
371 return best;
372 } else {
373 if (best != V.init)
374 return best;
375 }
376
377 break;
378 }
379 }
380 return _value[t];
381 }
382
383
384 override
385 string toString() {
386 StringBuilder buf = new StringBuilder();
387 toString(buf, 0);
388
389 if (buf.length() == 0)
390 return "{}";
391
392 buf.setCharAt(0, '{');
393 buf.append('}');
394 return buf.toString();
395 }
396
397
398 private void toString(Appendable ot, int t) {
399 void doAppend() {
400 try {
401 ot.append(',');
402 ot.append(_key[t]);
403 ot.append('=');
404 ot.append(_value[t].to!string());
405 } catch (IOException e) {
406 throw new RuntimeException(e);
407 }
408 }
409
410 static if(is(V == class)) {
411 if (_value[t] !is null)
412 doAppend();
413 } else {
414 if (_value[t] != V.init)
415 doAppend();
416 }
417
418 for (int i = 0; i < ROW_SIZE; i++) {
419 int idx = t * ROW_SIZE + i;
420 if (_rowIndex[idx] != 0)
421 toString(ot, _rowIndex[idx]);
422 }
423
424 byte[] big = _bigIndex is null ? null : _bigIndex[t];
425 if (big !is null) {
426 foreach (int i ; big)
427 if (i != 0)
428 toString(ot, i);
429 }
430
431 }
432
433 override
434 Set!(string) keySet() {
435 Set!(string) keys = new HashSet!(string)();
436 keySet(keys, 0);
437 return keys;
438 }
439
440 private void keySet(Set!(string) set, int t) {
441 static if(is(V == class)) {
442 if (t < _value.length && _value[t] !is null)
443 set.add(_key[t]);
444 } else {
445 if (t < _value.length && _value[t] != V.init)
446 set.add(_key[t]);
447 }
448
449 for (int i = 0; i < ROW_SIZE; i++) {
450 int idx = t * ROW_SIZE + i;
451 if (idx < _rowIndex.length && _rowIndex[idx] != 0)
452 keySet(set, _rowIndex[idx]);
453 }
454
455 byte[] big = _bigIndex is null || t >= _bigIndex.length ? null : _bigIndex[t];
456 // if (big !is null) {
457 foreach (int i ; big)
458 if (i != 0)
459 keySet(set, i);
460 // }
461 }
462
463 override
464 bool isFull() {
465 return _rows + 1 >= _key.length;
466 }
467 }