1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.collection.ArrayTrie;
13 
14 import hunt.collection.AbstractTrie;
15 import hunt.util.Common;
16 import hunt.io.ByteBuffer;
17 import hunt.collection.HashSet;
18 import hunt.collection.Set;
19 import hunt.collection.Trie;
20 
21 import hunt.Exceptions;
22 import hunt.util.Appendable;
23 import hunt.util.StringBuilder;
24 
25 import hunt.logging;
26 import std.conv;
27 
28 /**
29  * <p>A Trie string lookup data structure using a fixed size array.</p>
30  * <p>This implementation is always case insensitive and is optimal for
31  * a small number of fixed strings with few special characters.  The
32  * Trie is stored in an array of lookup tables, each indexed by the
33  * next character of the key.   Frequently used characters are directly
34  * indexed in each lookup table, whilst infrequently used characters
35  * must use a big character table.
36  * </p>
37  * <p>This Trie is very space efficient if the key characters are
38  * from ' ', '+', '-', ':', ';', '.', 'A' to 'Z' or 'a' to 'z'.
39  * Other ISO-8859-1 characters can be used by the key, but less space
40  * efficiently.
41  * </p>
42  * <p>This Trie is not Threadsafe and contains no mutual exclusion
43  * or deliberate memory barriers.  It is intended for an ArrayTrie to be
44  * built by a single thread and then used concurrently by multiple threads
45  * and not mutated during that access.  If concurrent mutations of the
46  * Trie is required external locks need to be applied.
47  * </p>
48  *
49  * @param (V) the element of entry
50  */
51 class ArrayTrie(V) : AbstractTrie!(V) {
52     /**
53      * The Size of a Trie row is how many characters can be looked
54      * up directly without going to a big index.  This is set at
55      * 32 to cover case insensitive alphabet and a few other common
56      * characters.
57      */
58     private enum int ROW_SIZE = 32;
59 
60     /**
61      * The index lookup table, this maps a character as a byte
62      * (ISO-8859-1 or UTF8) to an index within a Trie row
63      */
64     private enum int[] __lookup =
65     [ // 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
66    /*0*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
67    /*1*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
68    /*2*/31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, 27, 30, -1,
69    /*3*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28, 29, -1, -1, -1, -1,
70    /*4*/-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
71    /*5*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
72    /*6*/-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
73    /*7*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
74     ];
75 
76     /**
77      * The Trie rows in a single array which allows a lookup of row,character
78      * to the next row in the Trie.  This is actually a 2 dimensional
79      * array that has been flattened to achieve locality of reference.
80      * The first ROW_SIZE entries are for row 0, then next ROW_SIZE
81      * entries are for row 1 etc.   So in general instead of using
82      * _rows[row][index], we use _rows[row*ROW_SIZE+index] to look up
83      * the next row for a given character.
84      * <p>
85      * The array is of characters rather than integers to save space.
86      */
87     private byte[] _rowIndex;
88 
89     /**
90      * The key (if any) for a Trie row.
91      * A row may be a leaf, a node or both in the Trie tree.
92      */
93     private string[] _key;
94 
95     /**
96      * The value (if any) for a Trie row.
97      * A row may be a leaf, a node or both in the Trie tree.
98      */
99     private V[] _value;
100 
101     /**
102      * A big index for each row.
103      * If a character outside of the lookup map is needed,
104      * then a big index will be created for the row, with
105      * 256 entries, one for each possible byte.
106      */
107     private byte[][] _bigIndex;
108 
109     /**
110      * The number of rows allocated
111      */
112     private byte _rows;
113 
114     this() {
115         this(128);
116     }
117 
118     /* ------------------------------------------------------------ */
119 
120     /**
121      * @param capacity The capacity of the trie, which at the worst case
122      *                 is the total number of characters of all keys stored in the Trie.
123      *                 The capacity needed is dependent of the shared prefixes of the keys.
124      *                 For example, a capacity of 6 nodes is required to store keys "foo"
125      *                 and "bar", but a capacity of only 4 is required to
126      *                 store "bar" and "bat".
127      */
128     this(int capacity) {
129         super(true);
130         _value = new V[capacity];
131         _rowIndex = new byte[capacity * 32];
132         _key = new string[capacity];
133     }
134 
135     /* ------------------------------------------------------------ */
136     override
137     void clear() {
138         _rows = 0;
139         _value[] = V.init;
140         _rowIndex[] = 0;
141         _key[] = null;
142     }
143 
144     /* ------------------------------------------------------------ */
145     override
146     bool put(string s, V v) {
147         int t = 0;
148         size_t k;
149         size_t limit = s.length;
150         for (k = 0; k < limit; k++) {
151             byte c = s[k];
152 
153             int index = __lookup[c & 0x7f];
154             if (index >= 0) {
155                 int idx = t * ROW_SIZE + index;
156                 t = _rowIndex[idx];
157                 if (t == 0) {
158                     if (++_rows >= _value.length)
159                         return false;
160                     t = _rowIndex[idx] = _rows;
161                 }
162             } else if (c > 127)
163                 throw new IllegalArgumentException("non ascii character");
164             else {
165                 if (_bigIndex is null)
166                     _bigIndex = new byte[][_value.length];
167                 if (t >= _bigIndex.length)
168                     return false;
169                 byte[] big = _bigIndex[t];
170                 if (big is null)
171                     big = _bigIndex[t] = new byte[128];
172                 t = big[c];
173                 if (t == 0) {
174                     if (_rows == _value.length)
175                         return false;
176                     t = big[c] = ++_rows;
177                 }
178             }
179         }
180 
181         if (t >= _key.length) {
182             _rows = cast(byte) _key.length;
183             return false;
184         }
185 
186         static if(is(V == class)) {
187             _key[t] = v is null ? null : s;
188         } else {
189             _key[t] = v == V.init ? null : s;
190         }
191         _value[t] = v;
192         return true;
193     }
194 
195     /* ------------------------------------------------------------ */
196     override
197     V get(string s, int offset, int len) {
198         int t = 0;
199         for (int i = 0; i < len; i++) {
200             byte c = s[offset + i];
201             int index = __lookup[c & 0x7f];
202             if (index >= 0) {
203                 int idx = t * ROW_SIZE + index;
204                 t = _rowIndex[idx];
205                 if (t == 0)
206                     return V.init;
207             } else {
208                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
209                 if (big is null)
210                     return V.init;
211                 t = big[c];
212                 if (t == 0)
213                     return V.init;
214             }
215         }
216         return _value[t];
217     }
218 
219     /* ------------------------------------------------------------ */
220     override
221     V get(ByteBuffer b, int offset, int len) {
222         int t = 0;
223         for (int i = 0; i < len; i++) {
224             byte c = b.get(offset + i);
225             int index = __lookup[c & 0x7f];
226             if (index >= 0) {
227                 int idx = t * ROW_SIZE + index;
228                 t = _rowIndex[idx];
229                 if (t == 0)
230                     return V.init;
231             } else {
232                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
233                 if (big is null)
234                     return V.init;
235                 t = big[c];
236                 if (t == 0)
237                     return V.init;
238             }
239         }
240         return _value[t];
241     }
242 
243     /* ------------------------------------------------------------ */
244     override
245     V getBest(byte[] b, int offset, int len) {
246         return getBest(0, b, offset, len);
247     }
248 
249     /* ------------------------------------------------------------ */
250     override
251     V getBest(ByteBuffer b, int offset, int len) {
252         if (b.hasArray())
253             return getBest(0, b.array(), b.arrayOffset() + b.position() + offset, len);
254         return getBest(0, b, offset, len);
255     }
256 
257     /* ------------------------------------------------------------ */
258     override
259     V getBest(string s, int offset, int len) {
260         return getBest(0, s, offset, len);
261     }
262 
263     /* ------------------------------------------------------------ */
264     private V getBest(int t, string s, int offset, int len) {
265         int pos = offset;
266         for (int i = 0; i < len; i++) {
267             byte c = s[pos++];
268             int index = __lookup[c & 0x7f];
269             if (index >= 0) {
270                 int idx = t * ROW_SIZE + index;
271                 int nt = _rowIndex[idx];
272                 if (nt == 0)
273                     break;
274                 t = nt;
275             } else {
276                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
277                 if (big is null)
278                     return V.init;
279                 int nt = big[c];
280                 if (nt == 0)
281                     break;
282                 t = nt;
283             }
284 
285             // Is the next Trie is a match
286             if (_key[t] !is null) {
287                 // Recurse so we can remember this possibility
288                 V best = getBest(t, s, offset + i + 1, len - i - 1);
289                 
290                 static if(is(V == class)) {
291                     if (best !is null)
292                         return best;
293                 } else {
294                     if (best != V.init)
295                         return best;
296                 }
297 
298                 return _value[t];
299             }
300         }
301         return _value[t];
302     }
303 
304     /* ------------------------------------------------------------ */
305     private V getBest(int t, byte[] b, int offset, int len) {
306         for (int i = 0; i < len; i++) {
307             byte c = b[offset + i];
308             int index = __lookup[c & 0x7f];
309             if (index >= 0) {
310                 int idx = t * ROW_SIZE + index;
311                 int nt = _rowIndex[idx];
312                 if (nt == 0)
313                     break;
314                 t = nt;
315             } else {
316                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
317                 if (big is null)
318                     return V.init;
319                 int nt = big[c];
320                 if (nt == 0)
321                     break;
322                 t = nt;
323             }
324 
325             // Is the next Trie is a match
326             if (_key[t] !is null) {
327                 // Recurse so we can remember this possibility
328                 V best = getBest(t, b, offset + i + 1, len - i - 1);
329                 static if(is(V == class)) {
330                     if (best !is null)
331                         return best;
332                 } else {
333                     if (best != V.init)
334                         return best;
335                 }
336 
337                 break;
338             }
339         }
340         return _value[t];
341     }
342 
343     private V getBest(int t, ByteBuffer b, int offset, int len) {
344         int pos = b.position() + offset;
345         for (int i = 0; i < len; i++) {
346             byte c = b.get(pos++);
347             int index = __lookup[c & 0x7f];
348             if (index >= 0) {
349                 int idx = t * ROW_SIZE + index;
350                 int nt = _rowIndex[idx];
351                 if (nt == 0)
352                     break;
353                 t = nt;
354             } else {
355                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
356                 if (big is null)
357                     return V.init;
358                 int nt = big[c];
359                 if (nt == 0)
360                     break;
361                 t = nt;
362             }
363 
364             // Is the next Trie is a match
365             if (_key[t] !is null) {
366                 // Recurse so we can remember this possibility
367                 V best = getBest(t, b, offset + i + 1, len - i - 1);
368                 
369                 static if(is(V == class)) {
370                     if (best !is null)
371                         return best;
372                 } else {
373                     if (best != V.init)
374                         return best;
375                 }
376 
377                 break;
378             }
379         }
380         return _value[t];
381     }
382 
383 
384     override
385     string toString() {
386         StringBuilder buf = new StringBuilder();
387         toString(buf, 0);
388 
389         if (buf.length() == 0)
390             return "{}";
391 
392         buf.setCharAt(0, '{');
393         buf.append('}');
394         return buf.toString();
395     }
396 
397 
398     private void toString(Appendable ot, int t) {
399         void doAppend() {
400             try {
401                 ot.append(',');
402                 ot.append(_key[t]);
403                 ot.append('=');
404                 ot.append(_value[t].to!string());
405             } catch (IOException e) {
406                 throw new RuntimeException(e);
407             }
408         }
409 
410         static if(is(V == class)) {
411             if (_value[t] !is null)
412                 doAppend();
413         } else {
414             if (_value[t] != V.init)
415                 doAppend();
416         }
417 
418         for (int i = 0; i < ROW_SIZE; i++) {
419             int idx = t * ROW_SIZE + i;
420             if (_rowIndex[idx] != 0)
421                 toString(ot, _rowIndex[idx]);
422         }
423 
424         byte[] big = _bigIndex is null ? null : _bigIndex[t];
425         if (big !is null) {
426             foreach (int i ; big)
427                 if (i != 0)
428                     toString(ot, i);
429         }
430 
431     }
432 
433     override
434     Set!(string) keySet() {
435         Set!(string) keys = new HashSet!(string)();
436         keySet(keys, 0);
437         return keys;
438     }
439 
440     private void keySet(Set!(string) set, int t) {
441         static if(is(V == class)) {
442             if (t < _value.length && _value[t] !is null)
443                 set.add(_key[t]);
444         } else {
445             if (t < _value.length && _value[t] != V.init)
446                 set.add(_key[t]);
447         }
448 
449         for (int i = 0; i < ROW_SIZE; i++) {
450             int idx = t * ROW_SIZE + i;
451             if (idx < _rowIndex.length && _rowIndex[idx] != 0)
452                 keySet(set, _rowIndex[idx]);
453         }
454 
455         byte[] big = _bigIndex is null || t >= _bigIndex.length ? null : _bigIndex[t];
456         // if (big !is null) {
457             foreach (int i ; big)
458                 if (i != 0)
459                     keySet(set, i);
460         // }
461     }
462 
463     override
464     bool isFull() {
465         return _rows + 1 >= _key.length;
466     }
467 }