1 /* 2 * Hunt - A refined core library for D programming language. 3 * 4 * Copyright (C) 2018-2019 HuntLabs 5 * 6 * Website: https://www.huntlabs.net/ 7 * 8 * Licensed under the Apache-2.0 License. 9 * 10 */ 11 12 module hunt.collection.ArrayTrie; 13 14 import hunt.collection.AbstractTrie; 15 import hunt.util.Common; 16 import hunt.io.ByteBuffer; 17 import hunt.collection.HashSet; 18 import hunt.collection.Set; 19 import hunt.collection.Trie; 20 21 import hunt.Exceptions; 22 import hunt.util.Appendable; 23 import hunt.util.StringBuilder; 24 25 import hunt.logging; 26 import std.conv; 27 28 /** 29 * <p>A Trie string lookup data structure using a fixed size array.</p> 30 * <p>This implementation is always case insensitive and is optimal for 31 * a small number of fixed strings with few special characters. The 32 * Trie is stored in an array of lookup tables, each indexed by the 33 * next character of the key. Frequently used characters are directly 34 * indexed in each lookup table, whilst infrequently used characters 35 * must use a big character table. 36 * </p> 37 * <p>This Trie is very space efficient if the key characters are 38 * from ' ', '+', '-', ':', ';', '.', 'A' to 'Z' or 'a' to 'z'. 39 * Other ISO-8859-1 characters can be used by the key, but less space 40 * efficiently. 41 * </p> 42 * <p>This Trie is not Threadsafe and contains no mutual exclusion 43 * or deliberate memory barriers. It is intended for an ArrayTrie to be 44 * built by a single thread and then used concurrently by multiple threads 45 * and not mutated during that access. If concurrent mutations of the 46 * Trie is required external locks need to be applied. 47 * </p> 48 * 49 * @param (V) the element of entry 50 */ 51 class ArrayTrie(V) : AbstractTrie!(V) { 52 /** 53 * The Size of a Trie row is how many characters can be looked 54 * up directly without going to a big index. This is set at 55 * 32 to cover case insensitive alphabet and a few other common 56 * characters. 57 */ 58 private enum int ROW_SIZE = 32; 59 60 /** 61 * The index lookup table, this maps a character as a byte 62 * (ISO-8859-1 or UTF8) to an index within a Trie row 63 */ 64 private enum int[] __lookup = 65 [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F 66 /*0*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 67 /*1*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 68 /*2*/31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, 27, 30, -1, 69 /*3*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28, 29, -1, -1, -1, -1, 70 /*4*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 71 /*5*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 72 /*6*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 73 /*7*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 74 ]; 75 76 /** 77 * The Trie rows in a single array which allows a lookup of row,character 78 * to the next row in the Trie. This is actually a 2 dimensional 79 * array that has been flattened to achieve locality of reference. 80 * The first ROW_SIZE entries are for row 0, then next ROW_SIZE 81 * entries are for row 1 etc. So in general instead of using 82 * _rows[row][index], we use _rows[row*ROW_SIZE+index] to look up 83 * the next row for a given character. 84 * <p> 85 * The array is of characters rather than integers to save space. 86 */ 87 private byte[] _rowIndex; 88 89 /** 90 * The key (if any) for a Trie row. 91 * A row may be a leaf, a node or both in the Trie tree. 92 */ 93 private string[] _key; 94 95 /** 96 * The value (if any) for a Trie row. 97 * A row may be a leaf, a node or both in the Trie tree. 98 */ 99 private V[] _value; 100 101 /** 102 * A big index for each row. 103 * If a character outside of the lookup map is needed, 104 * then a big index will be created for the row, with 105 * 256 entries, one for each possible byte. 106 */ 107 private byte[][] _bigIndex; 108 109 /** 110 * The number of rows allocated 111 */ 112 private byte _rows; 113 114 this() { 115 this(128); 116 } 117 118 /* ------------------------------------------------------------ */ 119 120 /** 121 * @param capacity The capacity of the trie, which at the worst case 122 * is the total number of characters of all keys stored in the Trie. 123 * The capacity needed is dependent of the shared prefixes of the keys. 124 * For example, a capacity of 6 nodes is required to store keys "foo" 125 * and "bar", but a capacity of only 4 is required to 126 * store "bar" and "bat". 127 */ 128 this(int capacity) { 129 super(true); 130 _value = new V[capacity]; 131 _rowIndex = new byte[capacity * 32]; 132 _key = new string[capacity]; 133 } 134 135 /* ------------------------------------------------------------ */ 136 override 137 void clear() { 138 _rows = 0; 139 _value[] = V.init; 140 _rowIndex[] = 0; 141 _key[] = null; 142 } 143 144 /* ------------------------------------------------------------ */ 145 override 146 bool put(string s, V v) { 147 int t = 0; 148 size_t k; 149 size_t limit = s.length; 150 for (k = 0; k < limit; k++) { 151 byte c = s[k]; 152 153 int index = __lookup[c & 0x7f]; 154 if (index >= 0) { 155 int idx = t * ROW_SIZE + index; 156 t = _rowIndex[idx]; 157 if (t == 0) { 158 if (++_rows >= _value.length) 159 return false; 160 t = _rowIndex[idx] = _rows; 161 } 162 } else if (c > 127) 163 throw new IllegalArgumentException("non ascii character"); 164 else { 165 if (_bigIndex is null) 166 _bigIndex = new byte[][_value.length]; 167 if (t >= _bigIndex.length) 168 return false; 169 byte[] big = _bigIndex[t]; 170 if (big is null) 171 big = _bigIndex[t] = new byte[128]; 172 t = big[c]; 173 if (t == 0) { 174 if (_rows == _value.length) 175 return false; 176 t = big[c] = ++_rows; 177 } 178 } 179 } 180 181 if (t >= _key.length) { 182 _rows = cast(byte) _key.length; 183 return false; 184 } 185 186 static if(is(V == class)) { 187 _key[t] = v is null ? null : s; 188 } else { 189 _key[t] = v == V.init ? null : s; 190 } 191 _value[t] = v; 192 return true; 193 } 194 195 /* ------------------------------------------------------------ */ 196 override 197 V get(string s, int offset, int len) { 198 int t = 0; 199 for (int i = 0; i < len; i++) { 200 byte c = s[offset + i]; 201 int index = __lookup[c & 0x7f]; 202 if (index >= 0) { 203 int idx = t * ROW_SIZE + index; 204 t = _rowIndex[idx]; 205 if (t == 0) 206 return V.init; 207 } else { 208 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 209 if (big is null) 210 return V.init; 211 t = big[c]; 212 if (t == 0) 213 return V.init; 214 } 215 } 216 return _value[t]; 217 } 218 219 /* ------------------------------------------------------------ */ 220 override 221 V get(ByteBuffer b, int offset, int len) { 222 int t = 0; 223 for (int i = 0; i < len; i++) { 224 byte c = b.get(offset + i); 225 int index = __lookup[c & 0x7f]; 226 if (index >= 0) { 227 int idx = t * ROW_SIZE + index; 228 t = _rowIndex[idx]; 229 if (t == 0) 230 return V.init; 231 } else { 232 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 233 if (big is null) 234 return V.init; 235 t = big[c]; 236 if (t == 0) 237 return V.init; 238 } 239 } 240 return _value[t]; 241 } 242 243 /* ------------------------------------------------------------ */ 244 override 245 V getBest(byte[] b, int offset, int len) { 246 return getBest(0, b, offset, len); 247 } 248 249 /* ------------------------------------------------------------ */ 250 override 251 V getBest(ByteBuffer b, int offset, int len) { 252 if (b.hasArray()) 253 return getBest(0, b.array(), b.arrayOffset() + b.position() + offset, len); 254 return getBest(0, b, offset, len); 255 } 256 257 /* ------------------------------------------------------------ */ 258 override 259 V getBest(string s, int offset, int len) { 260 return getBest(0, s, offset, len); 261 } 262 263 /* ------------------------------------------------------------ */ 264 private V getBest(int t, string s, int offset, int len) { 265 int pos = offset; 266 for (int i = 0; i < len; i++) { 267 byte c = s[pos++]; 268 int index = __lookup[c & 0x7f]; 269 if (index >= 0) { 270 int idx = t * ROW_SIZE + index; 271 int nt = _rowIndex[idx]; 272 if (nt == 0) 273 break; 274 t = nt; 275 } else { 276 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 277 if (big is null) 278 return V.init; 279 int nt = big[c]; 280 if (nt == 0) 281 break; 282 t = nt; 283 } 284 285 // Is the next Trie is a match 286 if (_key[t] !is null) { 287 // Recurse so we can remember this possibility 288 V best = getBest(t, s, offset + i + 1, len - i - 1); 289 290 static if(is(V == class)) { 291 if (best !is null) 292 return best; 293 } else { 294 if (best != V.init) 295 return best; 296 } 297 298 return _value[t]; 299 } 300 } 301 return _value[t]; 302 } 303 304 /* ------------------------------------------------------------ */ 305 private V getBest(int t, byte[] b, int offset, int len) { 306 for (int i = 0; i < len; i++) { 307 byte c = b[offset + i]; 308 int index = __lookup[c & 0x7f]; 309 if (index >= 0) { 310 int idx = t * ROW_SIZE + index; 311 int nt = _rowIndex[idx]; 312 if (nt == 0) 313 break; 314 t = nt; 315 } else { 316 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 317 if (big is null) 318 return V.init; 319 int nt = big[c]; 320 if (nt == 0) 321 break; 322 t = nt; 323 } 324 325 // Is the next Trie is a match 326 if (_key[t] !is null) { 327 // Recurse so we can remember this possibility 328 V best = getBest(t, b, offset + i + 1, len - i - 1); 329 static if(is(V == class)) { 330 if (best !is null) 331 return best; 332 } else { 333 if (best != V.init) 334 return best; 335 } 336 337 break; 338 } 339 } 340 return _value[t]; 341 } 342 343 private V getBest(int t, ByteBuffer b, int offset, int len) { 344 int pos = b.position() + offset; 345 for (int i = 0; i < len; i++) { 346 byte c = b.get(pos++); 347 int index = __lookup[c & 0x7f]; 348 if (index >= 0) { 349 int idx = t * ROW_SIZE + index; 350 int nt = _rowIndex[idx]; 351 if (nt == 0) 352 break; 353 t = nt; 354 } else { 355 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 356 if (big is null) 357 return V.init; 358 int nt = big[c]; 359 if (nt == 0) 360 break; 361 t = nt; 362 } 363 364 // Is the next Trie is a match 365 if (_key[t] !is null) { 366 // Recurse so we can remember this possibility 367 V best = getBest(t, b, offset + i + 1, len - i - 1); 368 369 static if(is(V == class)) { 370 if (best !is null) 371 return best; 372 } else { 373 if (best != V.init) 374 return best; 375 } 376 377 break; 378 } 379 } 380 return _value[t]; 381 } 382 383 384 override 385 string toString() { 386 StringBuilder buf = new StringBuilder(); 387 toString(buf, 0); 388 389 if (buf.length() == 0) 390 return "{}"; 391 392 buf.setCharAt(0, '{'); 393 buf.append('}'); 394 return buf.toString(); 395 } 396 397 398 private void toString(Appendable ot, int t) { 399 void doAppend() { 400 try { 401 ot.append(','); 402 ot.append(_key[t]); 403 ot.append('='); 404 ot.append(_value[t].to!string()); 405 } catch (IOException e) { 406 throw new RuntimeException(e); 407 } 408 } 409 410 static if(is(V == class)) { 411 if (_value[t] !is null) 412 doAppend(); 413 } else { 414 if (_value[t] != V.init) 415 doAppend(); 416 } 417 418 for (int i = 0; i < ROW_SIZE; i++) { 419 int idx = t * ROW_SIZE + i; 420 if (_rowIndex[idx] != 0) 421 toString(ot, _rowIndex[idx]); 422 } 423 424 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 425 if (big !is null) { 426 foreach (int i ; big) 427 if (i != 0) 428 toString(ot, i); 429 } 430 431 } 432 433 override 434 Set!(string) keySet() { 435 Set!(string) keys = new HashSet!(string)(); 436 keySet(keys, 0); 437 return keys; 438 } 439 440 private void keySet(Set!(string) set, int t) { 441 static if(is(V == class)) { 442 if (t < _value.length && _value[t] !is null) 443 set.add(_key[t]); 444 } else { 445 if (t < _value.length && _value[t] != V.init) 446 set.add(_key[t]); 447 } 448 449 for (int i = 0; i < ROW_SIZE; i++) { 450 int idx = t * ROW_SIZE + i; 451 if (idx < _rowIndex.length && _rowIndex[idx] != 0) 452 keySet(set, _rowIndex[idx]); 453 } 454 455 byte[] big = _bigIndex is null || t >= _bigIndex.length ? null : _bigIndex[t]; 456 // if (big !is null) { 457 foreach (int i ; big) 458 if (i != 0) 459 keySet(set, i); 460 // } 461 } 462 463 override 464 bool isFull() { 465 return _rows + 1 >= _key.length; 466 } 467 }