1 /* 2 * Hunt - A refined core library for D programming language. 3 * 4 * Copyright (C) 2018-2019 HuntLabs 5 * 6 * Website: https://www.huntlabs.net/ 7 * 8 * Licensed under the Apache-2.0 License. 9 * 10 */ 11 12 module hunt.text.SearchPattern; 13 14 import hunt.Exceptions; 15 16 /** 17 * SearchPattern 18 * <p> 19 * Fast search for patterns within strings and arrays of bytes. 20 * Uses an implementation of the Boyer–Moore–Horspool algorithm 21 * with a 256 character alphabet. 22 * <p> 23 * The algorithm has an average-case complexity of O(n) 24 * on random text and O(nm) in the worst case. 25 * where: 26 * m = pattern length 27 * n = length of data to search 28 */ 29 class SearchPattern { 30 enum int alphabetSize = 256; 31 private size_t[] table; 32 private byte[] pattern; 33 34 /** 35 * Produces a SearchPattern instance which can be used 36 * to find matches of the pattern in data 37 * 38 * @param pattern byte array containing the pattern 39 * @return a new SearchPattern instance using the given pattern 40 */ 41 static SearchPattern compile(byte[] pattern) { 42 return new SearchPattern(pattern); 43 } 44 45 /** 46 * Produces a SearchPattern instance which can be used 47 * to find matches of the pattern in data 48 * 49 * @param pattern string containing the pattern 50 * @return a new SearchPattern instance using the given pattern 51 */ 52 static SearchPattern compile(string pattern) { 53 return new SearchPattern(cast(byte[])pattern); 54 } 55 56 /** 57 * @param pattern byte array containing the pattern used for matching 58 */ 59 private this(byte[] pattern) { 60 this.pattern = pattern; 61 62 if (pattern.length == 0) 63 throw new IllegalArgumentException("Empty Pattern"); 64 65 //Build up the pre-processed table for this pattern. 66 table = new size_t[alphabetSize]; 67 for (size_t i = 0; i < table.length; ++i) 68 table[i] = pattern.length; 69 for (size_t i = 0; i < pattern.length - 1; ++i) 70 table[0xff & pattern[i]] = pattern.length - 1 - i; 71 } 72 73 74 /** 75 * Search for a complete match of the pattern within the data 76 * 77 * @param data The data in which to search for. The data may be arbitrary binary data, 78 * but the pattern will always be {@link StandardCharsets#US_ASCII} encoded. 79 * @param offset The offset within the data to start the search 80 * @param length The length of the data to search 81 * @return The index within the data array at which the first instance of the pattern or -1 if not found 82 */ 83 int match(byte[] data, int offset, int length) { 84 validate(data, offset, length); 85 int skip = offset; 86 int len = cast(int)pattern.length; 87 while (skip <= offset + length - len) { 88 for (size_t i = len - 1; data[skip + i] == pattern[i]; i--) { 89 if (i == 0) return skip; 90 } 91 92 skip += table[0xff & data[skip + len - 1]]; 93 } 94 95 return -1; 96 } 97 98 /** 99 * Search for a partial match of the pattern at the end of the data. 100 * 101 * @param data The data in which to search for. The data may be arbitrary binary data, 102 * but the pattern will always be {@link StandardCharsets#US_ASCII} encoded. 103 * @param offset The offset within the data to start the search 104 * @param length The length of the data to search 105 * @return the length of the partial pattern matched and 0 for no match. 106 */ 107 int endsWith(byte[] data, int offset, int length) { 108 validate(data, offset, length); 109 110 int len = cast(int)pattern.length; 111 int skip = (len <= length) ? (offset + length - len) : offset; 112 while (skip < offset + length) { 113 for (size_t i = (offset + length - 1) - skip; data[skip + i] == pattern[i]; --i) 114 if (i == 0) return cast(int) (offset + length - skip); 115 116 if (skip + len - 1 < data.length) 117 skip += table[0xff & data[skip + len - 1]]; 118 else 119 skip++; 120 } 121 122 return 0; 123 } 124 125 /** 126 * Search for a possibly partial match of the pattern at the start of the data. 127 * 128 * @param data The data in which to search for. The data may be arbitrary binary data, 129 * but the pattern will always be {@link StandardCharsets#US_ASCII} encoded. 130 * @param offset The offset within the data to start the search 131 * @param length The length of the data to search 132 * @param matched The length of the partial pattern already matched 133 * @return the length of the partial pattern matched and 0 for no match. 134 */ 135 int startsWith(byte[] data, int offset, int length, int matched) { 136 validate(data, offset, length); 137 138 int matchedCount = 0; 139 140 for (size_t i = 0; i < cast(int)pattern.length - matched && i < length; i++) { 141 if (data[offset + i] == pattern[i + matched]) 142 matchedCount++; 143 else 144 return 0; 145 } 146 147 return matched + matchedCount; 148 } 149 150 /** 151 * Performs legality checks for standard arguments input into SearchPattern methods. 152 * 153 * @param data The data in which to search for. The data may be arbitrary binary data, 154 * but the pattern will always be {@link StandardCharsets#US_ASCII} encoded. 155 * @param offset The offset within the data to start the search 156 * @param length The length of the data to search 157 */ 158 private void validate(byte[] data, int offset, int length) { 159 if (offset < 0) 160 throw new IllegalArgumentException("offset was negative"); 161 else if (length < 0) 162 throw new IllegalArgumentException("length was negative"); 163 else if (offset + length > data.length) 164 throw new IllegalArgumentException("(offset+length) out of bounds of data[]"); 165 } 166 167 /** 168 * @return The length of the pattern in bytes. 169 */ 170 int getLength() { 171 return cast(int)pattern.length; 172 } 173 }