1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.SearchPattern;
13 
14 import hunt.Exceptions;
15 
16 /**
17  * SearchPattern
18  * <p>
19  * Fast search for patterns within strings and arrays of bytes.
20  * Uses an implementation of the Boyer–Moore–Horspool algorithm
21  * with a 256 character alphabet.
22  * <p>
23  * The algorithm has an average-case complexity of O(n)
24  * on random text and O(nm) in the worst case.
25  * where:
26  * m = pattern length
27  * n = length of data to search
28  */
29 class SearchPattern {
30     enum int alphabetSize = 256;
31     private size_t[] table;
32     private byte[] pattern;
33 
34     /**
35      * Produces a SearchPattern instance which can be used
36      * to find matches of the pattern in data
37      *
38      * @param pattern byte array containing the pattern
39      * @return a new SearchPattern instance using the given pattern
40      */
41     static SearchPattern compile(byte[] pattern) {
42         return new SearchPattern(pattern);
43     }
44 
45     /**
46      * Produces a SearchPattern instance which can be used
47      * to find matches of the pattern in data
48      *
49      * @param pattern string containing the pattern
50      * @return a new SearchPattern instance using the given pattern
51      */
52     static SearchPattern compile(string pattern) {
53         return new SearchPattern(cast(byte[])pattern);
54     }
55 
56     /**
57      * @param pattern byte array containing the pattern used for matching
58      */
59     private this(byte[] pattern) {
60         this.pattern = pattern;
61 
62         if (pattern.length == 0)
63             throw new IllegalArgumentException("Empty Pattern");
64 
65         //Build up the pre-processed table for this pattern.
66         table = new size_t[alphabetSize];
67         for (size_t i = 0; i < table.length; ++i)
68             table[i] = pattern.length;
69         for (size_t i = 0; i < pattern.length - 1; ++i)
70             table[0xff & pattern[i]] = pattern.length - 1 - i;
71     }
72 
73 
74     /**
75      * Search for a complete match of the pattern within the data
76      *
77      * @param data   The data in which to search for. The data may be arbitrary binary data,
78      *               but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
79      * @param offset The offset within the data to start the search
80      * @param length The length of the data to search
81      * @return The index within the data array at which the first instance of the pattern or -1 if not found
82      */
83     int match(byte[] data, int offset, int length) {
84         validate(data, offset, length);
85         int skip = offset;
86         int len = cast(int)pattern.length;
87         while (skip <= offset + length - len) {
88             for (size_t i = len - 1; data[skip + i] == pattern[i]; i--) {
89                 if (i == 0) return skip;
90             }
91 
92             skip += table[0xff & data[skip + len - 1]];
93         }
94 
95         return -1;
96     }
97 
98     /**
99      * Search for a partial match of the pattern at the end of the data.
100      *
101      * @param data   The data in which to search for. The data may be arbitrary binary data,
102      *               but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
103      * @param offset The offset within the data to start the search
104      * @param length The length of the data to search
105      * @return the length of the partial pattern matched and 0 for no match.
106      */
107     int endsWith(byte[] data, int offset, int length) {
108         validate(data, offset, length);
109 
110         int len = cast(int)pattern.length;
111         int skip = (len <= length) ? (offset + length - len) : offset;
112         while (skip < offset + length) {
113             for (size_t i = (offset + length - 1) - skip; data[skip + i] == pattern[i]; --i)
114                 if (i == 0) return cast(int) (offset + length - skip);
115 
116             if (skip + len - 1 < data.length)
117                 skip += table[0xff & data[skip + len - 1]];
118             else
119                 skip++;
120         }
121 
122         return 0;
123     }
124 
125     /**
126      * Search for a possibly partial match of the pattern at the start of the data.
127      *
128      * @param data    The data in which to search for. The data may be arbitrary binary data,
129      *                but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
130      * @param offset  The offset within the data to start the search
131      * @param length  The length of the data to search
132      * @param matched The length of the partial pattern already matched
133      * @return the length of the partial pattern matched and 0 for no match.
134      */
135     int startsWith(byte[] data, int offset, int length, int matched) {
136         validate(data, offset, length);
137 
138         int matchedCount = 0;
139 
140         for (size_t i = 0; i < cast(int)pattern.length - matched && i < length; i++) {
141             if (data[offset + i] == pattern[i + matched])
142                 matchedCount++;
143             else
144                 return 0;
145         }
146 
147         return matched + matchedCount;
148     }
149 
150     /**
151      * Performs legality checks for standard arguments input into SearchPattern methods.
152      *
153      * @param data   The data in which to search for. The data may be arbitrary binary data,
154      *               but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
155      * @param offset The offset within the data to start the search
156      * @param length The length of the data to search
157      */
158     private void validate(byte[] data, int offset, int length) {
159         if (offset < 0)
160             throw new IllegalArgumentException("offset was negative");
161         else if (length < 0)
162             throw new IllegalArgumentException("length was negative");
163         else if (offset + length > data.length)
164             throw new IllegalArgumentException("(offset+length) out of bounds of data[]");
165     }
166 
167     /**
168      * @return The length of the pattern in bytes.
169      */
170     int getLength() {
171         return cast(int)pattern.length;
172     }
173 }