Line data Source code
1 : /* stringlib: fastsearch implementation */
2 :
3 : #ifndef STRINGLIB_FASTSEARCH_H
4 : #define STRINGLIB_FASTSEARCH_H
5 :
6 : /* fast search/count implementation, based on a mix between boyer-
7 : moore and horspool, with a few more bells and whistles on the top.
8 : for some more background, see: http://effbot.org/zone/stringlib.htm */
9 :
10 : /* note: fastsearch may access s[n], which isn't a problem when using
11 : Python's ordinary string types, but may cause problems if you're
12 : using this code in other contexts. also, the count mode returns -1
13 : if there cannot possible be a match in the target string, and 0 if
14 : it has actually checked for matches, but didn't find any. callers
15 : beware! */
16 :
17 : #define FAST_COUNT 0
18 : #define FAST_SEARCH 1
19 : #define FAST_RSEARCH 2
20 :
21 : #if LONG_BIT >= 128
22 : #define STRINGLIB_BLOOM_WIDTH 128
23 : #elif LONG_BIT >= 64
24 : #define STRINGLIB_BLOOM_WIDTH 64
25 : #elif LONG_BIT >= 32
26 : #define STRINGLIB_BLOOM_WIDTH 32
27 : #else
28 : #error "LONG_BIT is smaller than 32"
29 : #endif
30 :
31 : #define STRINGLIB_BLOOM_ADD(mask, ch) \
32 : ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
33 : #define STRINGLIB_BLOOM(mask, ch) \
34 : ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
35 :
36 : Py_LOCAL_INLINE(Py_ssize_t)
37 8103 : fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
38 : const STRINGLIB_CHAR* p, Py_ssize_t m,
39 : Py_ssize_t maxcount, int mode)
40 : {
41 : unsigned long mask;
42 8103 : Py_ssize_t skip, count = 0;
43 : Py_ssize_t i, j, mlast, w;
44 :
45 8103 : w = n - m;
46 :
47 8103 : if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
48 0 : return -1;
49 :
50 : /* look for special cases */
51 8103 : if (m <= 1) {
52 8103 : if (m <= 0)
53 0 : return -1;
54 : /* use special case for 1-character strings */
55 8103 : if (mode == FAST_COUNT) {
56 0 : for (i = 0; i < n; i++)
57 0 : if (s[i] == p[0]) {
58 0 : count++;
59 0 : if (count == maxcount)
60 0 : return maxcount;
61 : }
62 0 : return count;
63 8103 : } else if (mode == FAST_SEARCH) {
64 161403 : for (i = 0; i < n; i++)
65 158505 : if (s[i] == p[0])
66 5187 : return i;
67 : } else { /* FAST_RSEARCH */
68 114 : for (i = n - 1; i > -1; i--)
69 111 : if (s[i] == p[0])
70 15 : return i;
71 : }
72 2901 : return -1;
73 : }
74 :
75 0 : mlast = m - 1;
76 0 : skip = mlast - 1;
77 0 : mask = 0;
78 :
79 0 : if (mode != FAST_RSEARCH) {
80 :
81 : /* create compressed boyer-moore delta 1 table */
82 :
83 : /* process pattern[:-1] */
84 0 : for (i = 0; i < mlast; i++) {
85 0 : STRINGLIB_BLOOM_ADD(mask, p[i]);
86 0 : if (p[i] == p[mlast])
87 0 : skip = mlast - i - 1;
88 : }
89 : /* process pattern[-1] outside the loop */
90 0 : STRINGLIB_BLOOM_ADD(mask, p[mlast]);
91 :
92 0 : for (i = 0; i <= w; i++) {
93 : /* note: using mlast in the skip path slows things down on x86 */
94 0 : if (s[i+m-1] == p[m-1]) {
95 : /* candidate match */
96 0 : for (j = 0; j < mlast; j++)
97 0 : if (s[i+j] != p[j])
98 0 : break;
99 0 : if (j == mlast) {
100 : /* got a match! */
101 0 : if (mode != FAST_COUNT)
102 0 : return i;
103 0 : count++;
104 0 : if (count == maxcount)
105 0 : return maxcount;
106 0 : i = i + mlast;
107 0 : continue;
108 : }
109 : /* miss: check if next character is part of pattern */
110 0 : if (!STRINGLIB_BLOOM(mask, s[i+m]))
111 0 : i = i + m;
112 : else
113 0 : i = i + skip;
114 : } else {
115 : /* skip: check if next character is part of pattern */
116 0 : if (!STRINGLIB_BLOOM(mask, s[i+m]))
117 0 : i = i + m;
118 : }
119 : }
120 : } else { /* FAST_RSEARCH */
121 :
122 : /* create compressed boyer-moore delta 1 table */
123 :
124 : /* process pattern[0] outside the loop */
125 0 : STRINGLIB_BLOOM_ADD(mask, p[0]);
126 : /* process pattern[:0:-1] */
127 0 : for (i = mlast; i > 0; i--) {
128 0 : STRINGLIB_BLOOM_ADD(mask, p[i]);
129 0 : if (p[i] == p[0])
130 0 : skip = i - 1;
131 : }
132 :
133 0 : for (i = w; i >= 0; i--) {
134 0 : if (s[i] == p[0]) {
135 : /* candidate match */
136 0 : for (j = mlast; j > 0; j--)
137 0 : if (s[i+j] != p[j])
138 0 : break;
139 0 : if (j == 0)
140 : /* got a match! */
141 0 : return i;
142 : /* miss: check if previous character is part of pattern */
143 0 : if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
144 0 : i = i - m;
145 : else
146 0 : i = i - skip;
147 : } else {
148 : /* skip: check if previous character is part of pattern */
149 0 : if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
150 0 : i = i - m;
151 : }
152 : }
153 : }
154 :
155 0 : if (mode != FAST_COUNT)
156 0 : return -1;
157 0 : return count;
158 : }
159 :
160 : #endif
|