Line data Source code
1 : /*
2 : * Secret Labs' Regular Expression Engine
3 : *
4 : * regular expression matching engine
5 : *
6 : * partial history:
7 : * 1999-10-24 fl created (based on existing template matcher code)
8 : * 2000-03-06 fl first alpha, sort of
9 : * 2000-08-01 fl fixes for 1.6b1
10 : * 2000-08-07 fl use PyOS_CheckStack() if available
11 : * 2000-09-20 fl added expand method
12 : * 2001-03-20 fl lots of fixes for 2.1b2
13 : * 2001-04-15 fl export copyright as Python attribute, not global
14 : * 2001-04-28 fl added __copy__ methods (work in progress)
15 : * 2001-05-14 fl fixes for 1.5.2 compatibility
16 : * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 : * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 : * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 : * 2001-10-21 fl added sub/subn primitive
20 : * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 : * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 : * 2002-11-09 fl fixed empty sub/subn return type
23 : * 2003-04-18 mvl fully support 4-byte codes
24 : * 2003-10-17 gn implemented non recursive scheme
25 : *
26 : * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 : *
28 : * This version of the SRE library can be redistributed under CNRI's
29 : * Python 1.6 license. For any other use, please contact Secret Labs
30 : * AB (info@pythonware.com).
31 : *
32 : * Portions of this engine have been developed in cooperation with
33 : * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 : * other compatibility work.
35 : */
36 :
37 : #ifndef SRE_RECURSIVE
38 :
39 : static char copyright[] =
40 : " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41 :
42 : #define PY_SSIZE_T_CLEAN
43 :
44 : #include "Python.h"
45 : #include "structmember.h" /* offsetof */
46 :
47 : #include "sre.h"
48 :
49 : #include <ctype.h>
50 :
51 : /* name of this module, minus the leading underscore */
52 : #if !defined(SRE_MODULE)
53 : #define SRE_MODULE "sre"
54 : #endif
55 :
56 : #define SRE_PY_MODULE "re"
57 :
58 : /* defining this one enables tracing */
59 : #undef VERBOSE
60 :
61 : #if PY_VERSION_HEX >= 0x01060000
62 : #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 : /* defining this enables unicode support (default under 1.6a1 and later) */
64 : #define HAVE_UNICODE
65 : #endif
66 : #endif
67 :
68 : /* -------------------------------------------------------------------- */
69 : /* optional features */
70 :
71 : /* enables fast searching */
72 : #define USE_FAST_SEARCH
73 :
74 : /* enables aggressive inlining (always on for Visual C) */
75 : #undef USE_INLINE
76 :
77 : /* enables copy/deepcopy handling (work in progress) */
78 : #undef USE_BUILTIN_COPY
79 :
80 : #if PY_VERSION_HEX < 0x01060000
81 : #define PyObject_DEL(op) PyMem_DEL((op))
82 : #endif
83 :
84 : /* -------------------------------------------------------------------- */
85 :
86 : #if defined(_MSC_VER)
87 : #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 : #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 : /* fastest possible local call under MSVC */
90 : #define LOCAL(type) static __inline type __fastcall
91 : #elif defined(USE_INLINE)
92 : #define LOCAL(type) static inline type
93 : #else
94 : #define LOCAL(type) static type
95 : #endif
96 :
97 : /* error codes */
98 : #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 : #define SRE_ERROR_STATE -2 /* illegal state */
100 : #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 : #define SRE_ERROR_MEMORY -9 /* out of memory */
102 : #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103 :
104 : #if defined(VERBOSE)
105 : #define TRACE(v) printf v
106 : #else
107 : #define TRACE(v)
108 : #endif
109 :
110 : /* -------------------------------------------------------------------- */
111 : /* search engine state */
112 :
113 : /* default character predicates (run sre_chars.py to regenerate tables) */
114 :
115 : #define SRE_DIGIT_MASK 1
116 : #define SRE_SPACE_MASK 2
117 : #define SRE_LINEBREAK_MASK 4
118 : #define SRE_ALNUM_MASK 8
119 : #define SRE_WORD_MASK 16
120 :
121 : /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 :
123 : static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 : 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 : 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 : 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 : 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 : 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 :
131 : static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 : 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 : 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 : 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 : 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 : 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 : 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 : 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 : 120, 121, 122, 123, 124, 125, 126, 127 };
140 :
141 : #define SRE_IS_DIGIT(ch)\
142 : ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 : #define SRE_IS_SPACE(ch)\
144 : ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 : #define SRE_IS_LINEBREAK(ch)\
146 : ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 : #define SRE_IS_ALNUM(ch)\
148 : ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 : #define SRE_IS_WORD(ch)\
150 : ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 :
152 396 : static unsigned int sre_lower(unsigned int ch)
153 : {
154 396 : return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155 : }
156 :
157 : /* locale-specific character predicates */
158 : /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 : * warnings when c's type supports only numbers < N+1 */
160 : #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 : #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 : #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 : #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 : #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 :
166 0 : static unsigned int sre_lower_locale(unsigned int ch)
167 : {
168 0 : return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169 : }
170 :
171 : /* unicode-specific character predicates */
172 :
173 : #if defined(HAVE_UNICODE)
174 :
175 : #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 : #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 : #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 : #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 : #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 :
181 0 : static unsigned int sre_lower_unicode(unsigned int ch)
182 : {
183 0 : return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184 : }
185 :
186 : #endif
187 :
188 : LOCAL(int)
189 14817 : sre_category(SRE_CODE category, unsigned int ch)
190 : {
191 14817 : switch (category) {
192 :
193 : case SRE_CATEGORY_DIGIT:
194 0 : return SRE_IS_DIGIT(ch);
195 : case SRE_CATEGORY_NOT_DIGIT:
196 0 : return !SRE_IS_DIGIT(ch);
197 : case SRE_CATEGORY_SPACE:
198 3813 : return SRE_IS_SPACE(ch);
199 : case SRE_CATEGORY_NOT_SPACE:
200 0 : return !SRE_IS_SPACE(ch);
201 : case SRE_CATEGORY_WORD:
202 11004 : return SRE_IS_WORD(ch);
203 : case SRE_CATEGORY_NOT_WORD:
204 0 : return !SRE_IS_WORD(ch);
205 : case SRE_CATEGORY_LINEBREAK:
206 0 : return SRE_IS_LINEBREAK(ch);
207 : case SRE_CATEGORY_NOT_LINEBREAK:
208 0 : return !SRE_IS_LINEBREAK(ch);
209 :
210 : case SRE_CATEGORY_LOC_WORD:
211 0 : return SRE_LOC_IS_WORD(ch);
212 : case SRE_CATEGORY_LOC_NOT_WORD:
213 0 : return !SRE_LOC_IS_WORD(ch);
214 :
215 : #if defined(HAVE_UNICODE)
216 : case SRE_CATEGORY_UNI_DIGIT:
217 0 : return SRE_UNI_IS_DIGIT(ch);
218 : case SRE_CATEGORY_UNI_NOT_DIGIT:
219 0 : return !SRE_UNI_IS_DIGIT(ch);
220 : case SRE_CATEGORY_UNI_SPACE:
221 0 : return SRE_UNI_IS_SPACE(ch);
222 : case SRE_CATEGORY_UNI_NOT_SPACE:
223 0 : return !SRE_UNI_IS_SPACE(ch);
224 : case SRE_CATEGORY_UNI_WORD:
225 0 : return SRE_UNI_IS_WORD(ch);
226 : case SRE_CATEGORY_UNI_NOT_WORD:
227 0 : return !SRE_UNI_IS_WORD(ch);
228 : case SRE_CATEGORY_UNI_LINEBREAK:
229 0 : return SRE_UNI_IS_LINEBREAK(ch);
230 : case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 0 : return !SRE_UNI_IS_LINEBREAK(ch);
232 : #else
233 : case SRE_CATEGORY_UNI_DIGIT:
234 : return SRE_IS_DIGIT(ch);
235 : case SRE_CATEGORY_UNI_NOT_DIGIT:
236 : return !SRE_IS_DIGIT(ch);
237 : case SRE_CATEGORY_UNI_SPACE:
238 : return SRE_IS_SPACE(ch);
239 : case SRE_CATEGORY_UNI_NOT_SPACE:
240 : return !SRE_IS_SPACE(ch);
241 : case SRE_CATEGORY_UNI_WORD:
242 : return SRE_LOC_IS_WORD(ch);
243 : case SRE_CATEGORY_UNI_NOT_WORD:
244 : return !SRE_LOC_IS_WORD(ch);
245 : case SRE_CATEGORY_UNI_LINEBREAK:
246 : return SRE_IS_LINEBREAK(ch);
247 : case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 : return !SRE_IS_LINEBREAK(ch);
249 : #endif
250 : }
251 0 : return 0;
252 : }
253 :
254 : /* helpers */
255 :
256 : static void
257 6063 : data_stack_dealloc(SRE_STATE* state)
258 : {
259 6063 : if (state->data_stack) {
260 4905 : PyMem_FREE(state->data_stack);
261 4905 : state->data_stack = NULL;
262 : }
263 6063 : state->data_stack_size = state->data_stack_base = 0;
264 6063 : }
265 :
266 : static int
267 4905 : data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 : {
269 : Py_ssize_t minsize, cursize;
270 4905 : minsize = state->data_stack_base+size;
271 4905 : cursize = state->data_stack_size;
272 4905 : if (cursize < minsize) {
273 : void* stack;
274 4905 : cursize = minsize+minsize/4+1024;
275 : TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
276 4905 : stack = PyMem_REALLOC(state->data_stack, cursize);
277 4905 : if (!stack) {
278 0 : data_stack_dealloc(state);
279 0 : return SRE_ERROR_MEMORY;
280 : }
281 4905 : state->data_stack = (char *)stack;
282 4905 : state->data_stack_size = cursize;
283 : }
284 4905 : return 0;
285 : }
286 :
287 : /* generate 8-bit version */
288 :
289 : #define SRE_CHAR unsigned char
290 : #define SRE_AT sre_at
291 : #define SRE_COUNT sre_count
292 : #define SRE_CHARSET sre_charset
293 : #define SRE_INFO sre_info
294 : #define SRE_MATCH sre_match
295 : #define SRE_MATCH_CONTEXT sre_match_context
296 : #define SRE_SEARCH sre_search
297 : #define SRE_LITERAL_TEMPLATE sre_literal_template
298 :
299 : #if defined(HAVE_UNICODE)
300 :
301 : #define SRE_RECURSIVE
302 : #include "_sre.c"
303 : #undef SRE_RECURSIVE
304 :
305 : #undef SRE_LITERAL_TEMPLATE
306 : #undef SRE_SEARCH
307 : #undef SRE_MATCH
308 : #undef SRE_MATCH_CONTEXT
309 : #undef SRE_INFO
310 : #undef SRE_CHARSET
311 : #undef SRE_COUNT
312 : #undef SRE_AT
313 : #undef SRE_CHAR
314 :
315 : /* generate 16-bit unicode version */
316 :
317 : #define SRE_CHAR Py_UNICODE
318 : #define SRE_AT sre_uat
319 : #define SRE_COUNT sre_ucount
320 : #define SRE_CHARSET sre_ucharset
321 : #define SRE_INFO sre_uinfo
322 : #define SRE_MATCH sre_umatch
323 : #define SRE_MATCH_CONTEXT sre_umatch_context
324 : #define SRE_SEARCH sre_usearch
325 : #define SRE_LITERAL_TEMPLATE sre_uliteral_template
326 : #endif
327 :
328 : #endif /* SRE_RECURSIVE */
329 :
330 : /* -------------------------------------------------------------------- */
331 : /* String matching engine */
332 :
333 : /* the following section is compiled twice, with different character
334 : settings */
335 :
336 : LOCAL(int)
337 0 : SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 : {
339 : /* check if pointer is at given position */
340 :
341 : Py_ssize_t thisp, thatp;
342 :
343 0 : switch (at) {
344 :
345 : case SRE_AT_BEGINNING:
346 : case SRE_AT_BEGINNING_STRING:
347 0 : return ((void*) ptr == state->beginning);
348 :
349 : case SRE_AT_BEGINNING_LINE:
350 0 : return ((void*) ptr == state->beginning ||
351 0 : SRE_IS_LINEBREAK((int) ptr[-1]));
352 :
353 : case SRE_AT_END:
354 0 : return (((SRE_CHAR *)state->end - ptr == 1 &&
355 0 : SRE_IS_LINEBREAK((int) ptr[0])) ||
356 0 : ((void*) ptr == state->end));
357 :
358 : case SRE_AT_END_LINE:
359 0 : return ((void*) ptr == state->end ||
360 0 : SRE_IS_LINEBREAK((int) ptr[0]));
361 :
362 : case SRE_AT_END_STRING:
363 0 : return ((void*) ptr == state->end);
364 :
365 : case SRE_AT_BOUNDARY:
366 0 : if (state->beginning == state->end)
367 0 : return 0;
368 0 : thatp = ((void*) ptr > state->beginning) ?
369 0 : SRE_IS_WORD((int) ptr[-1]) : 0;
370 0 : thisp = ((void*) ptr < state->end) ?
371 0 : SRE_IS_WORD((int) ptr[0]) : 0;
372 0 : return thisp != thatp;
373 :
374 : case SRE_AT_NON_BOUNDARY:
375 0 : if (state->beginning == state->end)
376 0 : return 0;
377 0 : thatp = ((void*) ptr > state->beginning) ?
378 0 : SRE_IS_WORD((int) ptr[-1]) : 0;
379 0 : thisp = ((void*) ptr < state->end) ?
380 0 : SRE_IS_WORD((int) ptr[0]) : 0;
381 0 : return thisp == thatp;
382 :
383 : case SRE_AT_LOC_BOUNDARY:
384 0 : if (state->beginning == state->end)
385 0 : return 0;
386 0 : thatp = ((void*) ptr > state->beginning) ?
387 0 : SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388 0 : thisp = ((void*) ptr < state->end) ?
389 0 : SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390 0 : return thisp != thatp;
391 :
392 : case SRE_AT_LOC_NON_BOUNDARY:
393 0 : if (state->beginning == state->end)
394 0 : return 0;
395 0 : thatp = ((void*) ptr > state->beginning) ?
396 0 : SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397 0 : thisp = ((void*) ptr < state->end) ?
398 0 : SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399 0 : return thisp == thatp;
400 :
401 : #if defined(HAVE_UNICODE)
402 : case SRE_AT_UNI_BOUNDARY:
403 0 : if (state->beginning == state->end)
404 0 : return 0;
405 0 : thatp = ((void*) ptr > state->beginning) ?
406 0 : SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407 0 : thisp = ((void*) ptr < state->end) ?
408 0 : SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409 0 : return thisp != thatp;
410 :
411 : case SRE_AT_UNI_NON_BOUNDARY:
412 0 : if (state->beginning == state->end)
413 0 : return 0;
414 0 : thatp = ((void*) ptr > state->beginning) ?
415 0 : SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416 0 : thisp = ((void*) ptr < state->end) ?
417 0 : SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418 0 : return thisp == thatp;
419 : #endif
420 :
421 : }
422 :
423 0 : return 0;
424 : }
425 :
426 : LOCAL(int)
427 15188 : SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 : {
429 : /* check if character is a member of the given set */
430 :
431 15188 : int ok = 1;
432 :
433 : for (;;) {
434 21018 : switch (*set++) {
435 :
436 : case SRE_OP_FAILURE:
437 5818 : return !ok;
438 :
439 : case SRE_OP_LITERAL:
440 : /* <LITERAL> <code> */
441 0 : if (ch == set[0])
442 0 : return ok;
443 0 : set++;
444 0 : break;
445 :
446 : case SRE_OP_CATEGORY:
447 : /* <CATEGORY> <code> */
448 14817 : if (sre_category(set[0], (int) ch))
449 9321 : return ok;
450 5496 : set += 1;
451 5496 : break;
452 :
453 : case SRE_OP_CHARSET:
454 : if (sizeof(SRE_CODE) == 2) {
455 : /* <CHARSET> <bitmap> (16 bits per code word) */
456 : if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 : return ok;
458 : set += 16;
459 : }
460 : else {
461 : /* <CHARSET> <bitmap> (32 bits per code word) */
462 133 : if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
463 49 : return ok;
464 84 : set += 8;
465 : }
466 84 : break;
467 :
468 : case SRE_OP_RANGE:
469 : /* <RANGE> <lower> <upper> */
470 238 : if (set[0] <= ch && ch <= set[1])
471 0 : return ok;
472 238 : set += 2;
473 238 : break;
474 :
475 : case SRE_OP_NEGATE:
476 12 : ok = !ok;
477 12 : break;
478 :
479 : case SRE_OP_BIGCHARSET:
480 : /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 : {
482 : Py_ssize_t count, block;
483 0 : count = *(set++);
484 :
485 : if (sizeof(SRE_CODE) == 2) {
486 : block = ((unsigned char*)set)[ch >> 8];
487 : set += 128;
488 : if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 : return ok;
490 : set += count*16;
491 : }
492 : else {
493 : /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 : * warnings when c's type supports only numbers < N+1 */
495 0 : if (!(ch & ~65535))
496 0 : block = ((unsigned char*)set)[ch >> 8];
497 : else
498 0 : block = -1;
499 0 : set += 64;
500 0 : if (block >=0 &&
501 0 : (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
502 0 : return ok;
503 0 : set += count*8;
504 : }
505 0 : break;
506 : }
507 :
508 : default:
509 : /* internal error -- there's not much we can do about it
510 : here, so let's just pretend it didn't match... */
511 0 : return 0;
512 : }
513 5830 : }
514 : }
515 :
516 : LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 :
518 : LOCAL(Py_ssize_t)
519 6187 : SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 : {
521 : SRE_CODE chr;
522 6187 : SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 6187 : SRE_CHAR* end = (SRE_CHAR *)state->end;
524 : Py_ssize_t i;
525 :
526 : /* adjust end */
527 6187 : if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
528 16 : end = ptr + maxcount;
529 :
530 6187 : switch (pattern[0]) {
531 :
532 : case SRE_OP_IN:
533 : /* repeated set */
534 : TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 21011 : while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 9365 : ptr++;
537 5823 : break;
538 :
539 : case SRE_OP_ANY:
540 : /* repeated dot wildcard. */
541 : TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542 15723 : while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 15027 : ptr++;
544 348 : break;
545 :
546 : case SRE_OP_ANY_ALL:
547 : /* repeated dot wildcard. skip to the end of the target
548 : string, and backtrack from there */
549 : TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550 0 : ptr = end;
551 0 : break;
552 :
553 : case SRE_OP_LITERAL:
554 : /* repeated literal */
555 16 : chr = pattern[1];
556 : TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557 32 : while (ptr < end && (SRE_CODE) *ptr == chr)
558 0 : ptr++;
559 16 : break;
560 :
561 : case SRE_OP_LITERAL_IGNORE:
562 : /* repeated literal */
563 0 : chr = pattern[1];
564 : TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565 0 : while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 0 : ptr++;
567 0 : break;
568 :
569 : case SRE_OP_NOT_LITERAL:
570 : /* repeated non-literal */
571 0 : chr = pattern[1];
572 : TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573 0 : while (ptr < end && (SRE_CODE) *ptr != chr)
574 0 : ptr++;
575 0 : break;
576 :
577 : case SRE_OP_NOT_LITERAL_IGNORE:
578 : /* repeated non-literal */
579 0 : chr = pattern[1];
580 : TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581 0 : while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 0 : ptr++;
583 0 : break;
584 :
585 : default:
586 : /* repeated single character pattern */
587 : TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588 0 : while ((SRE_CHAR*) state->ptr < end) {
589 0 : i = SRE_MATCH(state, pattern);
590 0 : if (i < 0)
591 0 : return i;
592 0 : if (!i)
593 0 : break;
594 : }
595 : TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
596 : (SRE_CHAR*) state->ptr - ptr));
597 0 : return (SRE_CHAR*) state->ptr - ptr;
598 : }
599 :
600 : TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601 : ptr - (SRE_CHAR*) state->ptr));
602 6187 : return ptr - (SRE_CHAR*) state->ptr;
603 : }
604 :
605 : #if 0 /* not used in this release */
606 : LOCAL(int)
607 : SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608 : {
609 : /* check if an SRE_OP_INFO block matches at the current position.
610 : returns the number of SRE_CODE objects to skip if successful, 0
611 : if no match */
612 :
613 : SRE_CHAR* end = state->end;
614 : SRE_CHAR* ptr = state->ptr;
615 : Py_ssize_t i;
616 :
617 : /* check minimal length */
618 : if (pattern[3] && (end - ptr) < pattern[3])
619 : return 0;
620 :
621 : /* check known prefix */
622 : if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623 : /* <length> <skip> <prefix data> <overlap data> */
624 : for (i = 0; i < pattern[5]; i++)
625 : if ((SRE_CODE) ptr[i] != pattern[7 + i])
626 : return 0;
627 : return pattern[0] + 2 * pattern[6];
628 : }
629 : return pattern[0];
630 : }
631 : #endif
632 :
633 : /* The macros below should be used to protect recursive SRE_MATCH()
634 : * calls that *failed* and do *not* return immediately (IOW, those
635 : * that will backtrack). Explaining:
636 : *
637 : * - Recursive SRE_MATCH() returned true: that's usually a success
638 : * (besides atypical cases like ASSERT_NOT), therefore there's no
639 : * reason to restore lastmark;
640 : *
641 : * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642 : * is returning to the caller: If the current SRE_MATCH() is the
643 : * top function of the recursion, returning false will be a matching
644 : * failure, and it doesn't matter where lastmark is pointing to.
645 : * If it's *not* the top function, it will be a recursive SRE_MATCH()
646 : * failure by itself, and the calling SRE_MATCH() will have to deal
647 : * with the failure by the same rules explained here (it will restore
648 : * lastmark by itself if necessary);
649 : *
650 : * - Recursive SRE_MATCH() returned false, and will continue the
651 : * outside 'for' loop: must be protected when breaking, since the next
652 : * OP could potentially depend on lastmark;
653 : *
654 : * - Recursive SRE_MATCH() returned false, and will be called again
655 : * inside a local for/while loop: must be protected between each
656 : * loop iteration, since the recursive SRE_MATCH() could do anything,
657 : * and could potentially depend on lastmark.
658 : *
659 : * For more information, check the discussion at SF patch #712900.
660 : */
661 : #define LASTMARK_SAVE() \
662 : do { \
663 : ctx->lastmark = state->lastmark; \
664 : ctx->lastindex = state->lastindex; \
665 : } while (0)
666 : #define LASTMARK_RESTORE() \
667 : do { \
668 : state->lastmark = ctx->lastmark; \
669 : state->lastindex = ctx->lastindex; \
670 : } while (0)
671 :
672 : #define RETURN_ERROR(i) do { return i; } while(0)
673 : #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674 : #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675 :
676 : #define RETURN_ON_ERROR(i) \
677 : do { if (i < 0) RETURN_ERROR(i); } while (0)
678 : #define RETURN_ON_SUCCESS(i) \
679 : do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680 : #define RETURN_ON_FAILURE(i) \
681 : do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682 :
683 : #define SFY(x) #x
684 :
685 : #define DATA_STACK_ALLOC(state, type, ptr) \
686 : do { \
687 : alloc_pos = state->data_stack_base; \
688 : TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689 : "(%" PY_FORMAT_SIZE_T "d)\n", \
690 : SFY(type), alloc_pos, sizeof(type))); \
691 : if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692 : int j = data_stack_grow(state, sizeof(type)); \
693 : if (j < 0) return j; \
694 : if (ctx_pos != -1) \
695 : DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696 : } \
697 : ptr = (type*)(state->data_stack+alloc_pos); \
698 : state->data_stack_base += sizeof(type); \
699 : } while (0)
700 :
701 : #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702 : do { \
703 : TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704 : ptr = (type*)(state->data_stack+pos); \
705 : } while (0)
706 :
707 : #define DATA_STACK_PUSH(state, data, size) \
708 : do { \
709 : TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710 : "(%" PY_FORMAT_SIZE_T "d)\n", \
711 : data, state->data_stack_base, size)); \
712 : if (size > state->data_stack_size - state->data_stack_base) { \
713 : int j = data_stack_grow(state, size); \
714 : if (j < 0) return j; \
715 : if (ctx_pos != -1) \
716 : DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717 : } \
718 : memcpy(state->data_stack+state->data_stack_base, data, size); \
719 : state->data_stack_base += size; \
720 : } while (0)
721 :
722 : #define DATA_STACK_POP(state, data, size, discard) \
723 : do { \
724 : TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725 : "(%" PY_FORMAT_SIZE_T "d)\n", \
726 : data, state->data_stack_base-size, size)); \
727 : memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 : if (discard) \
729 : state->data_stack_base -= size; \
730 : } while (0)
731 :
732 : #define DATA_STACK_POP_DISCARD(state, size) \
733 : do { \
734 : TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735 : "(%" PY_FORMAT_SIZE_T "d)\n", \
736 : state->data_stack_base-size, size)); \
737 : state->data_stack_base -= size; \
738 : } while(0)
739 :
740 : #define DATA_PUSH(x) \
741 : DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742 : #define DATA_POP(x) \
743 : DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744 : #define DATA_POP_DISCARD(x) \
745 : DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746 : #define DATA_ALLOC(t,p) \
747 : DATA_STACK_ALLOC(state, t, p)
748 : #define DATA_LOOKUP_AT(t,p,pos) \
749 : DATA_STACK_LOOKUP_AT(state,t,p,pos)
750 :
751 : #define MARK_PUSH(lastmark) \
752 : do if (lastmark > 0) { \
753 : i = lastmark; /* ctx->lastmark may change if reallocated */ \
754 : DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 : } while (0)
756 : #define MARK_POP(lastmark) \
757 : do if (lastmark > 0) { \
758 : DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 : } while (0)
760 : #define MARK_POP_KEEP(lastmark) \
761 : do if (lastmark > 0) { \
762 : DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 : } while (0)
764 : #define MARK_POP_DISCARD(lastmark) \
765 : do if (lastmark > 0) { \
766 : DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767 : } while (0)
768 :
769 : #define JUMP_NONE 0
770 : #define JUMP_MAX_UNTIL_1 1
771 : #define JUMP_MAX_UNTIL_2 2
772 : #define JUMP_MAX_UNTIL_3 3
773 : #define JUMP_MIN_UNTIL_1 4
774 : #define JUMP_MIN_UNTIL_2 5
775 : #define JUMP_MIN_UNTIL_3 6
776 : #define JUMP_REPEAT 7
777 : #define JUMP_REPEAT_ONE_1 8
778 : #define JUMP_REPEAT_ONE_2 9
779 : #define JUMP_MIN_REPEAT_ONE 10
780 : #define JUMP_BRANCH 11
781 : #define JUMP_ASSERT 12
782 : #define JUMP_ASSERT_NOT 13
783 :
784 : #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785 : DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786 : nextctx->last_ctx_pos = ctx_pos; \
787 : nextctx->jump = jumpvalue; \
788 : nextctx->pattern = nextpattern; \
789 : ctx_pos = alloc_pos; \
790 : ctx = nextctx; \
791 : goto entrance; \
792 : jumplabel: \
793 : while (0) /* gcc doesn't like labels at end of scopes */ \
794 :
795 : typedef struct {
796 : Py_ssize_t last_ctx_pos;
797 : Py_ssize_t jump;
798 : SRE_CHAR* ptr;
799 : SRE_CODE* pattern;
800 : Py_ssize_t count;
801 : Py_ssize_t lastmark;
802 : Py_ssize_t lastindex;
803 : union {
804 : SRE_CODE chr;
805 : SRE_REPEAT* rep;
806 : } u;
807 : } SRE_MATCH_CONTEXT;
808 :
809 : /* check if string matches the given pattern. returns <0 for
810 : error, 0 for failure, and 1 for success */
811 : LOCAL(Py_ssize_t)
812 4905 : SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
813 : {
814 4905 : SRE_CHAR* end = (SRE_CHAR *)state->end;
815 4905 : Py_ssize_t alloc_pos, ctx_pos = -1;
816 4905 : Py_ssize_t i, ret = 0;
817 : Py_ssize_t jump;
818 4905 : unsigned int sigcount=0;
819 :
820 : SRE_MATCH_CONTEXT* ctx;
821 : SRE_MATCH_CONTEXT* nextctx;
822 :
823 : TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
824 :
825 4905 : DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826 4905 : ctx->last_ctx_pos = -1;
827 4905 : ctx->jump = JUMP_NONE;
828 4905 : ctx->pattern = pattern;
829 4905 : ctx_pos = alloc_pos;
830 :
831 : entrance:
832 :
833 13549 : ctx->ptr = (SRE_CHAR *)state->ptr;
834 :
835 13549 : if (ctx->pattern[0] == SRE_OP_INFO) {
836 : /* optimization info block */
837 : /* <INFO> <1=skip> <2=flags> <3=min> ... */
838 2145 : if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
839 : TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840 : "need %" PY_FORMAT_SIZE_T "d)\n",
841 : (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
842 307 : RETURN_FAILURE;
843 : }
844 1838 : ctx->pattern += ctx->pattern[1] + 1;
845 : }
846 :
847 : for (;;) {
848 23420 : ++sigcount;
849 23420 : if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850 0 : RETURN_ERROR(SRE_ERROR_INTERRUPTED);
851 :
852 23420 : switch (*ctx->pattern++) {
853 :
854 : case SRE_OP_MARK:
855 : /* set mark */
856 : /* <MARK> <gid> */
857 : TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858 : ctx->ptr, ctx->pattern[0]));
859 5520 : i = ctx->pattern[0];
860 5520 : if (i & 1)
861 2760 : state->lastindex = i/2 + 1;
862 5520 : if (i > state->lastmark) {
863 : /* state->lastmark is the highest valid index in the
864 : state->mark array. If it is increased by more than 1,
865 : the intervening marks must be set to NULL to signal
866 : that these marks have not been encountered. */
867 5520 : Py_ssize_t j = state->lastmark + 1;
868 11040 : while (j < i)
869 0 : state->mark[j++] = NULL;
870 5520 : state->lastmark = i;
871 : }
872 5520 : state->mark[i] = ctx->ptr;
873 5520 : ctx->pattern++;
874 5520 : break;
875 :
876 : case SRE_OP_LITERAL:
877 : /* match literal string */
878 : /* <LITERAL> <code> */
879 : TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880 : ctx->ptr, *ctx->pattern));
881 2192 : if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882 1469 : RETURN_FAILURE;
883 723 : ctx->pattern++;
884 723 : ctx->ptr++;
885 723 : break;
886 :
887 : case SRE_OP_NOT_LITERAL:
888 : /* match anything that is not literal character */
889 : /* <NOT_LITERAL> <code> */
890 : TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891 : ctx->ptr, *ctx->pattern));
892 0 : if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893 0 : RETURN_FAILURE;
894 0 : ctx->pattern++;
895 0 : ctx->ptr++;
896 0 : break;
897 :
898 : case SRE_OP_SUCCESS:
899 : /* end of pattern */
900 : TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901 2804 : state->ptr = ctx->ptr;
902 2804 : RETURN_SUCCESS;
903 :
904 : case SRE_OP_AT:
905 : /* match at given position */
906 : /* <AT> <code> */
907 : TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908 0 : if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909 0 : RETURN_FAILURE;
910 0 : ctx->pattern++;
911 0 : break;
912 :
913 : case SRE_OP_CATEGORY:
914 : /* match at given category */
915 : /* <CATEGORY> <code> */
916 : TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917 : ctx->ptr, *ctx->pattern));
918 0 : if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919 0 : RETURN_FAILURE;
920 0 : ctx->pattern++;
921 0 : ctx->ptr++;
922 0 : break;
923 :
924 : case SRE_OP_ANY:
925 : /* match anything (except a newline) */
926 : /* <ANY> */
927 : TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928 1173 : if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929 3 : RETURN_FAILURE;
930 1170 : ctx->ptr++;
931 1170 : break;
932 :
933 : case SRE_OP_ANY_ALL:
934 : /* match anything */
935 : /* <ANY_ALL> */
936 : TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937 0 : if (ctx->ptr >= end)
938 0 : RETURN_FAILURE;
939 0 : ctx->ptr++;
940 0 : break;
941 :
942 : case SRE_OP_IN:
943 : /* match set member (or non_member) */
944 : /* <IN> <skip> <set> */
945 : TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946 24 : if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947 19 : RETURN_FAILURE;
948 5 : ctx->pattern += ctx->pattern[0];
949 5 : ctx->ptr++;
950 5 : break;
951 :
952 : case SRE_OP_LITERAL_IGNORE:
953 : TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954 : ctx->pattern, ctx->ptr, ctx->pattern[0]));
955 0 : if (ctx->ptr >= end ||
956 0 : state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957 0 : RETURN_FAILURE;
958 0 : ctx->pattern++;
959 0 : ctx->ptr++;
960 0 : break;
961 :
962 : case SRE_OP_NOT_LITERAL_IGNORE:
963 : TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964 : ctx->pattern, ctx->ptr, *ctx->pattern));
965 0 : if (ctx->ptr >= end ||
966 0 : state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967 0 : RETURN_FAILURE;
968 0 : ctx->pattern++;
969 0 : ctx->ptr++;
970 0 : break;
971 :
972 : case SRE_OP_IN_IGNORE:
973 : TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974 0 : if (ctx->ptr >= end
975 0 : || !SRE_CHARSET(ctx->pattern+1,
976 0 : (SRE_CODE)state->lower(*ctx->ptr)))
977 0 : RETURN_FAILURE;
978 0 : ctx->pattern += ctx->pattern[0];
979 0 : ctx->ptr++;
980 0 : break;
981 :
982 : case SRE_OP_JUMP:
983 : case SRE_OP_INFO:
984 : /* jump forward */
985 : /* <JUMP> <offset> */
986 : TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987 : ctx->ptr, ctx->pattern[0]));
988 2760 : ctx->pattern += ctx->pattern[0];
989 2760 : break;
990 :
991 : case SRE_OP_BRANCH:
992 : /* alternation */
993 : /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994 : TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
995 2760 : LASTMARK_SAVE();
996 2760 : ctx->u.rep = state->repeat;
997 2760 : if (ctx->u.rep)
998 0 : MARK_PUSH(ctx->lastmark);
999 5388 : for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000 6876 : if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001 2976 : (ctx->ptr >= end ||
1002 1488 : (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1003 1140 : continue;
1004 4248 : if (ctx->pattern[1] == SRE_OP_IN &&
1005 0 : (ctx->ptr >= end ||
1006 0 : !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1007 0 : continue;
1008 4248 : state->ptr = ctx->ptr;
1009 4248 : DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1010 4248 : if (ret) {
1011 2760 : if (ctx->u.rep)
1012 0 : MARK_POP_DISCARD(ctx->lastmark);
1013 2760 : RETURN_ON_ERROR(ret);
1014 2760 : RETURN_SUCCESS;
1015 : }
1016 1488 : if (ctx->u.rep)
1017 0 : MARK_POP_KEEP(ctx->lastmark);
1018 1488 : LASTMARK_RESTORE();
1019 : }
1020 0 : if (ctx->u.rep)
1021 0 : MARK_POP_DISCARD(ctx->lastmark);
1022 0 : RETURN_FAILURE;
1023 :
1024 : case SRE_OP_REPEAT_ONE:
1025 : /* match repeated sequence (maximizing regexp) */
1026 :
1027 : /* this operator only works if the repeated item is
1028 : exactly one character wide, and we're not already
1029 : collecting backtracking points. for other cases,
1030 : use the MAX_REPEAT operator */
1031 :
1032 : /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033 :
1034 : TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035 : ctx->pattern[1], ctx->pattern[2]));
1036 :
1037 6187 : if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1038 0 : RETURN_FAILURE; /* cannot match */
1039 :
1040 6187 : state->ptr = ctx->ptr;
1041 :
1042 6187 : ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043 6187 : RETURN_ON_ERROR(ret);
1044 6187 : DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045 6187 : ctx->count = ret;
1046 6187 : ctx->ptr += ctx->count;
1047 :
1048 : /* when we arrive here, count contains the number of
1049 : matches, and ctx->ptr points to the tail of the target
1050 : string. check if the rest of the pattern matches,
1051 : and backtrack if not. */
1052 :
1053 6187 : if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1054 1529 : RETURN_FAILURE;
1055 :
1056 4658 : if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1057 : /* tail is empty. we're finished */
1058 19 : state->ptr = ctx->ptr;
1059 19 : RETURN_SUCCESS;
1060 : }
1061 :
1062 4639 : LASTMARK_SAVE();
1063 :
1064 4639 : if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1065 : /* tail starts with a literal. skip positions where
1066 : the rest of the pattern cannot possibly match */
1067 254 : ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1068 : for (;;) {
1069 1016 : while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1070 508 : (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071 254 : ctx->ptr--;
1072 254 : ctx->count--;
1073 : }
1074 254 : if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1075 254 : break;
1076 0 : state->ptr = ctx->ptr;
1077 0 : DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078 : ctx->pattern+ctx->pattern[0]);
1079 0 : if (ret) {
1080 0 : RETURN_ON_ERROR(ret);
1081 0 : RETURN_SUCCESS;
1082 : }
1083 :
1084 0 : LASTMARK_RESTORE();
1085 :
1086 0 : ctx->ptr--;
1087 0 : ctx->count--;
1088 0 : }
1089 :
1090 : } else {
1091 : /* general case */
1092 8786 : while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1093 4396 : state->ptr = ctx->ptr;
1094 4396 : DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095 : ctx->pattern+ctx->pattern[0]);
1096 4396 : if (ret) {
1097 4380 : RETURN_ON_ERROR(ret);
1098 4380 : RETURN_SUCCESS;
1099 : }
1100 16 : ctx->ptr--;
1101 16 : ctx->count--;
1102 16 : LASTMARK_RESTORE();
1103 : }
1104 : }
1105 259 : RETURN_FAILURE;
1106 :
1107 : case SRE_OP_MIN_REPEAT_ONE:
1108 : /* match repeated sequence (minimizing regexp) */
1109 :
1110 : /* this operator only works if the repeated item is
1111 : exactly one character wide, and we're not already
1112 : collecting backtracking points. for other cases,
1113 : use the MIN_REPEAT operator */
1114 :
1115 : /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116 :
1117 : TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118 : ctx->pattern[1], ctx->pattern[2]));
1119 :
1120 0 : if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1121 0 : RETURN_FAILURE; /* cannot match */
1122 :
1123 0 : state->ptr = ctx->ptr;
1124 :
1125 0 : if (ctx->pattern[1] == 0)
1126 0 : ctx->count = 0;
1127 : else {
1128 : /* count using pattern min as the maximum */
1129 0 : ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130 0 : RETURN_ON_ERROR(ret);
1131 0 : DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1132 0 : if (ret < (Py_ssize_t) ctx->pattern[1])
1133 : /* didn't match minimum number of times */
1134 0 : RETURN_FAILURE;
1135 : /* advance past minimum matches of repeat */
1136 0 : ctx->count = ret;
1137 0 : ctx->ptr += ctx->count;
1138 : }
1139 :
1140 0 : if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1141 : /* tail is empty. we're finished */
1142 0 : state->ptr = ctx->ptr;
1143 0 : RETURN_SUCCESS;
1144 :
1145 : } else {
1146 : /* general case */
1147 0 : LASTMARK_SAVE();
1148 0 : while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
1149 0 : || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1150 0 : state->ptr = ctx->ptr;
1151 0 : DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152 : ctx->pattern+ctx->pattern[0]);
1153 0 : if (ret) {
1154 0 : RETURN_ON_ERROR(ret);
1155 0 : RETURN_SUCCESS;
1156 : }
1157 0 : state->ptr = ctx->ptr;
1158 0 : ret = SRE_COUNT(state, ctx->pattern+3, 1);
1159 0 : RETURN_ON_ERROR(ret);
1160 0 : DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1161 0 : if (ret == 0)
1162 0 : break;
1163 : assert(ret == 1);
1164 0 : ctx->ptr++;
1165 0 : ctx->count++;
1166 0 : LASTMARK_RESTORE();
1167 : }
1168 : }
1169 0 : RETURN_FAILURE;
1170 :
1171 : case SRE_OP_REPEAT:
1172 : /* create repeat context. all the hard work is done
1173 : by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174 : /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175 : TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176 : ctx->pattern[1], ctx->pattern[2]));
1177 :
1178 : /* install new repeat context */
1179 0 : ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1180 0 : if (!ctx->u.rep) {
1181 0 : PyErr_NoMemory();
1182 0 : RETURN_FAILURE;
1183 : }
1184 0 : ctx->u.rep->count = -1;
1185 0 : ctx->u.rep->pattern = ctx->pattern;
1186 0 : ctx->u.rep->prev = state->repeat;
1187 0 : ctx->u.rep->last_ptr = NULL;
1188 0 : state->repeat = ctx->u.rep;
1189 :
1190 0 : state->ptr = ctx->ptr;
1191 0 : DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1192 0 : state->repeat = ctx->u.rep->prev;
1193 0 : PyObject_FREE(ctx->u.rep);
1194 :
1195 0 : if (ret) {
1196 0 : RETURN_ON_ERROR(ret);
1197 0 : RETURN_SUCCESS;
1198 : }
1199 0 : RETURN_FAILURE;
1200 :
1201 : case SRE_OP_MAX_UNTIL:
1202 : /* maximizing repeat */
1203 : /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204 :
1205 : /* FIXME: we probably need to deal with zero-width
1206 : matches in here... */
1207 :
1208 0 : ctx->u.rep = state->repeat;
1209 0 : if (!ctx->u.rep)
1210 0 : RETURN_ERROR(SRE_ERROR_STATE);
1211 :
1212 0 : state->ptr = ctx->ptr;
1213 :
1214 0 : ctx->count = ctx->u.rep->count+1;
1215 :
1216 : TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1217 : ctx->ptr, ctx->count));
1218 :
1219 0 : if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1220 : /* not enough matches */
1221 0 : ctx->u.rep->count = ctx->count;
1222 0 : DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223 : ctx->u.rep->pattern+3);
1224 0 : if (ret) {
1225 0 : RETURN_ON_ERROR(ret);
1226 0 : RETURN_SUCCESS;
1227 : }
1228 0 : ctx->u.rep->count = ctx->count-1;
1229 0 : state->ptr = ctx->ptr;
1230 0 : RETURN_FAILURE;
1231 : }
1232 :
1233 0 : if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1234 0 : ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1235 0 : state->ptr != ctx->u.rep->last_ptr) {
1236 : /* we may have enough matches, but if we can
1237 : match another item, do so */
1238 0 : ctx->u.rep->count = ctx->count;
1239 0 : LASTMARK_SAVE();
1240 0 : MARK_PUSH(ctx->lastmark);
1241 : /* zero-width match protection */
1242 0 : DATA_PUSH(&ctx->u.rep->last_ptr);
1243 0 : ctx->u.rep->last_ptr = state->ptr;
1244 0 : DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245 : ctx->u.rep->pattern+3);
1246 0 : DATA_POP(&ctx->u.rep->last_ptr);
1247 0 : if (ret) {
1248 0 : MARK_POP_DISCARD(ctx->lastmark);
1249 0 : RETURN_ON_ERROR(ret);
1250 0 : RETURN_SUCCESS;
1251 : }
1252 0 : MARK_POP(ctx->lastmark);
1253 0 : LASTMARK_RESTORE();
1254 0 : ctx->u.rep->count = ctx->count-1;
1255 0 : state->ptr = ctx->ptr;
1256 : }
1257 :
1258 : /* cannot match more repeated items here. make sure the
1259 : tail matches */
1260 0 : state->repeat = ctx->u.rep->prev;
1261 0 : DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1262 0 : RETURN_ON_SUCCESS(ret);
1263 0 : state->repeat = ctx->u.rep;
1264 0 : state->ptr = ctx->ptr;
1265 0 : RETURN_FAILURE;
1266 :
1267 : case SRE_OP_MIN_UNTIL:
1268 : /* minimizing repeat */
1269 : /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270 :
1271 0 : ctx->u.rep = state->repeat;
1272 0 : if (!ctx->u.rep)
1273 0 : RETURN_ERROR(SRE_ERROR_STATE);
1274 :
1275 0 : state->ptr = ctx->ptr;
1276 :
1277 0 : ctx->count = ctx->u.rep->count+1;
1278 :
1279 : TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1280 : ctx->ptr, ctx->count, ctx->u.rep->pattern));
1281 :
1282 0 : if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1283 : /* not enough matches */
1284 0 : ctx->u.rep->count = ctx->count;
1285 0 : DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286 : ctx->u.rep->pattern+3);
1287 0 : if (ret) {
1288 0 : RETURN_ON_ERROR(ret);
1289 0 : RETURN_SUCCESS;
1290 : }
1291 0 : ctx->u.rep->count = ctx->count-1;
1292 0 : state->ptr = ctx->ptr;
1293 0 : RETURN_FAILURE;
1294 : }
1295 :
1296 0 : LASTMARK_SAVE();
1297 :
1298 : /* see if the tail matches */
1299 0 : state->repeat = ctx->u.rep->prev;
1300 0 : DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1301 0 : if (ret) {
1302 0 : RETURN_ON_ERROR(ret);
1303 0 : RETURN_SUCCESS;
1304 : }
1305 :
1306 0 : state->repeat = ctx->u.rep;
1307 0 : state->ptr = ctx->ptr;
1308 :
1309 0 : LASTMARK_RESTORE();
1310 :
1311 0 : if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1312 0 : && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313 0 : state->ptr == ctx->u.rep->last_ptr)
1314 0 : RETURN_FAILURE;
1315 :
1316 0 : ctx->u.rep->count = ctx->count;
1317 : /* zero-width match protection */
1318 0 : DATA_PUSH(&ctx->u.rep->last_ptr);
1319 0 : ctx->u.rep->last_ptr = state->ptr;
1320 0 : DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321 : ctx->u.rep->pattern+3);
1322 0 : DATA_POP(&ctx->u.rep->last_ptr);
1323 0 : if (ret) {
1324 0 : RETURN_ON_ERROR(ret);
1325 0 : RETURN_SUCCESS;
1326 : }
1327 0 : ctx->u.rep->count = ctx->count-1;
1328 0 : state->ptr = ctx->ptr;
1329 0 : RETURN_FAILURE;
1330 :
1331 : case SRE_OP_GROUPREF:
1332 : /* match backreference */
1333 : TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334 : ctx->ptr, ctx->pattern[0]));
1335 0 : i = ctx->pattern[0];
1336 : {
1337 0 : Py_ssize_t groupref = i+i;
1338 0 : if (groupref >= state->lastmark) {
1339 0 : RETURN_FAILURE;
1340 : } else {
1341 0 : SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 0 : SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 0 : if (!p || !e || e < p)
1344 0 : RETURN_FAILURE;
1345 0 : while (p < e) {
1346 0 : if (ctx->ptr >= end || *ctx->ptr != *p)
1347 0 : RETURN_FAILURE;
1348 0 : p++; ctx->ptr++;
1349 : }
1350 : }
1351 : }
1352 0 : ctx->pattern++;
1353 0 : break;
1354 :
1355 : case SRE_OP_GROUPREF_IGNORE:
1356 : /* match backreference */
1357 : TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358 : ctx->ptr, ctx->pattern[0]));
1359 0 : i = ctx->pattern[0];
1360 : {
1361 0 : Py_ssize_t groupref = i+i;
1362 0 : if (groupref >= state->lastmark) {
1363 0 : RETURN_FAILURE;
1364 : } else {
1365 0 : SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366 0 : SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367 0 : if (!p || !e || e < p)
1368 0 : RETURN_FAILURE;
1369 0 : while (p < e) {
1370 0 : if (ctx->ptr >= end ||
1371 0 : state->lower(*ctx->ptr) != state->lower(*p))
1372 0 : RETURN_FAILURE;
1373 0 : p++; ctx->ptr++;
1374 : }
1375 : }
1376 : }
1377 0 : ctx->pattern++;
1378 0 : break;
1379 :
1380 : case SRE_OP_GROUPREF_EXISTS:
1381 : TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382 : ctx->ptr, ctx->pattern[0]));
1383 : /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384 0 : i = ctx->pattern[0];
1385 : {
1386 0 : Py_ssize_t groupref = i+i;
1387 0 : if (groupref >= state->lastmark) {
1388 0 : ctx->pattern += ctx->pattern[1];
1389 0 : break;
1390 : } else {
1391 0 : SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392 0 : SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393 0 : if (!p || !e || e < p) {
1394 0 : ctx->pattern += ctx->pattern[1];
1395 0 : break;
1396 : }
1397 : }
1398 : }
1399 0 : ctx->pattern += 2;
1400 0 : break;
1401 :
1402 : case SRE_OP_ASSERT:
1403 : /* assert subpattern */
1404 : /* <ASSERT> <skip> <back> <pattern> */
1405 : TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406 : ctx->ptr, ctx->pattern[1]));
1407 0 : if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
1408 0 : RETURN_FAILURE;
1409 0 : state->ptr = ctx->ptr - ctx->pattern[1];
1410 0 : DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1411 0 : RETURN_ON_FAILURE(ret);
1412 0 : ctx->pattern += ctx->pattern[0];
1413 0 : break;
1414 :
1415 : case SRE_OP_ASSERT_NOT:
1416 : /* assert not subpattern */
1417 : /* <ASSERT_NOT> <skip> <back> <pattern> */
1418 : TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419 : ctx->ptr, ctx->pattern[1]));
1420 0 : if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1421 0 : state->ptr = ctx->ptr - ctx->pattern[1];
1422 0 : DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1423 0 : if (ret) {
1424 0 : RETURN_ON_ERROR(ret);
1425 0 : RETURN_FAILURE;
1426 : }
1427 : }
1428 0 : ctx->pattern += ctx->pattern[0];
1429 0 : break;
1430 :
1431 : case SRE_OP_FAILURE:
1432 : /* immediate failure */
1433 : TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434 0 : RETURN_FAILURE;
1435 :
1436 : default:
1437 : TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438 : ctx->pattern[-1]));
1439 0 : RETURN_ERROR(SRE_ERROR_ILLEGAL);
1440 : }
1441 10178 : }
1442 :
1443 : exit:
1444 13549 : ctx_pos = ctx->last_ctx_pos;
1445 13549 : jump = ctx->jump;
1446 13549 : DATA_POP_DISCARD(ctx);
1447 13549 : if (ctx_pos == -1)
1448 4905 : return ret;
1449 8644 : DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450 :
1451 8644 : switch (jump) {
1452 : case JUMP_MAX_UNTIL_2:
1453 : TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454 0 : goto jump_max_until_2;
1455 : case JUMP_MAX_UNTIL_3:
1456 : TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457 0 : goto jump_max_until_3;
1458 : case JUMP_MIN_UNTIL_2:
1459 : TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460 0 : goto jump_min_until_2;
1461 : case JUMP_MIN_UNTIL_3:
1462 : TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463 0 : goto jump_min_until_3;
1464 : case JUMP_BRANCH:
1465 : TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466 4248 : goto jump_branch;
1467 : case JUMP_MAX_UNTIL_1:
1468 : TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469 0 : goto jump_max_until_1;
1470 : case JUMP_MIN_UNTIL_1:
1471 : TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472 0 : goto jump_min_until_1;
1473 : case JUMP_REPEAT:
1474 : TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475 0 : goto jump_repeat;
1476 : case JUMP_REPEAT_ONE_1:
1477 : TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478 0 : goto jump_repeat_one_1;
1479 : case JUMP_REPEAT_ONE_2:
1480 : TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481 4396 : goto jump_repeat_one_2;
1482 : case JUMP_MIN_REPEAT_ONE:
1483 : TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484 0 : goto jump_min_repeat_one;
1485 : case JUMP_ASSERT:
1486 : TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487 0 : goto jump_assert;
1488 : case JUMP_ASSERT_NOT:
1489 : TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490 0 : goto jump_assert_not;
1491 : case JUMP_NONE:
1492 : TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493 : ctx->ptr, ret));
1494 0 : break;
1495 : }
1496 :
1497 0 : return ret; /* should never get here */
1498 : }
1499 :
1500 : LOCAL(Py_ssize_t)
1501 3165 : SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502 : {
1503 3165 : SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504 3165 : SRE_CHAR* end = (SRE_CHAR *)state->end;
1505 3165 : Py_ssize_t status = 0;
1506 3165 : Py_ssize_t prefix_len = 0;
1507 3165 : Py_ssize_t prefix_skip = 0;
1508 3165 : SRE_CODE* prefix = NULL;
1509 3165 : SRE_CODE* charset = NULL;
1510 3165 : SRE_CODE* overlap = NULL;
1511 3165 : int flags = 0;
1512 :
1513 3165 : if (ptr > end)
1514 0 : return 0;
1515 :
1516 3165 : if (pattern[0] == SRE_OP_INFO) {
1517 : /* optimization info block */
1518 : /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1519 :
1520 3165 : flags = pattern[2];
1521 :
1522 3165 : if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1523 : TRACE(("reject (got %u chars, need %u)\n",
1524 : (unsigned int)(end - ptr), pattern[3]));
1525 405 : return 0;
1526 : }
1527 2760 : if (pattern[3] > 1) {
1528 : /* adjust end point (but make sure we leave at least one
1529 : character in there, so literal search will work) */
1530 0 : end -= pattern[3]-1;
1531 0 : if (end <= ptr)
1532 0 : end = ptr+1;
1533 : }
1534 :
1535 2760 : if (flags & SRE_INFO_PREFIX) {
1536 : /* pattern starts with a known prefix */
1537 : /* <length> <skip> <prefix data> <overlap data> */
1538 0 : prefix_len = pattern[5];
1539 0 : prefix_skip = pattern[6];
1540 0 : prefix = pattern + 7;
1541 0 : overlap = prefix + prefix_len - 1;
1542 2760 : } else if (flags & SRE_INFO_CHARSET)
1543 : /* pattern starts with a character from a known set */
1544 : /* <charset> */
1545 0 : charset = pattern + 5;
1546 :
1547 2760 : pattern += 1 + pattern[1];
1548 : }
1549 :
1550 : TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1551 : prefix, prefix_len, prefix_skip));
1552 : TRACE(("charset = %p\n", charset));
1553 :
1554 : #if defined(USE_FAST_SEARCH)
1555 2760 : if (prefix_len > 1) {
1556 : /* pattern starts with a known prefix. use the overlap
1557 : table to skip forward as fast as we possibly can */
1558 0 : Py_ssize_t i = 0;
1559 0 : end = (SRE_CHAR *)state->end;
1560 0 : while (ptr < end) {
1561 : for (;;) {
1562 0 : if ((SRE_CODE) ptr[0] != prefix[i]) {
1563 0 : if (!i)
1564 0 : break;
1565 : else
1566 0 : i = overlap[i];
1567 : } else {
1568 0 : if (++i == prefix_len) {
1569 : /* found a potential match */
1570 : TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1571 0 : state->start = ptr + 1 - prefix_len;
1572 0 : state->ptr = ptr + 1 - prefix_len + prefix_skip;
1573 0 : if (flags & SRE_INFO_LITERAL)
1574 0 : return 1; /* we got all of it */
1575 0 : status = SRE_MATCH(state, pattern + 2*prefix_skip);
1576 0 : if (status != 0)
1577 0 : return status;
1578 : /* close but no cigar -- try again */
1579 0 : i = overlap[i];
1580 : }
1581 0 : break;
1582 : }
1583 0 : }
1584 0 : ptr++;
1585 : }
1586 0 : return 0;
1587 : }
1588 : #endif
1589 :
1590 2760 : if (pattern[0] == SRE_OP_LITERAL) {
1591 : /* pattern starts with a literal character. this is used
1592 : for short prefixes, and if fast search is disabled */
1593 0 : SRE_CODE chr = pattern[1];
1594 0 : end = (SRE_CHAR *)state->end;
1595 : for (;;) {
1596 0 : while (ptr < end && (SRE_CODE) ptr[0] != chr)
1597 0 : ptr++;
1598 0 : if (ptr >= end)
1599 0 : return 0;
1600 : TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1601 0 : state->start = ptr;
1602 0 : state->ptr = ++ptr;
1603 0 : if (flags & SRE_INFO_LITERAL)
1604 0 : return 1; /* we got all of it */
1605 0 : status = SRE_MATCH(state, pattern + 2);
1606 0 : if (status != 0)
1607 0 : break;
1608 0 : }
1609 2760 : } else if (charset) {
1610 : /* pattern starts with a character from a known set */
1611 0 : end = (SRE_CHAR *)state->end;
1612 : for (;;) {
1613 0 : while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1614 0 : ptr++;
1615 0 : if (ptr >= end)
1616 0 : return 0;
1617 : TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1618 0 : state->start = ptr;
1619 0 : state->ptr = ptr;
1620 0 : status = SRE_MATCH(state, pattern);
1621 0 : if (status != 0)
1622 0 : break;
1623 0 : ptr++;
1624 0 : }
1625 : } else {
1626 : /* general case */
1627 : assert(ptr <= end);
1628 : while (1) {
1629 : TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1630 2760 : state->start = state->ptr = ptr;
1631 2760 : status = SRE_MATCH(state, pattern);
1632 2760 : if (status != 0 || ptr >= end)
1633 : break;
1634 0 : ptr++;
1635 0 : }
1636 : }
1637 :
1638 2760 : return status;
1639 : }
1640 :
1641 : LOCAL(int)
1642 0 : SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1643 : {
1644 : /* check if given string is a literal template (i.e. no escapes) */
1645 0 : while (len-- > 0)
1646 0 : if (*ptr++ == '\\')
1647 0 : return 0;
1648 0 : return 1;
1649 : }
1650 :
1651 : #if !defined(SRE_RECURSIVE)
1652 :
1653 : /* -------------------------------------------------------------------- */
1654 : /* factories and destructors */
1655 :
1656 : /* see sre.h for object declarations */
1657 : static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1658 : static PyObject*pattern_scanner(PatternObject*, PyObject*);
1659 :
1660 : static PyObject *
1661 0 : sre_codesize(PyObject* self, PyObject *unused)
1662 : {
1663 0 : return PyInt_FromSize_t(sizeof(SRE_CODE));
1664 : }
1665 :
1666 : static PyObject *
1667 396 : sre_getlower(PyObject* self, PyObject* args)
1668 : {
1669 : int character, flags;
1670 396 : if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1671 0 : return NULL;
1672 396 : if (flags & SRE_FLAG_LOCALE)
1673 0 : return Py_BuildValue("i", sre_lower_locale(character));
1674 396 : if (flags & SRE_FLAG_UNICODE)
1675 : #if defined(HAVE_UNICODE)
1676 0 : return Py_BuildValue("i", sre_lower_unicode(character));
1677 : #else
1678 : return Py_BuildValue("i", sre_lower_locale(character));
1679 : #endif
1680 396 : return Py_BuildValue("i", sre_lower(character));
1681 : }
1682 :
1683 : LOCAL(void)
1684 3165 : state_reset(SRE_STATE* state)
1685 : {
1686 : /* FIXME: dynamic! */
1687 : /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1688 :
1689 3165 : state->lastmark = -1;
1690 3165 : state->lastindex = -1;
1691 :
1692 3165 : state->repeat = NULL;
1693 :
1694 3165 : data_stack_dealloc(state);
1695 3165 : }
1696 :
1697 : static void*
1698 2898 : getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1699 : {
1700 : /* given a python object, return a data pointer, a length (in
1701 : characters), and a character size. return NULL if the object
1702 : is not a string (or not compatible) */
1703 :
1704 : PyBufferProcs *buffer;
1705 : Py_ssize_t size, bytes;
1706 : int charsize;
1707 : void* ptr;
1708 :
1709 : #if defined(HAVE_UNICODE)
1710 2898 : if (PyUnicode_Check(string)) {
1711 : /* unicode strings doesn't always support the buffer interface */
1712 0 : ptr = (void*) PyUnicode_AS_DATA(string);
1713 : /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1714 0 : size = PyUnicode_GET_SIZE(string);
1715 0 : charsize = sizeof(Py_UNICODE);
1716 :
1717 : } else {
1718 : #endif
1719 :
1720 : /* get pointer to string buffer */
1721 2898 : buffer = Py_TYPE(string)->tp_as_buffer;
1722 5796 : if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1723 2898 : buffer->bf_getsegcount(string, NULL) != 1) {
1724 0 : PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1725 0 : return NULL;
1726 : }
1727 :
1728 : /* determine buffer size */
1729 2898 : bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1730 2898 : if (bytes < 0) {
1731 0 : PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1732 0 : return NULL;
1733 : }
1734 :
1735 : /* determine character size */
1736 : #if PY_VERSION_HEX >= 0x01060000
1737 2898 : size = PyObject_Size(string);
1738 : #else
1739 : size = PyObject_Length(string);
1740 : #endif
1741 :
1742 2898 : if (PyString_Check(string) || bytes == size)
1743 2898 : charsize = 1;
1744 : #if defined(HAVE_UNICODE)
1745 0 : else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1746 0 : charsize = sizeof(Py_UNICODE);
1747 : #endif
1748 : else {
1749 0 : PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1750 0 : return NULL;
1751 : }
1752 :
1753 : #if defined(HAVE_UNICODE)
1754 : }
1755 : #endif
1756 :
1757 2898 : *p_length = size;
1758 2898 : *p_charsize = charsize;
1759 :
1760 2898 : return ptr;
1761 : }
1762 :
1763 : LOCAL(PyObject*)
1764 2898 : state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1765 : Py_ssize_t start, Py_ssize_t end)
1766 : {
1767 : /* prepare state object */
1768 :
1769 : Py_ssize_t length;
1770 : int charsize;
1771 : void* ptr;
1772 :
1773 2898 : memset(state, 0, sizeof(SRE_STATE));
1774 :
1775 2898 : state->lastmark = -1;
1776 2898 : state->lastindex = -1;
1777 :
1778 2898 : ptr = getstring(string, &length, &charsize);
1779 2898 : if (!ptr)
1780 0 : return NULL;
1781 :
1782 : /* adjust boundaries */
1783 2898 : if (start < 0)
1784 0 : start = 0;
1785 2898 : else if (start > length)
1786 0 : start = length;
1787 :
1788 2898 : if (end < 0)
1789 0 : end = 0;
1790 2898 : else if (end > length)
1791 2898 : end = length;
1792 :
1793 2898 : state->charsize = charsize;
1794 :
1795 2898 : state->beginning = ptr;
1796 :
1797 2898 : state->start = (void*) ((char*) ptr + start * state->charsize);
1798 2898 : state->end = (void*) ((char*) ptr + end * state->charsize);
1799 :
1800 2898 : Py_INCREF(string);
1801 2898 : state->string = string;
1802 2898 : state->pos = start;
1803 2898 : state->endpos = end;
1804 :
1805 2898 : if (pattern->flags & SRE_FLAG_LOCALE)
1806 0 : state->lower = sre_lower_locale;
1807 2898 : else if (pattern->flags & SRE_FLAG_UNICODE)
1808 : #if defined(HAVE_UNICODE)
1809 0 : state->lower = sre_lower_unicode;
1810 : #else
1811 : state->lower = sre_lower_locale;
1812 : #endif
1813 : else
1814 2898 : state->lower = sre_lower;
1815 :
1816 2898 : return string;
1817 : }
1818 :
1819 : LOCAL(void)
1820 2898 : state_fini(SRE_STATE* state)
1821 : {
1822 2898 : Py_XDECREF(state->string);
1823 2898 : data_stack_dealloc(state);
1824 2898 : }
1825 :
1826 : /* calculate offset from start of string */
1827 : #define STATE_OFFSET(state, member)\
1828 : (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1829 :
1830 : LOCAL(PyObject*)
1831 0 : state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1832 : {
1833 : Py_ssize_t i, j;
1834 :
1835 0 : index = (index - 1) * 2;
1836 :
1837 0 : if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1838 0 : if (empty)
1839 : /* want empty string */
1840 0 : i = j = 0;
1841 : else {
1842 0 : Py_INCREF(Py_None);
1843 0 : return Py_None;
1844 : }
1845 : } else {
1846 0 : i = STATE_OFFSET(state, state->mark[index]);
1847 0 : j = STATE_OFFSET(state, state->mark[index+1]);
1848 : }
1849 :
1850 0 : return PySequence_GetSlice(string, i, j);
1851 : }
1852 :
1853 : static void
1854 0 : pattern_error(int status)
1855 : {
1856 0 : switch (status) {
1857 : case SRE_ERROR_RECURSION_LIMIT:
1858 0 : PyErr_SetString(
1859 : PyExc_RuntimeError,
1860 : "maximum recursion limit exceeded"
1861 : );
1862 0 : break;
1863 : case SRE_ERROR_MEMORY:
1864 0 : PyErr_NoMemory();
1865 0 : break;
1866 : case SRE_ERROR_INTERRUPTED:
1867 : /* An exception has already been raised, so let it fly */
1868 0 : break;
1869 : default:
1870 : /* other error codes indicate compiler/engine bugs */
1871 0 : PyErr_SetString(
1872 : PyExc_RuntimeError,
1873 : "internal error in regular expression engine"
1874 : );
1875 : }
1876 0 : }
1877 :
1878 : static void
1879 843 : pattern_dealloc(PatternObject* self)
1880 : {
1881 843 : if (self->weakreflist != NULL)
1882 0 : PyObject_ClearWeakRefs((PyObject *) self);
1883 843 : Py_XDECREF(self->pattern);
1884 843 : Py_XDECREF(self->groupindex);
1885 843 : Py_XDECREF(self->indexgroup);
1886 843 : PyObject_DEL(self);
1887 843 : }
1888 :
1889 : static int
1890 2145 : check_args_size(const char *name, PyObject* args, PyObject* kw, int n)
1891 : {
1892 2145 : Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0);
1893 2145 : if (m <= n)
1894 2145 : return 1;
1895 0 : PyErr_Format(PyExc_TypeError,
1896 : "%s() takes at most %d positional arguments (%zd given)",
1897 : name, n, m);
1898 0 : return 0;
1899 : }
1900 :
1901 : static PyObject*
1902 2145 : fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
1903 : {
1904 2145 : if (string2 != NULL) {
1905 : char buf[100];
1906 0 : if (string != NULL) {
1907 0 : PyErr_Format(PyExc_TypeError,
1908 : "Argument given by name ('%s') and position (1)",
1909 : oldname);
1910 0 : return NULL;
1911 : }
1912 0 : sprintf(buf, "The '%s' keyword parameter name is deprecated. "
1913 : "Use 'string' instead.", oldname);
1914 0 : if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0)
1915 0 : return NULL;
1916 0 : return string2;
1917 : }
1918 2145 : if (string == NULL) {
1919 0 : PyErr_SetString(PyExc_TypeError,
1920 : "Required argument 'string' (pos 1) not found");
1921 0 : return NULL;
1922 : }
1923 2145 : return string;
1924 : }
1925 :
1926 : static PyObject*
1927 2145 : pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1928 : {
1929 : SRE_STATE state;
1930 : int status;
1931 :
1932 2145 : PyObject *string = NULL, *string2 = NULL;
1933 2145 : Py_ssize_t start = 0;
1934 2145 : Py_ssize_t end = PY_SSIZE_T_MAX;
1935 : static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1936 2145 : if (!check_args_size("match", args, kw, 3))
1937 0 : return NULL;
1938 :
1939 2145 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist,
1940 : &string, &start, &end, &string2))
1941 0 : return NULL;
1942 :
1943 2145 : string = fix_string_param(string, string2, "pattern");
1944 2145 : if (!string)
1945 0 : return NULL;
1946 :
1947 2145 : string = state_init(&state, self, string, start, end);
1948 2145 : if (!string)
1949 0 : return NULL;
1950 :
1951 2145 : state.ptr = state.start;
1952 :
1953 : TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1954 :
1955 2145 : if (state.charsize == 1) {
1956 2145 : status = sre_match(&state, PatternObject_GetCode(self));
1957 : } else {
1958 : #if defined(HAVE_UNICODE)
1959 0 : status = sre_umatch(&state, PatternObject_GetCode(self));
1960 : #endif
1961 : }
1962 :
1963 : TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1964 2145 : if (PyErr_Occurred())
1965 0 : return NULL;
1966 :
1967 2145 : state_fini(&state);
1968 :
1969 2145 : return pattern_new_match(self, &state, status);
1970 : }
1971 :
1972 : static PyObject*
1973 0 : pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1974 : {
1975 : SRE_STATE state;
1976 : int status;
1977 :
1978 0 : PyObject *string = NULL, *string2 = NULL;
1979 0 : Py_ssize_t start = 0;
1980 0 : Py_ssize_t end = PY_SSIZE_T_MAX;
1981 : static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1982 0 : if (!check_args_size("search", args, kw, 3))
1983 0 : return NULL;
1984 :
1985 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist,
1986 : &string, &start, &end, &string2))
1987 0 : return NULL;
1988 :
1989 0 : string = fix_string_param(string, string2, "pattern");
1990 0 : if (!string)
1991 0 : return NULL;
1992 :
1993 0 : string = state_init(&state, self, string, start, end);
1994 0 : if (!string)
1995 0 : return NULL;
1996 :
1997 : TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1998 :
1999 0 : if (state.charsize == 1) {
2000 0 : status = sre_search(&state, PatternObject_GetCode(self));
2001 : } else {
2002 : #if defined(HAVE_UNICODE)
2003 0 : status = sre_usearch(&state, PatternObject_GetCode(self));
2004 : #endif
2005 : }
2006 :
2007 : TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2008 :
2009 0 : state_fini(&state);
2010 :
2011 0 : if (PyErr_Occurred())
2012 0 : return NULL;
2013 :
2014 0 : return pattern_new_match(self, &state, status);
2015 : }
2016 :
2017 : static PyObject*
2018 0 : call(char* module, char* function, PyObject* args)
2019 : {
2020 : PyObject* name;
2021 : PyObject* mod;
2022 : PyObject* func;
2023 : PyObject* result;
2024 :
2025 0 : if (!args)
2026 0 : return NULL;
2027 0 : name = PyString_FromString(module);
2028 0 : if (!name)
2029 0 : return NULL;
2030 0 : mod = PyImport_Import(name);
2031 0 : Py_DECREF(name);
2032 0 : if (!mod)
2033 0 : return NULL;
2034 0 : func = PyObject_GetAttrString(mod, function);
2035 0 : Py_DECREF(mod);
2036 0 : if (!func)
2037 0 : return NULL;
2038 0 : result = PyObject_CallObject(func, args);
2039 0 : Py_DECREF(func);
2040 0 : Py_DECREF(args);
2041 0 : return result;
2042 : }
2043 :
2044 : #ifdef USE_BUILTIN_COPY
2045 : static int
2046 : deepcopy(PyObject** object, PyObject* memo)
2047 : {
2048 : PyObject* copy;
2049 :
2050 : copy = call(
2051 : "copy", "deepcopy",
2052 : PyTuple_Pack(2, *object, memo)
2053 : );
2054 : if (!copy)
2055 : return 0;
2056 :
2057 : Py_SETREF(*object, copy);
2058 :
2059 : return 1; /* success */
2060 : }
2061 : #endif
2062 :
2063 : static PyObject*
2064 0 : join_list(PyObject* list, PyObject* string)
2065 : {
2066 : /* join list elements */
2067 :
2068 : PyObject* joiner;
2069 : #if PY_VERSION_HEX >= 0x01060000
2070 : PyObject* function;
2071 : PyObject* args;
2072 : #endif
2073 : PyObject* result;
2074 :
2075 0 : joiner = PySequence_GetSlice(string, 0, 0);
2076 0 : if (!joiner)
2077 0 : return NULL;
2078 :
2079 0 : if (PyList_GET_SIZE(list) == 0) {
2080 0 : Py_DECREF(list);
2081 0 : return joiner;
2082 : }
2083 :
2084 : #if PY_VERSION_HEX >= 0x01060000
2085 0 : function = PyObject_GetAttrString(joiner, "join");
2086 0 : if (!function) {
2087 0 : Py_DECREF(joiner);
2088 0 : return NULL;
2089 : }
2090 0 : args = PyTuple_New(1);
2091 0 : if (!args) {
2092 0 : Py_DECREF(function);
2093 0 : Py_DECREF(joiner);
2094 0 : return NULL;
2095 : }
2096 0 : PyTuple_SET_ITEM(args, 0, list);
2097 0 : result = PyObject_CallObject(function, args);
2098 0 : Py_DECREF(args); /* also removes list */
2099 0 : Py_DECREF(function);
2100 : #else
2101 : result = call(
2102 : "string", "join",
2103 : PyTuple_Pack(2, list, joiner)
2104 : );
2105 : #endif
2106 0 : Py_DECREF(joiner);
2107 :
2108 0 : return result;
2109 : }
2110 :
2111 : static PyObject*
2112 0 : pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2113 : {
2114 : SRE_STATE state;
2115 : PyObject* list;
2116 : int status;
2117 : Py_ssize_t i, b, e;
2118 :
2119 0 : PyObject *string = NULL, *string2 = NULL;
2120 0 : Py_ssize_t start = 0;
2121 0 : Py_ssize_t end = PY_SSIZE_T_MAX;
2122 : static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2123 0 : if (!check_args_size("findall", args, kw, 3))
2124 0 : return NULL;
2125 :
2126 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist,
2127 : &string, &start, &end, &string2))
2128 0 : return NULL;
2129 :
2130 0 : string = fix_string_param(string, string2, "source");
2131 0 : if (!string)
2132 0 : return NULL;
2133 :
2134 0 : string = state_init(&state, self, string, start, end);
2135 0 : if (!string)
2136 0 : return NULL;
2137 :
2138 0 : list = PyList_New(0);
2139 0 : if (!list) {
2140 0 : state_fini(&state);
2141 0 : return NULL;
2142 : }
2143 :
2144 0 : while (state.start <= state.end) {
2145 :
2146 : PyObject* item;
2147 :
2148 0 : state_reset(&state);
2149 :
2150 0 : state.ptr = state.start;
2151 :
2152 0 : if (state.charsize == 1) {
2153 0 : status = sre_search(&state, PatternObject_GetCode(self));
2154 : } else {
2155 : #if defined(HAVE_UNICODE)
2156 0 : status = sre_usearch(&state, PatternObject_GetCode(self));
2157 : #endif
2158 : }
2159 :
2160 0 : if (PyErr_Occurred())
2161 0 : goto error;
2162 :
2163 0 : if (status <= 0) {
2164 0 : if (status == 0)
2165 0 : break;
2166 0 : pattern_error(status);
2167 0 : goto error;
2168 : }
2169 :
2170 : /* don't bother to build a match object */
2171 0 : switch (self->groups) {
2172 : case 0:
2173 0 : b = STATE_OFFSET(&state, state.start);
2174 0 : e = STATE_OFFSET(&state, state.ptr);
2175 0 : item = PySequence_GetSlice(string, b, e);
2176 0 : if (!item)
2177 0 : goto error;
2178 0 : break;
2179 : case 1:
2180 0 : item = state_getslice(&state, 1, string, 1);
2181 0 : if (!item)
2182 0 : goto error;
2183 0 : break;
2184 : default:
2185 0 : item = PyTuple_New(self->groups);
2186 0 : if (!item)
2187 0 : goto error;
2188 0 : for (i = 0; i < self->groups; i++) {
2189 0 : PyObject* o = state_getslice(&state, i+1, string, 1);
2190 0 : if (!o) {
2191 0 : Py_DECREF(item);
2192 0 : goto error;
2193 : }
2194 0 : PyTuple_SET_ITEM(item, i, o);
2195 : }
2196 0 : break;
2197 : }
2198 :
2199 0 : status = PyList_Append(list, item);
2200 0 : Py_DECREF(item);
2201 0 : if (status < 0)
2202 0 : goto error;
2203 :
2204 0 : if (state.ptr == state.start)
2205 0 : state.start = (void*) ((char*) state.ptr + state.charsize);
2206 : else
2207 0 : state.start = state.ptr;
2208 :
2209 : }
2210 :
2211 0 : state_fini(&state);
2212 0 : return list;
2213 :
2214 : error:
2215 0 : Py_DECREF(list);
2216 0 : state_fini(&state);
2217 0 : return NULL;
2218 :
2219 : }
2220 :
2221 : #if PY_VERSION_HEX >= 0x02020000
2222 : static PyObject*
2223 753 : pattern_finditer(PatternObject* pattern, PyObject* args)
2224 : {
2225 : PyObject* scanner;
2226 : PyObject* search;
2227 : PyObject* iterator;
2228 :
2229 753 : scanner = pattern_scanner(pattern, args);
2230 753 : if (!scanner)
2231 0 : return NULL;
2232 :
2233 753 : search = PyObject_GetAttrString(scanner, "search");
2234 753 : Py_DECREF(scanner);
2235 753 : if (!search)
2236 0 : return NULL;
2237 :
2238 753 : iterator = PyCallIter_New(search, Py_None);
2239 753 : Py_DECREF(search);
2240 :
2241 753 : return iterator;
2242 : }
2243 : #endif
2244 :
2245 : static PyObject*
2246 0 : pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2247 : {
2248 : SRE_STATE state;
2249 : PyObject* list;
2250 : PyObject* item;
2251 : int status;
2252 : Py_ssize_t n;
2253 : Py_ssize_t i;
2254 : void* last;
2255 :
2256 0 : PyObject *string = NULL, *string2 = NULL;
2257 0 : Py_ssize_t maxsplit = 0;
2258 : static char* kwlist[] = { "string", "maxsplit", "source", NULL };
2259 0 : if (!check_args_size("split", args, kw, 2))
2260 0 : return NULL;
2261 :
2262 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist,
2263 : &string, &maxsplit, &string2))
2264 0 : return NULL;
2265 :
2266 0 : string = fix_string_param(string, string2, "source");
2267 0 : if (!string)
2268 0 : return NULL;
2269 :
2270 0 : string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2271 0 : if (!string)
2272 0 : return NULL;
2273 :
2274 0 : list = PyList_New(0);
2275 0 : if (!list) {
2276 0 : state_fini(&state);
2277 0 : return NULL;
2278 : }
2279 :
2280 0 : n = 0;
2281 0 : last = state.start;
2282 :
2283 0 : while (!maxsplit || n < maxsplit) {
2284 :
2285 0 : state_reset(&state);
2286 :
2287 0 : state.ptr = state.start;
2288 :
2289 0 : if (state.charsize == 1) {
2290 0 : status = sre_search(&state, PatternObject_GetCode(self));
2291 : } else {
2292 : #if defined(HAVE_UNICODE)
2293 0 : status = sre_usearch(&state, PatternObject_GetCode(self));
2294 : #endif
2295 : }
2296 :
2297 0 : if (PyErr_Occurred())
2298 0 : goto error;
2299 :
2300 0 : if (status <= 0) {
2301 0 : if (status == 0)
2302 0 : break;
2303 0 : pattern_error(status);
2304 0 : goto error;
2305 : }
2306 :
2307 0 : if (state.start == state.ptr) {
2308 0 : if (last == state.end || state.ptr == state.end)
2309 : break;
2310 : /* skip one character */
2311 0 : state.start = (void*) ((char*) state.ptr + state.charsize);
2312 0 : continue;
2313 : }
2314 :
2315 : /* get segment before this match */
2316 0 : item = PySequence_GetSlice(
2317 0 : string, STATE_OFFSET(&state, last),
2318 0 : STATE_OFFSET(&state, state.start)
2319 : );
2320 0 : if (!item)
2321 0 : goto error;
2322 0 : status = PyList_Append(list, item);
2323 0 : Py_DECREF(item);
2324 0 : if (status < 0)
2325 0 : goto error;
2326 :
2327 : /* add groups (if any) */
2328 0 : for (i = 0; i < self->groups; i++) {
2329 0 : item = state_getslice(&state, i+1, string, 0);
2330 0 : if (!item)
2331 0 : goto error;
2332 0 : status = PyList_Append(list, item);
2333 0 : Py_DECREF(item);
2334 0 : if (status < 0)
2335 0 : goto error;
2336 : }
2337 :
2338 0 : n = n + 1;
2339 :
2340 0 : last = state.start = state.ptr;
2341 :
2342 : }
2343 :
2344 : /* get segment following last match (even if empty) */
2345 0 : item = PySequence_GetSlice(
2346 0 : string, STATE_OFFSET(&state, last), state.endpos
2347 : );
2348 0 : if (!item)
2349 0 : goto error;
2350 0 : status = PyList_Append(list, item);
2351 0 : Py_DECREF(item);
2352 0 : if (status < 0)
2353 0 : goto error;
2354 :
2355 0 : state_fini(&state);
2356 0 : return list;
2357 :
2358 : error:
2359 0 : Py_DECREF(list);
2360 0 : state_fini(&state);
2361 0 : return NULL;
2362 :
2363 : }
2364 :
2365 : static PyObject*
2366 0 : pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2367 : Py_ssize_t count, Py_ssize_t subn)
2368 : {
2369 : SRE_STATE state;
2370 : PyObject* list;
2371 : PyObject* item;
2372 : PyObject* filter;
2373 : PyObject* args;
2374 : PyObject* match;
2375 : void* ptr;
2376 : int status;
2377 : Py_ssize_t n;
2378 : Py_ssize_t i, b, e;
2379 : int bint;
2380 : int filter_is_callable;
2381 :
2382 0 : if (PyCallable_Check(ptemplate)) {
2383 : /* sub/subn takes either a function or a template */
2384 0 : filter = ptemplate;
2385 0 : Py_INCREF(filter);
2386 0 : filter_is_callable = 1;
2387 : } else {
2388 : /* if not callable, check if it's a literal string */
2389 : int literal;
2390 0 : ptr = getstring(ptemplate, &n, &bint);
2391 0 : b = bint;
2392 0 : if (ptr) {
2393 0 : if (b == 1) {
2394 0 : literal = sre_literal_template((unsigned char *)ptr, n);
2395 : } else {
2396 : #if defined(HAVE_UNICODE)
2397 0 : literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2398 : #endif
2399 : }
2400 : } else {
2401 0 : PyErr_Clear();
2402 0 : literal = 0;
2403 : }
2404 0 : if (literal) {
2405 0 : filter = ptemplate;
2406 0 : Py_INCREF(filter);
2407 0 : filter_is_callable = 0;
2408 : } else {
2409 : /* not a literal; hand it over to the template compiler */
2410 0 : filter = call(
2411 : SRE_PY_MODULE, "_subx",
2412 : PyTuple_Pack(2, self, ptemplate)
2413 : );
2414 0 : if (!filter)
2415 0 : return NULL;
2416 0 : filter_is_callable = PyCallable_Check(filter);
2417 : }
2418 : }
2419 :
2420 0 : string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2421 0 : if (!string) {
2422 0 : Py_DECREF(filter);
2423 0 : return NULL;
2424 : }
2425 :
2426 0 : list = PyList_New(0);
2427 0 : if (!list) {
2428 0 : Py_DECREF(filter);
2429 0 : state_fini(&state);
2430 0 : return NULL;
2431 : }
2432 :
2433 0 : n = i = 0;
2434 :
2435 0 : while (!count || n < count) {
2436 :
2437 0 : state_reset(&state);
2438 :
2439 0 : state.ptr = state.start;
2440 :
2441 0 : if (state.charsize == 1) {
2442 0 : status = sre_search(&state, PatternObject_GetCode(self));
2443 : } else {
2444 : #if defined(HAVE_UNICODE)
2445 0 : status = sre_usearch(&state, PatternObject_GetCode(self));
2446 : #endif
2447 : }
2448 :
2449 0 : if (PyErr_Occurred())
2450 0 : goto error;
2451 :
2452 0 : if (status <= 0) {
2453 0 : if (status == 0)
2454 0 : break;
2455 0 : pattern_error(status);
2456 0 : goto error;
2457 : }
2458 :
2459 0 : b = STATE_OFFSET(&state, state.start);
2460 0 : e = STATE_OFFSET(&state, state.ptr);
2461 :
2462 0 : if (i < b) {
2463 : /* get segment before this match */
2464 0 : item = PySequence_GetSlice(string, i, b);
2465 0 : if (!item)
2466 0 : goto error;
2467 0 : status = PyList_Append(list, item);
2468 0 : Py_DECREF(item);
2469 0 : if (status < 0)
2470 0 : goto error;
2471 :
2472 0 : } else if (i == b && i == e && n > 0)
2473 : /* ignore empty match on latest position */
2474 0 : goto next;
2475 :
2476 0 : if (filter_is_callable) {
2477 : /* pass match object through filter */
2478 0 : match = pattern_new_match(self, &state, 1);
2479 0 : if (!match)
2480 0 : goto error;
2481 0 : args = PyTuple_Pack(1, match);
2482 0 : if (!args) {
2483 0 : Py_DECREF(match);
2484 0 : goto error;
2485 : }
2486 0 : item = PyObject_CallObject(filter, args);
2487 0 : Py_DECREF(args);
2488 0 : Py_DECREF(match);
2489 0 : if (!item)
2490 0 : goto error;
2491 : } else {
2492 : /* filter is literal string */
2493 0 : item = filter;
2494 0 : Py_INCREF(item);
2495 : }
2496 :
2497 : /* add to list */
2498 0 : if (item != Py_None) {
2499 0 : status = PyList_Append(list, item);
2500 0 : Py_DECREF(item);
2501 0 : if (status < 0)
2502 0 : goto error;
2503 : }
2504 :
2505 0 : i = e;
2506 0 : n = n + 1;
2507 :
2508 : next:
2509 : /* move on */
2510 0 : if (state.ptr == state.end)
2511 0 : break;
2512 0 : if (state.ptr == state.start)
2513 0 : state.start = (void*) ((char*) state.ptr + state.charsize);
2514 : else
2515 0 : state.start = state.ptr;
2516 :
2517 : }
2518 :
2519 : /* get segment following last match */
2520 0 : if (i < state.endpos) {
2521 0 : item = PySequence_GetSlice(string, i, state.endpos);
2522 0 : if (!item)
2523 0 : goto error;
2524 0 : status = PyList_Append(list, item);
2525 0 : Py_DECREF(item);
2526 0 : if (status < 0)
2527 0 : goto error;
2528 : }
2529 :
2530 0 : state_fini(&state);
2531 :
2532 0 : Py_DECREF(filter);
2533 :
2534 : /* convert list to single string (also removes list) */
2535 0 : item = join_list(list, string);
2536 :
2537 0 : if (!item)
2538 0 : return NULL;
2539 :
2540 0 : if (subn)
2541 0 : return Py_BuildValue("Nn", item, n);
2542 :
2543 0 : return item;
2544 :
2545 : error:
2546 0 : Py_DECREF(list);
2547 0 : state_fini(&state);
2548 0 : Py_DECREF(filter);
2549 0 : return NULL;
2550 :
2551 : }
2552 :
2553 : static PyObject*
2554 0 : pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2555 : {
2556 : PyObject* ptemplate;
2557 : PyObject* string;
2558 0 : Py_ssize_t count = 0;
2559 : static char* kwlist[] = { "repl", "string", "count", NULL };
2560 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2561 : &ptemplate, &string, &count))
2562 0 : return NULL;
2563 :
2564 0 : return pattern_subx(self, ptemplate, string, count, 0);
2565 : }
2566 :
2567 : static PyObject*
2568 0 : pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2569 : {
2570 : PyObject* ptemplate;
2571 : PyObject* string;
2572 0 : Py_ssize_t count = 0;
2573 : static char* kwlist[] = { "repl", "string", "count", NULL };
2574 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2575 : &ptemplate, &string, &count))
2576 0 : return NULL;
2577 :
2578 0 : return pattern_subx(self, ptemplate, string, count, 1);
2579 : }
2580 :
2581 : static PyObject*
2582 0 : pattern_copy(PatternObject* self, PyObject *unused)
2583 : {
2584 : #ifdef USE_BUILTIN_COPY
2585 : PatternObject* copy;
2586 : int offset;
2587 :
2588 : copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2589 : if (!copy)
2590 : return NULL;
2591 :
2592 : offset = offsetof(PatternObject, groups);
2593 :
2594 : Py_XINCREF(self->groupindex);
2595 : Py_XINCREF(self->indexgroup);
2596 : Py_XINCREF(self->pattern);
2597 :
2598 : memcpy((char*) copy + offset, (char*) self + offset,
2599 : sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2600 : copy->weakreflist = NULL;
2601 :
2602 : return (PyObject*) copy;
2603 : #else
2604 0 : PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2605 0 : return NULL;
2606 : #endif
2607 : }
2608 :
2609 : static PyObject*
2610 0 : pattern_deepcopy(PatternObject* self, PyObject* memo)
2611 : {
2612 : #ifdef USE_BUILTIN_COPY
2613 : PatternObject* copy;
2614 :
2615 : copy = (PatternObject*) pattern_copy(self);
2616 : if (!copy)
2617 : return NULL;
2618 :
2619 : if (!deepcopy(©->groupindex, memo) ||
2620 : !deepcopy(©->indexgroup, memo) ||
2621 : !deepcopy(©->pattern, memo)) {
2622 : Py_DECREF(copy);
2623 : return NULL;
2624 : }
2625 :
2626 : #else
2627 0 : PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2628 0 : return NULL;
2629 : #endif
2630 : }
2631 :
2632 : PyDoc_STRVAR(pattern_match_doc,
2633 : "match(string[, pos[, endpos]]) --> match object or None.\n\
2634 : Matches zero or more characters at the beginning of the string");
2635 :
2636 : PyDoc_STRVAR(pattern_search_doc,
2637 : "search(string[, pos[, endpos]]) --> match object or None.\n\
2638 : Scan through string looking for a match, and return a corresponding\n\
2639 : match object instance. Return None if no position in the string matches.");
2640 :
2641 : PyDoc_STRVAR(pattern_split_doc,
2642 : "split(string[, maxsplit = 0]) --> list.\n\
2643 : Split string by the occurrences of pattern.");
2644 :
2645 : PyDoc_STRVAR(pattern_findall_doc,
2646 : "findall(string[, pos[, endpos]]) --> list.\n\
2647 : Return a list of all non-overlapping matches of pattern in string.");
2648 :
2649 : PyDoc_STRVAR(pattern_finditer_doc,
2650 : "finditer(string[, pos[, endpos]]) --> iterator.\n\
2651 : Return an iterator over all non-overlapping matches for the \n\
2652 : RE pattern in string. For each match, the iterator returns a\n\
2653 : match object.");
2654 :
2655 : PyDoc_STRVAR(pattern_sub_doc,
2656 : "sub(repl, string[, count = 0]) --> newstring\n\
2657 : Return the string obtained by replacing the leftmost non-overlapping\n\
2658 : occurrences of pattern in string by the replacement repl.");
2659 :
2660 : PyDoc_STRVAR(pattern_subn_doc,
2661 : "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2662 : Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2663 : the leftmost non-overlapping occurrences of pattern with the\n\
2664 : replacement repl.");
2665 :
2666 : PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2667 :
2668 : static PyMethodDef pattern_methods[] = {
2669 : {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2670 : pattern_match_doc},
2671 : {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2672 : pattern_search_doc},
2673 : {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2674 : pattern_sub_doc},
2675 : {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2676 : pattern_subn_doc},
2677 : {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2678 : pattern_split_doc},
2679 : {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2680 : pattern_findall_doc},
2681 : #if PY_VERSION_HEX >= 0x02020000
2682 : {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2683 : pattern_finditer_doc},
2684 : #endif
2685 : {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2686 : {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2687 : {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2688 : {NULL, NULL}
2689 : };
2690 :
2691 : #define PAT_OFF(x) offsetof(PatternObject, x)
2692 : static PyMemberDef pattern_members[] = {
2693 : {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2694 : {"flags", T_INT, PAT_OFF(flags), READONLY},
2695 : {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2696 : {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2697 : {NULL} /* Sentinel */
2698 : };
2699 :
2700 : statichere PyTypeObject Pattern_Type = {
2701 : PyObject_HEAD_INIT(NULL)
2702 : 0, "_" SRE_MODULE ".SRE_Pattern",
2703 : sizeof(PatternObject), sizeof(SRE_CODE),
2704 : (destructor)pattern_dealloc, /*tp_dealloc*/
2705 : 0, /* tp_print */
2706 : 0, /* tp_getattrn */
2707 : 0, /* tp_setattr */
2708 : 0, /* tp_compare */
2709 : 0, /* tp_repr */
2710 : 0, /* tp_as_number */
2711 : 0, /* tp_as_sequence */
2712 : 0, /* tp_as_mapping */
2713 : 0, /* tp_hash */
2714 : 0, /* tp_call */
2715 : 0, /* tp_str */
2716 : 0, /* tp_getattro */
2717 : 0, /* tp_setattro */
2718 : 0, /* tp_as_buffer */
2719 : Py_TPFLAGS_DEFAULT, /* tp_flags */
2720 : pattern_doc, /* tp_doc */
2721 : 0, /* tp_traverse */
2722 : 0, /* tp_clear */
2723 : 0, /* tp_richcompare */
2724 : offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2725 : 0, /* tp_iter */
2726 : 0, /* tp_iternext */
2727 : pattern_methods, /* tp_methods */
2728 : pattern_members, /* tp_members */
2729 : };
2730 :
2731 : static int _validate(PatternObject *self); /* Forward */
2732 :
2733 : static PyObject *
2734 849 : _compile(PyObject* self_, PyObject* args)
2735 : {
2736 : /* "compile" pattern descriptor to pattern object */
2737 :
2738 : PatternObject* self;
2739 : Py_ssize_t i, n;
2740 :
2741 : PyObject* pattern;
2742 849 : int flags = 0;
2743 : PyObject* code;
2744 849 : Py_ssize_t groups = 0;
2745 849 : PyObject* groupindex = NULL;
2746 849 : PyObject* indexgroup = NULL;
2747 849 : if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2748 : &PyList_Type, &code, &groups,
2749 : &groupindex, &indexgroup))
2750 0 : return NULL;
2751 :
2752 849 : n = PyList_GET_SIZE(code);
2753 : /* coverity[ampersand_in_size] */
2754 849 : self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2755 849 : if (!self)
2756 0 : return NULL;
2757 849 : self->weakreflist = NULL;
2758 849 : self->pattern = NULL;
2759 849 : self->groupindex = NULL;
2760 849 : self->indexgroup = NULL;
2761 :
2762 849 : self->codesize = n;
2763 :
2764 23640 : for (i = 0; i < n; i++) {
2765 22791 : PyObject *o = PyList_GET_ITEM(code, i);
2766 67977 : unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2767 45186 : : PyLong_AsUnsignedLong(o);
2768 22791 : if (value == (unsigned long)-1 && PyErr_Occurred()) {
2769 0 : if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2770 0 : PyErr_SetString(PyExc_OverflowError,
2771 : "regular expression code size limit exceeded");
2772 : }
2773 0 : break;
2774 : }
2775 22791 : self->code[i] = (SRE_CODE) value;
2776 22791 : if ((unsigned long) self->code[i] != value) {
2777 0 : PyErr_SetString(PyExc_OverflowError,
2778 : "regular expression code size limit exceeded");
2779 0 : break;
2780 : }
2781 : }
2782 :
2783 849 : if (PyErr_Occurred()) {
2784 0 : Py_DECREF(self);
2785 0 : return NULL;
2786 : }
2787 :
2788 849 : Py_INCREF(pattern);
2789 849 : self->pattern = pattern;
2790 :
2791 849 : self->flags = flags;
2792 :
2793 849 : self->groups = groups;
2794 :
2795 849 : Py_XINCREF(groupindex);
2796 849 : self->groupindex = groupindex;
2797 :
2798 849 : Py_XINCREF(indexgroup);
2799 849 : self->indexgroup = indexgroup;
2800 :
2801 849 : self->weakreflist = NULL;
2802 :
2803 849 : if (!_validate(self)) {
2804 0 : Py_DECREF(self);
2805 0 : return NULL;
2806 : }
2807 :
2808 849 : return (PyObject*) self;
2809 : }
2810 :
2811 : /* -------------------------------------------------------------------- */
2812 : /* Code validation */
2813 :
2814 : /* To learn more about this code, have a look at the _compile() function in
2815 : Lib/sre_compile.py. The validation functions below checks the code array
2816 : for conformance with the code patterns generated there.
2817 :
2818 : The nice thing about the generated code is that it is position-independent:
2819 : all jumps are relative jumps forward. Also, jumps don't cross each other:
2820 : the target of a later jump is always earlier than the target of an earlier
2821 : jump. IOW, this is okay:
2822 :
2823 : J---------J-------T--------T
2824 : \ \_____/ /
2825 : \______________________/
2826 :
2827 : but this is not:
2828 :
2829 : J---------J-------T--------T
2830 : \_________\_____/ /
2831 : \____________/
2832 :
2833 : It also helps that SRE_CODE is always an unsigned type.
2834 : */
2835 :
2836 : /* Defining this one enables tracing of the validator */
2837 : #undef VVERBOSE
2838 :
2839 : /* Trace macro for the validator */
2840 : #if defined(VVERBOSE)
2841 : #define VTRACE(v) printf v
2842 : #else
2843 : #define VTRACE(v) do {} while(0) /* do nothing */
2844 : #endif
2845 :
2846 : /* Report failure */
2847 : #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2848 :
2849 : /* Extract opcode, argument, or skip count from code array */
2850 : #define GET_OP \
2851 : do { \
2852 : VTRACE(("%p: ", code)); \
2853 : if (code >= end) FAIL; \
2854 : op = *code++; \
2855 : VTRACE(("%lu (op)\n", (unsigned long)op)); \
2856 : } while (0)
2857 : #define GET_ARG \
2858 : do { \
2859 : VTRACE(("%p= ", code)); \
2860 : if (code >= end) FAIL; \
2861 : arg = *code++; \
2862 : VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2863 : } while (0)
2864 : #define GET_SKIP_ADJ(adj) \
2865 : do { \
2866 : VTRACE(("%p= ", code)); \
2867 : if (code >= end) FAIL; \
2868 : skip = *code; \
2869 : VTRACE(("%lu (skip to %p)\n", \
2870 : (unsigned long)skip, code+skip)); \
2871 : if (skip-adj > end-code) \
2872 : FAIL; \
2873 : code++; \
2874 : } while (0)
2875 : #define GET_SKIP GET_SKIP_ADJ(0)
2876 :
2877 : static int
2878 642 : _validate_charset(SRE_CODE *code, SRE_CODE *end)
2879 : {
2880 : /* Some variables are manipulated by the macros above */
2881 : SRE_CODE op;
2882 : SRE_CODE arg;
2883 : SRE_CODE offset;
2884 : int i;
2885 :
2886 2214 : while (code < end) {
2887 930 : GET_OP;
2888 930 : switch (op) {
2889 :
2890 : case SRE_OP_NEGATE:
2891 81 : break;
2892 :
2893 : case SRE_OP_LITERAL:
2894 357 : GET_ARG;
2895 357 : break;
2896 :
2897 : case SRE_OP_RANGE:
2898 150 : GET_ARG;
2899 150 : GET_ARG;
2900 150 : break;
2901 :
2902 : case SRE_OP_CHARSET:
2903 198 : offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2904 198 : if (offset > end-code)
2905 0 : FAIL;
2906 198 : code += offset;
2907 198 : break;
2908 :
2909 : case SRE_OP_BIGCHARSET:
2910 0 : GET_ARG; /* Number of blocks */
2911 0 : offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2912 0 : if (offset > end-code)
2913 0 : FAIL;
2914 : /* Make sure that each byte points to a valid block */
2915 0 : for (i = 0; i < 256; i++) {
2916 0 : if (((unsigned char *)code)[i] >= arg)
2917 0 : FAIL;
2918 : }
2919 0 : code += offset;
2920 0 : offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2921 0 : if (offset > end-code)
2922 0 : FAIL;
2923 0 : code += offset;
2924 0 : break;
2925 :
2926 : case SRE_OP_CATEGORY:
2927 144 : GET_ARG;
2928 144 : switch (arg) {
2929 : case SRE_CATEGORY_DIGIT:
2930 : case SRE_CATEGORY_NOT_DIGIT:
2931 : case SRE_CATEGORY_SPACE:
2932 : case SRE_CATEGORY_NOT_SPACE:
2933 : case SRE_CATEGORY_WORD:
2934 : case SRE_CATEGORY_NOT_WORD:
2935 : case SRE_CATEGORY_LINEBREAK:
2936 : case SRE_CATEGORY_NOT_LINEBREAK:
2937 : case SRE_CATEGORY_LOC_WORD:
2938 : case SRE_CATEGORY_LOC_NOT_WORD:
2939 : case SRE_CATEGORY_UNI_DIGIT:
2940 : case SRE_CATEGORY_UNI_NOT_DIGIT:
2941 : case SRE_CATEGORY_UNI_SPACE:
2942 : case SRE_CATEGORY_UNI_NOT_SPACE:
2943 : case SRE_CATEGORY_UNI_WORD:
2944 : case SRE_CATEGORY_UNI_NOT_WORD:
2945 : case SRE_CATEGORY_UNI_LINEBREAK:
2946 : case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2947 144 : break;
2948 : default:
2949 0 : FAIL;
2950 : }
2951 144 : break;
2952 :
2953 : default:
2954 0 : FAIL;
2955 :
2956 : }
2957 : }
2958 :
2959 642 : return 1;
2960 : }
2961 :
2962 : static int
2963 1782 : _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2964 : {
2965 : /* Some variables are manipulated by the macros above */
2966 : SRE_CODE op;
2967 : SRE_CODE arg;
2968 : SRE_CODE skip;
2969 :
2970 : VTRACE(("code=%p, end=%p\n", code, end));
2971 :
2972 1782 : if (code > end)
2973 0 : FAIL;
2974 :
2975 7983 : while (code < end) {
2976 4419 : GET_OP;
2977 4419 : switch (op) {
2978 :
2979 : case SRE_OP_MARK:
2980 : /* We don't check whether marks are properly nested; the
2981 : sre_match() code is robust even if they don't, and the worst
2982 : you can get is nonsensical match results. */
2983 366 : GET_ARG;
2984 366 : if (arg > 2*groups+1) {
2985 : VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2986 0 : FAIL;
2987 : }
2988 366 : break;
2989 :
2990 : case SRE_OP_LITERAL:
2991 : case SRE_OP_NOT_LITERAL:
2992 : case SRE_OP_LITERAL_IGNORE:
2993 : case SRE_OP_NOT_LITERAL_IGNORE:
2994 1818 : GET_ARG;
2995 : /* The arg is just a character, nothing to check */
2996 1818 : break;
2997 :
2998 : case SRE_OP_SUCCESS:
2999 : case SRE_OP_FAILURE:
3000 : /* Nothing to check; these normally end the matching process */
3001 0 : break;
3002 :
3003 : case SRE_OP_AT:
3004 27 : GET_ARG;
3005 27 : switch (arg) {
3006 : case SRE_AT_BEGINNING:
3007 : case SRE_AT_BEGINNING_STRING:
3008 : case SRE_AT_BEGINNING_LINE:
3009 : case SRE_AT_END:
3010 : case SRE_AT_END_LINE:
3011 : case SRE_AT_END_STRING:
3012 : case SRE_AT_BOUNDARY:
3013 : case SRE_AT_NON_BOUNDARY:
3014 : case SRE_AT_LOC_BOUNDARY:
3015 : case SRE_AT_LOC_NON_BOUNDARY:
3016 : case SRE_AT_UNI_BOUNDARY:
3017 : case SRE_AT_UNI_NON_BOUNDARY:
3018 27 : break;
3019 : default:
3020 0 : FAIL;
3021 : }
3022 27 : break;
3023 :
3024 : case SRE_OP_ANY:
3025 : case SRE_OP_ANY_ALL:
3026 : /* These have no operands */
3027 54 : break;
3028 :
3029 : case SRE_OP_IN:
3030 : case SRE_OP_IN_IGNORE:
3031 618 : GET_SKIP;
3032 : /* Stop 1 before the end; we check the FAILURE below */
3033 618 : if (!_validate_charset(code, code+skip-2))
3034 0 : FAIL;
3035 618 : if (code[skip-2] != SRE_OP_FAILURE)
3036 0 : FAIL;
3037 618 : code += skip-1;
3038 618 : break;
3039 :
3040 : case SRE_OP_INFO:
3041 : {
3042 : /* A minimal info field is
3043 : <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3044 : If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3045 : more follows. */
3046 : SRE_CODE flags, i;
3047 : SRE_CODE *newcode;
3048 837 : GET_SKIP;
3049 837 : newcode = code+skip-1;
3050 837 : GET_ARG; flags = arg;
3051 837 : GET_ARG; /* min */
3052 837 : GET_ARG; /* max */
3053 : /* Check that only valid flags are present */
3054 837 : if ((flags & ~(SRE_INFO_PREFIX |
3055 : SRE_INFO_LITERAL |
3056 : SRE_INFO_CHARSET)) != 0)
3057 0 : FAIL;
3058 : /* PREFIX and CHARSET are mutually exclusive */
3059 1497 : if ((flags & SRE_INFO_PREFIX) &&
3060 660 : (flags & SRE_INFO_CHARSET))
3061 0 : FAIL;
3062 : /* LITERAL implies PREFIX */
3063 1464 : if ((flags & SRE_INFO_LITERAL) &&
3064 627 : !(flags & SRE_INFO_PREFIX))
3065 0 : FAIL;
3066 : /* Validate the prefix */
3067 837 : if (flags & SRE_INFO_PREFIX) {
3068 : SRE_CODE prefix_len;
3069 660 : GET_ARG; prefix_len = arg;
3070 660 : GET_ARG; /* prefix skip */
3071 : /* Here comes the prefix string */
3072 660 : if (prefix_len > newcode-code)
3073 0 : FAIL;
3074 660 : code += prefix_len;
3075 : /* And here comes the overlap table */
3076 660 : if (prefix_len > newcode-code)
3077 0 : FAIL;
3078 : /* Each overlap value should be < prefix_len */
3079 1953 : for (i = 0; i < prefix_len; i++) {
3080 1293 : if (code[i] >= prefix_len)
3081 0 : FAIL;
3082 : }
3083 660 : code += prefix_len;
3084 : }
3085 : /* Validate the charset */
3086 837 : if (flags & SRE_INFO_CHARSET) {
3087 24 : if (!_validate_charset(code, newcode-1))
3088 0 : FAIL;
3089 24 : if (newcode[-1] != SRE_OP_FAILURE)
3090 0 : FAIL;
3091 24 : code = newcode;
3092 : }
3093 813 : else if (code != newcode) {
3094 : VTRACE(("code=%p, newcode=%p\n", code, newcode));
3095 0 : FAIL;
3096 : }
3097 : }
3098 837 : break;
3099 :
3100 : case SRE_OP_BRANCH:
3101 : {
3102 114 : SRE_CODE *target = NULL;
3103 : for (;;) {
3104 462 : GET_SKIP;
3105 462 : if (skip == 0)
3106 114 : break;
3107 : /* Stop 2 before the end; we check the JUMP below */
3108 348 : if (!_validate_inner(code, code+skip-3, groups))
3109 0 : FAIL;
3110 348 : code += skip-3;
3111 : /* Check that it ends with a JUMP, and that each JUMP
3112 : has the same target */
3113 348 : GET_OP;
3114 348 : if (op != SRE_OP_JUMP)
3115 0 : FAIL;
3116 348 : GET_SKIP;
3117 348 : if (target == NULL)
3118 114 : target = code+skip-1;
3119 234 : else if (code+skip-1 != target)
3120 0 : FAIL;
3121 348 : }
3122 : }
3123 114 : break;
3124 :
3125 : case SRE_OP_REPEAT_ONE:
3126 : case SRE_OP_MIN_REPEAT_ONE:
3127 : {
3128 : SRE_CODE min, max;
3129 519 : GET_SKIP;
3130 519 : GET_ARG; min = arg;
3131 519 : GET_ARG; max = arg;
3132 519 : if (min > max)
3133 0 : FAIL;
3134 : if (max > SRE_MAXREPEAT)
3135 : FAIL;
3136 519 : if (!_validate_inner(code, code+skip-4, groups))
3137 0 : FAIL;
3138 519 : code += skip-4;
3139 519 : GET_OP;
3140 519 : if (op != SRE_OP_SUCCESS)
3141 0 : FAIL;
3142 : }
3143 519 : break;
3144 :
3145 : case SRE_OP_REPEAT:
3146 : {
3147 : SRE_CODE min, max;
3148 51 : GET_SKIP;
3149 51 : GET_ARG; min = arg;
3150 51 : GET_ARG; max = arg;
3151 51 : if (min > max)
3152 0 : FAIL;
3153 : if (max > SRE_MAXREPEAT)
3154 : FAIL;
3155 51 : if (!_validate_inner(code, code+skip-3, groups))
3156 0 : FAIL;
3157 51 : code += skip-3;
3158 51 : GET_OP;
3159 51 : if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3160 0 : FAIL;
3161 : }
3162 51 : break;
3163 :
3164 : case SRE_OP_GROUPREF:
3165 : case SRE_OP_GROUPREF_IGNORE:
3166 0 : GET_ARG;
3167 0 : if (arg >= groups)
3168 0 : FAIL;
3169 0 : break;
3170 :
3171 : case SRE_OP_GROUPREF_EXISTS:
3172 : /* The regex syntax for this is: '(?(group)then|else)', where
3173 : 'group' is either an integer group number or a group name,
3174 : 'then' and 'else' are sub-regexes, and 'else' is optional. */
3175 0 : GET_ARG;
3176 0 : if (arg >= groups)
3177 0 : FAIL;
3178 0 : GET_SKIP_ADJ(1);
3179 0 : code--; /* The skip is relative to the first arg! */
3180 : /* There are two possibilities here: if there is both a 'then'
3181 : part and an 'else' part, the generated code looks like:
3182 :
3183 : GROUPREF_EXISTS
3184 : <group>
3185 : <skipyes>
3186 : ...then part...
3187 : JUMP
3188 : <skipno>
3189 : (<skipyes> jumps here)
3190 : ...else part...
3191 : (<skipno> jumps here)
3192 :
3193 : If there is only a 'then' part, it looks like:
3194 :
3195 : GROUPREF_EXISTS
3196 : <group>
3197 : <skip>
3198 : ...then part...
3199 : (<skip> jumps here)
3200 :
3201 : There is no direct way to decide which it is, and we don't want
3202 : to allow arbitrary jumps anywhere in the code; so we just look
3203 : for a JUMP opcode preceding our skip target.
3204 : */
3205 0 : if (skip >= 3 && skip-3 < end-code &&
3206 0 : code[skip-3] == SRE_OP_JUMP)
3207 : {
3208 : VTRACE(("both then and else parts present\n"));
3209 0 : if (!_validate_inner(code+1, code+skip-3, groups))
3210 0 : FAIL;
3211 0 : code += skip-2; /* Position after JUMP, at <skipno> */
3212 0 : GET_SKIP;
3213 0 : if (!_validate_inner(code, code+skip-1, groups))
3214 0 : FAIL;
3215 0 : code += skip-1;
3216 : }
3217 : else {
3218 : VTRACE(("only a then part present\n"));
3219 0 : if (!_validate_inner(code+1, code+skip-1, groups))
3220 0 : FAIL;
3221 0 : code += skip-1;
3222 : }
3223 0 : break;
3224 :
3225 : case SRE_OP_ASSERT:
3226 : case SRE_OP_ASSERT_NOT:
3227 15 : GET_SKIP;
3228 15 : GET_ARG; /* 0 for lookahead, width for lookbehind */
3229 15 : code--; /* Back up over arg to simplify math below */
3230 15 : if (arg & 0x80000000)
3231 0 : FAIL; /* Width too large */
3232 : /* Stop 1 before the end; we check the SUCCESS below */
3233 15 : if (!_validate_inner(code+1, code+skip-2, groups))
3234 0 : FAIL;
3235 15 : code += skip-2;
3236 15 : GET_OP;
3237 15 : if (op != SRE_OP_SUCCESS)
3238 0 : FAIL;
3239 15 : break;
3240 :
3241 : default:
3242 0 : FAIL;
3243 :
3244 : }
3245 : }
3246 :
3247 : VTRACE(("okay\n"));
3248 1782 : return 1;
3249 : }
3250 :
3251 : static int
3252 849 : _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3253 : {
3254 849 : if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3255 0 : FAIL;
3256 849 : if (groups == 0) /* fix for simplejson */
3257 810 : groups = 100; /* 100 groups should always be safe */
3258 849 : return _validate_inner(code, end-1, groups);
3259 : }
3260 :
3261 : static int
3262 849 : _validate(PatternObject *self)
3263 : {
3264 849 : if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3265 : {
3266 0 : PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3267 0 : return 0;
3268 : }
3269 : else
3270 : VTRACE(("Success!\n"));
3271 849 : return 1;
3272 : }
3273 :
3274 : /* -------------------------------------------------------------------- */
3275 : /* match methods */
3276 :
3277 : static void
3278 2823 : match_dealloc(MatchObject* self)
3279 : {
3280 2823 : Py_XDECREF(self->regs);
3281 2823 : Py_XDECREF(self->string);
3282 2823 : Py_DECREF(self->pattern);
3283 2823 : PyObject_DEL(self);
3284 2823 : }
3285 :
3286 : static PyObject*
3287 2823 : match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3288 : {
3289 2823 : if (index < 0 || index >= self->groups) {
3290 : /* raise IndexError if we were given a bad group number */
3291 0 : PyErr_SetString(
3292 : PyExc_IndexError,
3293 : "no such group"
3294 : );
3295 0 : return NULL;
3296 : }
3297 :
3298 2823 : index *= 2;
3299 :
3300 2823 : if (self->string == Py_None || self->mark[index] < 0) {
3301 : /* return default value if the string or group is undefined */
3302 0 : Py_INCREF(def);
3303 0 : return def;
3304 : }
3305 :
3306 2823 : return PySequence_GetSlice(
3307 2823 : self->string, self->mark[index], self->mark[index+1]
3308 : );
3309 : }
3310 :
3311 : static Py_ssize_t
3312 2886 : match_getindex(MatchObject* self, PyObject* index)
3313 : {
3314 : Py_ssize_t i;
3315 :
3316 2886 : if (PyInt_Check(index) || PyLong_Check(index))
3317 2886 : return PyInt_AsSsize_t(index);
3318 :
3319 0 : i = -1;
3320 :
3321 0 : if (self->pattern->groupindex) {
3322 0 : index = PyObject_GetItem(self->pattern->groupindex, index);
3323 0 : if (index) {
3324 0 : if (PyInt_Check(index) || PyLong_Check(index))
3325 0 : i = PyInt_AsSsize_t(index);
3326 0 : Py_DECREF(index);
3327 : } else
3328 0 : PyErr_Clear();
3329 : }
3330 :
3331 0 : return i;
3332 : }
3333 :
3334 : static PyObject*
3335 2823 : match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3336 : {
3337 2823 : return match_getslice_by_index(self, match_getindex(self, index), def);
3338 : }
3339 :
3340 : static PyObject*
3341 0 : match_expand(MatchObject* self, PyObject* ptemplate)
3342 : {
3343 : /* delegate to Python code */
3344 0 : return call(
3345 : SRE_PY_MODULE, "_expand",
3346 : PyTuple_Pack(3, self->pattern, self, ptemplate)
3347 : );
3348 : }
3349 :
3350 : static PyObject*
3351 2823 : match_group(MatchObject* self, PyObject* args)
3352 : {
3353 : PyObject* result;
3354 : Py_ssize_t i, size;
3355 :
3356 2823 : size = PyTuple_GET_SIZE(args);
3357 :
3358 2823 : switch (size) {
3359 : case 0:
3360 0 : result = match_getslice(self, Py_False, Py_None);
3361 0 : break;
3362 : case 1:
3363 2823 : result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3364 2823 : break;
3365 : default:
3366 : /* fetch multiple items */
3367 0 : result = PyTuple_New(size);
3368 0 : if (!result)
3369 0 : return NULL;
3370 0 : for (i = 0; i < size; i++) {
3371 0 : PyObject* item = match_getslice(
3372 : self, PyTuple_GET_ITEM(args, i), Py_None
3373 : );
3374 0 : if (!item) {
3375 0 : Py_DECREF(result);
3376 0 : return NULL;
3377 : }
3378 0 : PyTuple_SET_ITEM(result, i, item);
3379 : }
3380 0 : break;
3381 : }
3382 2823 : return result;
3383 : }
3384 :
3385 : static PyObject*
3386 0 : match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3387 : {
3388 : PyObject* result;
3389 : Py_ssize_t index;
3390 :
3391 0 : PyObject* def = Py_None;
3392 : static char* kwlist[] = { "default", NULL };
3393 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3394 0 : return NULL;
3395 :
3396 0 : result = PyTuple_New(self->groups-1);
3397 0 : if (!result)
3398 0 : return NULL;
3399 :
3400 0 : for (index = 1; index < self->groups; index++) {
3401 : PyObject* item;
3402 0 : item = match_getslice_by_index(self, index, def);
3403 0 : if (!item) {
3404 0 : Py_DECREF(result);
3405 0 : return NULL;
3406 : }
3407 0 : PyTuple_SET_ITEM(result, index-1, item);
3408 : }
3409 :
3410 0 : return result;
3411 : }
3412 :
3413 : static PyObject*
3414 0 : match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3415 : {
3416 : PyObject* result;
3417 : PyObject* keys;
3418 : Py_ssize_t index;
3419 :
3420 0 : PyObject* def = Py_None;
3421 : static char* kwlist[] = { "default", NULL };
3422 0 : if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3423 0 : return NULL;
3424 :
3425 0 : result = PyDict_New();
3426 0 : if (!result || !self->pattern->groupindex)
3427 0 : return result;
3428 :
3429 0 : keys = PyMapping_Keys(self->pattern->groupindex);
3430 0 : if (!keys)
3431 0 : goto failed;
3432 :
3433 0 : for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3434 : int status;
3435 : PyObject* key;
3436 : PyObject* value;
3437 0 : key = PyList_GET_ITEM(keys, index);
3438 0 : if (!key)
3439 0 : goto failed;
3440 0 : value = match_getslice(self, key, def);
3441 0 : if (!value)
3442 0 : goto failed;
3443 0 : status = PyDict_SetItem(result, key, value);
3444 0 : Py_DECREF(value);
3445 0 : if (status < 0)
3446 0 : goto failed;
3447 : }
3448 :
3449 0 : Py_DECREF(keys);
3450 :
3451 0 : return result;
3452 :
3453 : failed:
3454 0 : Py_XDECREF(keys);
3455 0 : Py_DECREF(result);
3456 0 : return NULL;
3457 : }
3458 :
3459 : static PyObject*
3460 0 : match_start(MatchObject* self, PyObject* args)
3461 : {
3462 : Py_ssize_t index;
3463 :
3464 0 : PyObject* index_ = Py_False; /* zero */
3465 0 : if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3466 0 : return NULL;
3467 :
3468 0 : index = match_getindex(self, index_);
3469 :
3470 0 : if (index < 0 || index >= self->groups) {
3471 0 : PyErr_SetString(
3472 : PyExc_IndexError,
3473 : "no such group"
3474 : );
3475 0 : return NULL;
3476 : }
3477 :
3478 : /* mark is -1 if group is undefined */
3479 0 : return PyInt_FromSsize_t(self->mark[index*2]);
3480 : }
3481 :
3482 : static PyObject*
3483 63 : match_end(MatchObject* self, PyObject* args)
3484 : {
3485 : Py_ssize_t index;
3486 :
3487 63 : PyObject* index_ = Py_False; /* zero */
3488 63 : if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3489 0 : return NULL;
3490 :
3491 63 : index = match_getindex(self, index_);
3492 :
3493 63 : if (index < 0 || index >= self->groups) {
3494 0 : PyErr_SetString(
3495 : PyExc_IndexError,
3496 : "no such group"
3497 : );
3498 0 : return NULL;
3499 : }
3500 :
3501 : /* mark is -1 if group is undefined */
3502 63 : return PyInt_FromSsize_t(self->mark[index*2+1]);
3503 : }
3504 :
3505 : LOCAL(PyObject*)
3506 0 : _pair(Py_ssize_t i1, Py_ssize_t i2)
3507 : {
3508 : PyObject* pair;
3509 : PyObject* item;
3510 :
3511 0 : pair = PyTuple_New(2);
3512 0 : if (!pair)
3513 0 : return NULL;
3514 :
3515 0 : item = PyInt_FromSsize_t(i1);
3516 0 : if (!item)
3517 0 : goto error;
3518 0 : PyTuple_SET_ITEM(pair, 0, item);
3519 :
3520 0 : item = PyInt_FromSsize_t(i2);
3521 0 : if (!item)
3522 0 : goto error;
3523 0 : PyTuple_SET_ITEM(pair, 1, item);
3524 :
3525 0 : return pair;
3526 :
3527 : error:
3528 0 : Py_DECREF(pair);
3529 0 : return NULL;
3530 : }
3531 :
3532 : static PyObject*
3533 0 : match_span(MatchObject* self, PyObject* args)
3534 : {
3535 : Py_ssize_t index;
3536 :
3537 0 : PyObject* index_ = Py_False; /* zero */
3538 0 : if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3539 0 : return NULL;
3540 :
3541 0 : index = match_getindex(self, index_);
3542 :
3543 0 : if (index < 0 || index >= self->groups) {
3544 0 : PyErr_SetString(
3545 : PyExc_IndexError,
3546 : "no such group"
3547 : );
3548 0 : return NULL;
3549 : }
3550 :
3551 : /* marks are -1 if group is undefined */
3552 0 : return _pair(self->mark[index*2], self->mark[index*2+1]);
3553 : }
3554 :
3555 : static PyObject*
3556 0 : match_regs(MatchObject* self)
3557 : {
3558 : PyObject* regs;
3559 : PyObject* item;
3560 : Py_ssize_t index;
3561 :
3562 0 : regs = PyTuple_New(self->groups);
3563 0 : if (!regs)
3564 0 : return NULL;
3565 :
3566 0 : for (index = 0; index < self->groups; index++) {
3567 0 : item = _pair(self->mark[index*2], self->mark[index*2+1]);
3568 0 : if (!item) {
3569 0 : Py_DECREF(regs);
3570 0 : return NULL;
3571 : }
3572 0 : PyTuple_SET_ITEM(regs, index, item);
3573 : }
3574 :
3575 0 : Py_INCREF(regs);
3576 0 : self->regs = regs;
3577 :
3578 0 : return regs;
3579 : }
3580 :
3581 : static PyObject*
3582 0 : match_copy(MatchObject* self, PyObject *unused)
3583 : {
3584 : #ifdef USE_BUILTIN_COPY
3585 : MatchObject* copy;
3586 : Py_ssize_t slots, offset;
3587 :
3588 : slots = 2 * (self->pattern->groups+1);
3589 :
3590 : copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3591 : if (!copy)
3592 : return NULL;
3593 :
3594 : /* this value a constant, but any compiler should be able to
3595 : figure that out all by itself */
3596 : offset = offsetof(MatchObject, string);
3597 :
3598 : Py_XINCREF(self->pattern);
3599 : Py_XINCREF(self->string);
3600 : Py_XINCREF(self->regs);
3601 :
3602 : memcpy((char*) copy + offset, (char*) self + offset,
3603 : sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3604 :
3605 : return (PyObject*) copy;
3606 : #else
3607 0 : PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3608 0 : return NULL;
3609 : #endif
3610 : }
3611 :
3612 : static PyObject*
3613 0 : match_deepcopy(MatchObject* self, PyObject* memo)
3614 : {
3615 : #ifdef USE_BUILTIN_COPY
3616 : MatchObject* copy;
3617 :
3618 : copy = (MatchObject*) match_copy(self);
3619 : if (!copy)
3620 : return NULL;
3621 :
3622 : if (!deepcopy((PyObject**) ©->pattern, memo) ||
3623 : !deepcopy(©->string, memo) ||
3624 : !deepcopy(©->regs, memo)) {
3625 : Py_DECREF(copy);
3626 : return NULL;
3627 : }
3628 :
3629 : #else
3630 0 : PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3631 0 : return NULL;
3632 : #endif
3633 : }
3634 :
3635 : PyDoc_STRVAR(match_doc,
3636 : "The result of re.match() and re.search().\n\
3637 : Match objects always have a boolean value of True.");
3638 :
3639 : PyDoc_STRVAR(match_group_doc,
3640 : "group([group1, ...]) -> str or tuple.\n\
3641 : Return subgroup(s) of the match by indices or names.\n\
3642 : For 0 returns the entire match.");
3643 :
3644 : PyDoc_STRVAR(match_start_doc,
3645 : "start([group=0]) -> int.\n\
3646 : Return index of the start of the substring matched by group.");
3647 :
3648 : PyDoc_STRVAR(match_end_doc,
3649 : "end([group=0]) -> int.\n\
3650 : Return index of the end of the substring matched by group.");
3651 :
3652 : PyDoc_STRVAR(match_span_doc,
3653 : "span([group]) -> tuple.\n\
3654 : For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3655 :
3656 : PyDoc_STRVAR(match_groups_doc,
3657 : "groups([default=None]) -> tuple.\n\
3658 : Return a tuple containing all the subgroups of the match, from 1.\n\
3659 : The default argument is used for groups\n\
3660 : that did not participate in the match");
3661 :
3662 : PyDoc_STRVAR(match_groupdict_doc,
3663 : "groupdict([default=None]) -> dict.\n\
3664 : Return a dictionary containing all the named subgroups of the match,\n\
3665 : keyed by the subgroup name. The default argument is used for groups\n\
3666 : that did not participate in the match");
3667 :
3668 : PyDoc_STRVAR(match_expand_doc,
3669 : "expand(template) -> str.\n\
3670 : Return the string obtained by doing backslash substitution\n\
3671 : on the string template, as done by the sub() method.");
3672 :
3673 : static PyMethodDef match_methods[] = {
3674 : {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3675 : {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3676 : {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3677 : {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3678 : {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3679 : match_groups_doc},
3680 : {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3681 : match_groupdict_doc},
3682 : {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
3683 : {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3684 : {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3685 : {NULL, NULL}
3686 : };
3687 :
3688 : static PyObject *
3689 0 : match_lastindex_get(MatchObject *self)
3690 : {
3691 0 : if (self->lastindex >= 0)
3692 0 : return PyInt_FromSsize_t(self->lastindex);
3693 0 : Py_INCREF(Py_None);
3694 0 : return Py_None;
3695 : }
3696 :
3697 : static PyObject *
3698 0 : match_lastgroup_get(MatchObject *self)
3699 : {
3700 0 : if (self->pattern->indexgroup && self->lastindex >= 0) {
3701 0 : PyObject* result = PySequence_GetItem(
3702 0 : self->pattern->indexgroup, self->lastindex
3703 : );
3704 0 : if (result)
3705 0 : return result;
3706 0 : PyErr_Clear();
3707 : }
3708 0 : Py_INCREF(Py_None);
3709 0 : return Py_None;
3710 : }
3711 :
3712 : static PyObject *
3713 0 : match_regs_get(MatchObject *self)
3714 : {
3715 0 : if (self->regs) {
3716 0 : Py_INCREF(self->regs);
3717 0 : return self->regs;
3718 : } else
3719 0 : return match_regs(self);
3720 : }
3721 :
3722 : static PyGetSetDef match_getset[] = {
3723 : {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3724 : {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3725 : {"regs", (getter)match_regs_get, (setter)NULL},
3726 : {NULL}
3727 : };
3728 :
3729 : #define MATCH_OFF(x) offsetof(MatchObject, x)
3730 : static PyMemberDef match_members[] = {
3731 : {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3732 : {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3733 : {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3734 : {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3735 : {NULL}
3736 : };
3737 :
3738 :
3739 : /* FIXME: implement setattr("string", None) as a special case (to
3740 : detach the associated string, if any */
3741 :
3742 : static PyTypeObject Match_Type = {
3743 : PyVarObject_HEAD_INIT(NULL, 0)
3744 : "_" SRE_MODULE ".SRE_Match",
3745 : sizeof(MatchObject), sizeof(Py_ssize_t),
3746 : (destructor)match_dealloc, /* tp_dealloc */
3747 : 0, /* tp_print */
3748 : 0, /* tp_getattr */
3749 : 0, /* tp_setattr */
3750 : 0, /* tp_compare */
3751 : 0, /* tp_repr */
3752 : 0, /* tp_as_number */
3753 : 0, /* tp_as_sequence */
3754 : 0, /* tp_as_mapping */
3755 : 0, /* tp_hash */
3756 : 0, /* tp_call */
3757 : 0, /* tp_str */
3758 : 0, /* tp_getattro */
3759 : 0, /* tp_setattro */
3760 : 0, /* tp_as_buffer */
3761 : Py_TPFLAGS_DEFAULT,
3762 : match_doc, /* tp_doc */
3763 : 0, /* tp_traverse */
3764 : 0, /* tp_clear */
3765 : 0, /* tp_richcompare */
3766 : 0, /* tp_weaklistoffset */
3767 : 0, /* tp_iter */
3768 : 0, /* tp_iternext */
3769 : match_methods, /* tp_methods */
3770 : match_members, /* tp_members */
3771 : match_getset, /* tp_getset */
3772 : };
3773 :
3774 : static PyObject*
3775 5310 : pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3776 : {
3777 : /* create match object (from state object) */
3778 :
3779 : MatchObject* match;
3780 : Py_ssize_t i, j;
3781 : char* base;
3782 : int n;
3783 :
3784 5310 : if (status > 0) {
3785 :
3786 : /* create match object (with room for extra group marks) */
3787 : /* coverity[ampersand_in_size] */
3788 2823 : match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3789 : 2*(pattern->groups+1));
3790 2823 : if (!match)
3791 0 : return NULL;
3792 :
3793 2823 : Py_INCREF(pattern);
3794 2823 : match->pattern = pattern;
3795 :
3796 2823 : Py_INCREF(state->string);
3797 2823 : match->string = state->string;
3798 :
3799 2823 : match->regs = NULL;
3800 2823 : match->groups = pattern->groups+1;
3801 :
3802 : /* fill in group slices */
3803 :
3804 2823 : base = (char*) state->beginning;
3805 2823 : n = state->charsize;
3806 :
3807 2823 : match->mark[0] = ((char*) state->start - base) / n;
3808 2823 : match->mark[1] = ((char*) state->ptr - base) / n;
3809 :
3810 5583 : for (i = j = 0; i < pattern->groups; i++, j+=2)
3811 2760 : if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3812 2760 : match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3813 2760 : match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3814 : } else
3815 0 : match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3816 :
3817 2823 : match->pos = state->pos;
3818 2823 : match->endpos = state->endpos;
3819 :
3820 2823 : match->lastindex = state->lastindex;
3821 :
3822 2823 : return (PyObject*) match;
3823 :
3824 2487 : } else if (status == 0) {
3825 :
3826 : /* no match */
3827 2487 : Py_INCREF(Py_None);
3828 2487 : return Py_None;
3829 :
3830 : }
3831 :
3832 : /* internal error */
3833 0 : pattern_error(status);
3834 0 : return NULL;
3835 : }
3836 :
3837 :
3838 : /* -------------------------------------------------------------------- */
3839 : /* scanner methods (experimental) */
3840 :
3841 : static void
3842 753 : scanner_dealloc(ScannerObject* self)
3843 : {
3844 753 : state_fini(&self->state);
3845 753 : Py_XDECREF(self->pattern);
3846 753 : PyObject_DEL(self);
3847 753 : }
3848 :
3849 : static PyObject*
3850 0 : scanner_match(ScannerObject* self, PyObject *unused)
3851 : {
3852 0 : SRE_STATE* state = &self->state;
3853 : PyObject* match;
3854 : int status;
3855 :
3856 0 : if (state->start == NULL)
3857 0 : Py_RETURN_NONE;
3858 :
3859 0 : state_reset(state);
3860 :
3861 0 : state->ptr = state->start;
3862 :
3863 0 : if (state->charsize == 1) {
3864 0 : status = sre_match(state, PatternObject_GetCode(self->pattern));
3865 : } else {
3866 : #if defined(HAVE_UNICODE)
3867 0 : status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3868 : #endif
3869 : }
3870 0 : if (PyErr_Occurred())
3871 0 : return NULL;
3872 :
3873 0 : match = pattern_new_match((PatternObject*) self->pattern,
3874 : state, status);
3875 :
3876 0 : if (status == 0)
3877 0 : state->start = NULL;
3878 0 : else if (state->ptr != state->start)
3879 0 : state->start = state->ptr;
3880 0 : else if (state->ptr != state->end)
3881 0 : state->start = (void*) ((char*) state->ptr + state->charsize);
3882 : else
3883 0 : state->start = NULL;
3884 :
3885 0 : return match;
3886 : }
3887 :
3888 :
3889 : static PyObject*
3890 3165 : scanner_search(ScannerObject* self, PyObject *unused)
3891 : {
3892 3165 : SRE_STATE* state = &self->state;
3893 : PyObject* match;
3894 : int status;
3895 :
3896 3165 : if (state->start == NULL)
3897 0 : Py_RETURN_NONE;
3898 :
3899 3165 : state_reset(state);
3900 :
3901 3165 : state->ptr = state->start;
3902 :
3903 3165 : if (state->charsize == 1) {
3904 3165 : status = sre_search(state, PatternObject_GetCode(self->pattern));
3905 : } else {
3906 : #if defined(HAVE_UNICODE)
3907 0 : status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3908 : #endif
3909 : }
3910 3165 : if (PyErr_Occurred())
3911 0 : return NULL;
3912 :
3913 3165 : match = pattern_new_match((PatternObject*) self->pattern,
3914 : state, status);
3915 :
3916 3165 : if (status == 0)
3917 405 : state->start = NULL;
3918 2760 : else if (state->ptr != state->start)
3919 2760 : state->start = state->ptr;
3920 0 : else if (state->ptr != state->end)
3921 0 : state->start = (void*) ((char*) state->ptr + state->charsize);
3922 : else
3923 0 : state->start = NULL;
3924 :
3925 3165 : return match;
3926 : }
3927 :
3928 : static PyMethodDef scanner_methods[] = {
3929 : {"match", (PyCFunction) scanner_match, METH_NOARGS},
3930 : {"search", (PyCFunction) scanner_search, METH_NOARGS},
3931 : {NULL, NULL}
3932 : };
3933 :
3934 : #define SCAN_OFF(x) offsetof(ScannerObject, x)
3935 : static PyMemberDef scanner_members[] = {
3936 : {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3937 : {NULL} /* Sentinel */
3938 : };
3939 :
3940 : statichere PyTypeObject Scanner_Type = {
3941 : PyObject_HEAD_INIT(NULL)
3942 : 0, "_" SRE_MODULE ".SRE_Scanner",
3943 : sizeof(ScannerObject), 0,
3944 : (destructor)scanner_dealloc, /*tp_dealloc*/
3945 : 0, /* tp_print */
3946 : 0, /* tp_getattr */
3947 : 0, /* tp_setattr */
3948 : 0, /* tp_reserved */
3949 : 0, /* tp_repr */
3950 : 0, /* tp_as_number */
3951 : 0, /* tp_as_sequence */
3952 : 0, /* tp_as_mapping */
3953 : 0, /* tp_hash */
3954 : 0, /* tp_call */
3955 : 0, /* tp_str */
3956 : 0, /* tp_getattro */
3957 : 0, /* tp_setattro */
3958 : 0, /* tp_as_buffer */
3959 : Py_TPFLAGS_DEFAULT, /* tp_flags */
3960 : 0, /* tp_doc */
3961 : 0, /* tp_traverse */
3962 : 0, /* tp_clear */
3963 : 0, /* tp_richcompare */
3964 : 0, /* tp_weaklistoffset */
3965 : 0, /* tp_iter */
3966 : 0, /* tp_iternext */
3967 : scanner_methods, /* tp_methods */
3968 : scanner_members, /* tp_members */
3969 : 0, /* tp_getset */
3970 : };
3971 :
3972 : static PyObject*
3973 753 : pattern_scanner(PatternObject* pattern, PyObject* args)
3974 : {
3975 : /* create search state object */
3976 :
3977 : ScannerObject* self;
3978 :
3979 : PyObject* string;
3980 753 : Py_ssize_t start = 0;
3981 753 : Py_ssize_t end = PY_SSIZE_T_MAX;
3982 753 : if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3983 0 : return NULL;
3984 :
3985 : /* create scanner object */
3986 753 : self = PyObject_NEW(ScannerObject, &Scanner_Type);
3987 753 : if (!self)
3988 0 : return NULL;
3989 753 : self->pattern = NULL;
3990 :
3991 753 : string = state_init(&self->state, pattern, string, start, end);
3992 753 : if (!string) {
3993 0 : Py_DECREF(self);
3994 0 : return NULL;
3995 : }
3996 :
3997 753 : Py_INCREF(pattern);
3998 753 : self->pattern = (PyObject*) pattern;
3999 :
4000 753 : return (PyObject*) self;
4001 : }
4002 :
4003 : static PyMethodDef _functions[] = {
4004 : {"compile", _compile, METH_VARARGS},
4005 : {"getcodesize", sre_codesize, METH_NOARGS},
4006 : {"getlower", sre_getlower, METH_VARARGS},
4007 : {NULL, NULL}
4008 : };
4009 :
4010 : #if PY_VERSION_HEX < 0x02030000
4011 : DL_EXPORT(void) init_sre(void)
4012 : #else
4013 3 : PyMODINIT_FUNC init_sre(void)
4014 : #endif
4015 : {
4016 : PyObject* m;
4017 : PyObject* d;
4018 : PyObject* x;
4019 :
4020 : /* Patch object types */
4021 6 : if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
4022 3 : PyType_Ready(&Scanner_Type))
4023 0 : return;
4024 :
4025 3 : m = Py_InitModule("_" SRE_MODULE, _functions);
4026 3 : if (m == NULL)
4027 0 : return;
4028 3 : d = PyModule_GetDict(m);
4029 :
4030 3 : x = PyInt_FromLong(SRE_MAGIC);
4031 3 : if (x) {
4032 3 : PyDict_SetItemString(d, "MAGIC", x);
4033 3 : Py_DECREF(x);
4034 : }
4035 :
4036 3 : x = PyInt_FromLong(sizeof(SRE_CODE));
4037 3 : if (x) {
4038 3 : PyDict_SetItemString(d, "CODESIZE", x);
4039 3 : Py_DECREF(x);
4040 : }
4041 :
4042 3 : x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
4043 3 : if (x) {
4044 3 : PyDict_SetItemString(d, "MAXREPEAT", x);
4045 3 : Py_DECREF(x);
4046 : }
4047 :
4048 3 : x = PyString_FromString(copyright);
4049 3 : if (x) {
4050 3 : PyDict_SetItemString(d, "copyright", x);
4051 3 : Py_DECREF(x);
4052 : }
4053 : }
4054 :
4055 : #endif /* !defined(SRE_RECURSIVE) */
4056 :
4057 : /* vim:ts=4:sw=4:et
4058 : */
|