1 #!/usr/bin/env python2
2 """Lex_gen.py."""
3 from __future__ import print_function
4
5 from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
6 TEST_OTHER_LOOKUP)
7 from _devbuild.gen.id_kind_asdl import Id_str
8 from _devbuild.gen.types_asdl import lex_mode_str
9
10 import cStringIO
11 import sys
12 import sre_parse
13 import sre_constants
14
15 from frontend import lexer_def
16
17
18 def PrintTree(re_tree, depth=2):
19 """
20 re_tree: List of children
21 """
22 for child in re_tree:
23 name, arg = child
24 sys.stdout.write(depth * '\t')
25 sys.stdout.write(name)
26 sys.stdout.write(' ')
27 if name == 'in': # character class
28 print('{')
29 PrintTree(arg, depth=depth + 1)
30 sys.stdout.write(depth * '\t')
31 print('}')
32 elif name == 'max_repeat': # repetition
33 min_, max_, children = arg
34 # min = 0 means *, min = 1 means +
35 assert min_ in (0, 1), min_
36 print(min_, max_, '{')
37 PrintTree(children, depth=depth + 1)
38 sys.stdout.write(depth * '\t')
39 print()
40 elif name == 'negate': # Oh this is a ^. It doesn't form a node.
41 assert arg is None
42 print()
43 elif name == 'literal': # Quote \ and " in re2c syntax
44 print(repr(chr(arg)))
45 elif name == 'not_literal': # ditto
46 print(repr(chr(arg)))
47 elif name == 'range': # ascii range
48 begin, end = arg
49 print(repr(chr(begin)), repr(chr(end)))
50 elif name == 'any': # This is the '.' character
51 assert arg is None
52 print()
53 else:
54 raise AssertionError(name)
55
56 # NOTE: negate and not_literal are sort of duplicated
57
58
59 def PrintRegex(pat):
60 re_tree = sre_parse.parse(pat)
61 print('\t\t[')
62 PrintTree(re_tree)
63 print('\t\t]')
64
65
66 # re2c literals are inside double quotes, so we don't need to do anything with
67 # ^ or whatever.
68 LITERAL_META = ['\\', '"']
69 LITERAL_META_CODES = [ord(c) for c in LITERAL_META]
70
71
72 def _Literal(arg, char_escapes=LITERAL_META_CODES):
73 if arg == 0:
74 s = r'\x00' # "\000"
75 elif arg == ord('\n'):
76 s = r'\n'
77 elif arg == ord('\r'):
78 s = r'\r'
79 elif arg == ord('\t'):
80 s = r'\t'
81 elif arg in char_escapes:
82 s = '\\' + chr(arg)
83 elif arg >= 0x80:
84 s = '\\x%02x' % arg # for \x80-\xff
85 else:
86 s = chr(arg)
87 return s
88
89
90 # - means range. Note that re2c gives an error we uselessly escape \^.
91 CHAR_CLASS_META = ['\\', '-', ']']
92 CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]
93
94
95 def _CharClassLiteral(arg):
96 return _Literal(arg, char_escapes=CHAR_CLASS_META_CODES)
97
98
99 def TranslateConstant(pat):
100 return '"' + ''.join(_Literal(ord(c)) for c in pat) + '"'
101
102
103 def TranslateTree(re_tree, f, in_char_class=False):
104 """
105 re_tree: List of children
106 """
107 for child in re_tree:
108 name, arg = child
109 if name == 'in': # character class
110 f.write('[')
111 TranslateTree(arg, f, in_char_class=True) # list of literals/ranges
112 f.write(']')
113
114 elif name == 'max_repeat': # repetition
115 min_, max_, children = arg
116 # min = 0 means *, min = 1 means +
117 #assert min_ in (0, 1), min_
118 TranslateTree(children, f)
119
120 if min_ == 0 and max_ == 1:
121 f.write('? ')
122
123 elif max_ == sre_constants.MAXREPEAT:
124 if min_ == 0:
125 f.write('* ')
126 elif min_ == 1:
127 f.write('+ ')
128 else:
129 assert 0, min_
130
131 else: # re2c also supports [0-7]{1,2} syntax
132 # note: might generated {2,2}
133 f.write('{%d,%d} ' % (min_, max_))
134
135 elif name == 'negate': # ^ in [^a-z]
136 assert arg is None
137 f.write('^')
138
139 elif name == 'literal': # Quote \ and " in re2c syntax
140 # TODO: it matters if we're inside a character class
141 #print("literal ARG %r" % arg)
142
143 if in_char_class:
144 s = _CharClassLiteral(arg)
145 else:
146 s = '"%s" ' % _Literal(arg)
147 f.write(s)
148
149 elif name == 'not_literal': # ditto
150 assert not in_char_class
151 f.write('[^%s]' % _CharClassLiteral(arg))
152
153 elif name == 'range': # ascii range
154 begin, end = arg
155 f.write('%s-%s' %
156 (_CharClassLiteral(begin), _CharClassLiteral(end)))
157
158 elif name == 'any': # This is the '.' character
159 assert arg is None
160 f.write('.')
161
162 elif name == 'subpattern':
163 _, children = arg # Not sure what the _ is, but it works
164 f.write('(')
165 TranslateTree(children, f)
166 f.write(')')
167
168 else:
169 if 0:
170 from mycpp.mylib import log
171 log('child %s', child)
172 raise RuntimeError("I don't understand regex construct: %r" % name)
173
174 # NOTE: negate and not_literal are sort of duplicated
175
176
177 def TranslateRegex(pat):
178 re_tree = sre_parse.parse(pat)
179 # For debugging
180 #import pprint
181 #print(pprint.pformat(re_tree), file=sys.stderr)
182 f = cStringIO.StringIO()
183 try:
184 TranslateTree(re_tree, f)
185 except RuntimeError:
186 print('Error translating %r' % pat, file=sys.stderr)
187 raise
188 return f.getvalue()
189
190
191 # This explains the sentinel method, which we will use.
192 # http://re2c.org/examples/example_01.html
193 #
194 # TODO: Change ParseTuple to use 's' rather than '#s' ?
195
196 # I don't think we need this YYFILL mechanism, because we lex a line at a
197 # time.
198 # http://re2c.org/examples/example_03.html
199
200
201 def TranslateSimpleLexer(func_name, lexer_def):
202 print(r"""
203 static inline void %s(const unsigned char* line, int line_len,
204 int start_pos, int* id, int* end_pos) {
205 assert(start_pos <= line_len); /* caller should have checked */
206
207 const unsigned char* p = line + start_pos; /* modified by re2c */
208
209 /* Echo and History lexer apparently need this, but others don't */
210 __attribute__((unused)) const unsigned char* YYMARKER;
211
212 for (;;) {
213 /*!re2c
214 """ % func_name)
215
216 for is_regex, pat, id_ in lexer_def:
217 if is_regex:
218 re2c_pat = TranslateRegex(pat)
219 else:
220 re2c_pat = TranslateConstant(pat)
221 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
222 print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
223
224 # EARLY RETURN: Do NOT advance past the NUL terminator.
225 print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
226 r'"\x00"')
227
228 print("""
229 */
230 }
231 *end_pos = p - line; /* relative */
232 }
233 """)
234
235
236 def TranslateBracket(func_name, token_dict):
237 print(r"""
238 static inline int %s(const unsigned char* s, int len) {
239 const unsigned char* p = s; /* modified by re2c */
240 const unsigned char* end = s + len;
241
242 __attribute__((unused)) const unsigned char* YYMARKER;
243 int id;
244
245 for (;;) {
246 /*!re2c
247 """ % func_name)
248
249 for pat in sorted(token_dict):
250 id_ = token_dict[pat]
251 re2c_pat = TranslateConstant(pat)
252 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
253 print(' %-30s { id = id__%s; break; }' % (re2c_pat, id_name))
254
255 # EARLY RETURN: Do NOT advance past other chars, including the NUL
256 # terminator.
257 print(' %-30s { return id__Undefined_Tok; }' % '*')
258
259 print("""
260 */
261 }
262 // must be an exact match
263 return (p == end) ? id : id__Undefined_Tok;
264 }
265 """)
266
267
268 def StringToInt(func_name, name_def):
269 print(r"""
270 static inline void %s(const unsigned char* s, int len, int* id) {
271 const unsigned char* p = s; /* modified by re2c */
272 const unsigned char* end = s + len;
273
274 //fprintf(stderr, "*** s = %%s\n", s);
275
276 for (;;) {
277 /*!re2c
278 """ % func_name)
279
280 for name, enum in name_def:
281 re2c_pat = TranslateConstant(name)
282 print(' %-30s { *id = %s; break; }' % (re2c_pat, enum))
283
284 # Not found. * matches anything else.
285 print(' %-30s { *id = 0; return; }' % \
286 r'*')
287
288 print(r"""
289 */
290 }
291 if (p != end) {
292 //fprintf(stderr, "EXTRA CHARS\n", s);
293 *id = 0; // Not an exact match
294 }
295 }
296 """)
297
298 # TODO: Check that we're at the END OF THE STRING
299
300
301 def TranslateOshLexer(lexer_def):
302 # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
303 # Has to be 'static inline' rather than 'inline', otherwise the
304 # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
305 # Since we reference this function in exactly one translation unit --
306 # fastlex.c, the difference is moot, and we just satisfy the compiler.
307
308 print(r"""
309 /* Common stuff */
310
311 /*!re2c
312 re2c:define:YYCTYPE = "unsigned char";
313 re2c:define:YYCURSOR = p;
314 re2c:yyfill:enable = 0; // generated code doesn't ask for more input
315 */
316
317 static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
318 int start_pos, int* id, int* end_pos) {
319 assert(start_pos <= line_len); /* caller should have checked */
320
321 const unsigned char* p = line + start_pos; /* modified by re2c */
322 //printf("p: %p q: %p\n", p, q);
323
324 __attribute__((unused)) const unsigned char* YYMARKER; /* why do we need this? */
325 switch (lex_mode) {
326 """)
327
328 # TODO: Should be ordered by most common? Or will profile-directed feedback
329 # help?
330 for state, pat_list in lexer_def.iteritems():
331 # e.g. lex_mode.DQ => lex_mode__DQ
332 print(' case %s:' % lex_mode_str(state).replace('.', '__'))
333 print(' for (;;) {')
334 print(' /*!re2c')
335
336 for is_regex, pat, id_ in pat_list:
337 if is_regex:
338 re2c_pat = TranslateRegex(pat)
339 else:
340 re2c_pat = TranslateConstant(pat)
341 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
342 print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
343
344 # EARLY RETURN: Do NOT advance past the NUL terminator.
345 print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
346 r'"\x00"')
347
348 print(' */')
349 print(' }')
350 print(' break;')
351 print()
352
353 # This is literal code without generation:
354 """
355 case lex_mode__OUTER:
356 for (;;) {
357 /*!re2c
358 literal_chunk = [a-zA-Z0-9_/.-]+;
359 var_like = [a-zA-Z_][a-zA-Z0-9_]* "="; // might be NAME=val
360 comment = [ \t\r]* "#" [^\000\r\n]*;
361 space = [ \t\r]+;
362 nul = "\000";
363
364 literal_chunk { *id = id__Lit_Chars; break; }
365 var_like { *id = id__Lit_VarLike; break; }
366
367 [ \t\r]* "\n" { *id = id__Op_Newline; break; }
368 space { *id = id__WS_Space; break; }
369
370 nul { *id = id__Eof_Real; break; }
371
372 // anything else
373 * { *id = id__Lit_Other; break; }
374
375 */
376 }
377
378 *end_pos = p - line;
379 break;
380
381 case lex_mode__COMMENT:
382 *id = id__Lit_Other;
383 *end_pos = 6;
384 break;
385 """
386
387 print("""\
388 default:
389 assert(0);
390
391 }
392 *end_pos = p - line; /* relative */
393 }
394 """)
395
396
397 def TranslateRegexToPredicate(py_regex, func_name):
398 re2c_pat = TranslateRegex(py_regex)
399 print(r"""
400 static inline int %s(const unsigned char* s, int len) {
401 const unsigned char* p = s; /* modified by re2c */
402 const unsigned char* end = s + len;
403
404 /* MatchBraceRangeToken needs this, but others don't */
405 __attribute__((unused)) const unsigned char* YYMARKER;
406
407 /*!re2c
408 re2c:define:YYCTYPE = "unsigned char";
409 re2c:define:YYCURSOR = p;
410 %-30s { return p == end; } // Match must be anchored right, like $
411 * { return 0; }
412 */
413 }
414 """ % (func_name, re2c_pat))
415
416
417 # note: use YYCURSOR and YYLIMIT
418 # limit should be the end of string
419 # line + line_len
420 def main(argv):
421 action = argv[1]
422 if action == 'c':
423 # Code is printed to stdout
424
425 TranslateOshLexer(lexer_def.LEXER_DEF)
426
427 TranslateSimpleLexer('MatchEchoToken', lexer_def.ECHO_E_DEF)
428 TranslateSimpleLexer('MatchGlobToken', lexer_def.GLOB_DEF)
429 TranslateSimpleLexer('MatchPS1Token', lexer_def.PS1_DEF)
430 TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
431 TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
432 #TranslateSimpleLexer('MatchQsnToken', lexer_def.QSN_DEF)
433
434 TranslateRegexToPredicate(lexer_def.VAR_NAME_RE, 'IsValidVarName')
435 TranslateRegexToPredicate(lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack')
436 TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_INTEGER,
437 'LooksLikeInteger')
438 TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_FLOAT, 'LooksLikeFloat')
439
440 TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
441 TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
442 TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)
443
444 elif action == 'print-all':
445 # Top level is a switch statement.
446 for state, pat_list in lexer_def.LEXER_DEF.iteritems():
447 print(state)
448 # This level is re2c patterns.
449 for is_regex, pat, token_id in pat_list:
450 print('\t%r -> %r' % (pat, token_id))
451 if is_regex:
452 #print re_tree
453 _ = TranslateRegex(pat)
454 #print out_pat
455
456 print()
457
458 elif action == 'print-regex':
459 unique = set()
460
461 num_regexes = 0
462 for state, pat_list in lexer_def.LEXER_DEF.iteritems():
463 print(state)
464 # This level is re2c patterns.
465 for is_regex, pat, token_id in pat_list:
466 #print '\t%r -> %r' % (pat, token_id)
467 if is_regex:
468 print('\t' + pat)
469 print('\t' + TranslateRegex(pat))
470 print()
471 #PrintRegex(pat)
472 num_regexes += 1
473 unique.add(pat)
474 else:
475 print('\t' + TranslateConstant(pat))
476
477 print()
478
479 print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))
480
481
482 if __name__ == '__main__':
483 try:
484 main(sys.argv)
485 except RuntimeError as e:
486 print('FATAL: %s' % e, file=sys.stderr)
487 sys.exit(1)