oils/frontend/lexer_gen.py
1 |
#!/usr/bin/env python2
|
2 |
"""Lex_gen.py."""
|
3 |
from __future__ import print_function
|
4 |
|
5 |
from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
|
6 |
TEST_OTHER_LOOKUP)
|
7 |
from _devbuild.gen.id_kind_asdl import Id_str
|
8 |
from _devbuild.gen.types_asdl import lex_mode_str
|
9 |
|
10 |
import cStringIO
|
11 |
import sys
|
12 |
import sre_parse
|
13 |
import sre_constants
|
14 |
|
15 |
from frontend import lexer_def
|
16 |
|
17 |
|
18 |
def PrintTree(re_tree, depth=2):
    """Dump a parsed regex tree to stdout, for debugging.

    re_tree: list of (op, operand) nodes, as produced by sre_parse.parse()
    depth: number of tabs to indent each node at this level
    """
    out = sys.stdout.write
    for op, operand in re_tree:
        out(depth * '\t')
        out(op)
        out(' ')
        if op == 'in':  # character class: recurse into its members
            print('{')
            PrintTree(operand, depth=depth + 1)
            out(depth * '\t')
            print('}')
        elif op == 'max_repeat':  # repetition
            lo, hi, kids = operand
            # lo == 0 means *, lo == 1 means +
            assert lo in (0, 1), lo
            print(lo, hi, '{')
            PrintTree(kids, depth=depth + 1)
            out(depth * '\t')
            print()
        elif op == 'negate':  # the ^ in [^...]; carries no operand
            assert operand is None
            print()
        elif op == 'literal':  # single character code; quote \ and "
            print(repr(chr(operand)))
        elif op == 'not_literal':  # ditto
            print(repr(chr(operand)))
        elif op == 'range':  # ASCII range, e.g. a-z in a class
            lo_ch, hi_ch = operand
            print(repr(chr(lo_ch)), repr(chr(hi_ch)))
        elif op == 'any':  # the '.' metacharacter
            assert operand is None
            print()
        else:
            raise AssertionError(op)
|
55 |
|
56 |
# NOTE: negate and not_literal are sort of duplicated
|
57 |
|
58 |
|
59 |
def PrintRegex(pat):
    """Parse the Python regex `pat` and dump its tree, bracketed for
    readability."""
    print('\t\t[')
    PrintTree(sre_parse.parse(pat))
    print('\t\t]')
|
64 |
|
65 |
|
66 |
# Inside re2c double-quoted literals, only the backslash and the quote itself
# need escaping (no ^ etc.).
LITERAL_META = ['\\', '"']
LITERAL_META_CODES = [ord(c) for c in LITERAL_META]

# Fixed spellings for control characters that can't appear verbatim.
_CONTROL_CHAR_ESCAPES = {
    0: r'\x00',  # "\000"
    ord('\n'): r'\n',
    ord('\r'): r'\r',
    ord('\t'): r'\t',
}


def _Literal(arg, char_escapes=LITERAL_META_CODES):
    """Translate the character code `arg` into re2c literal syntax.

    char_escapes: character codes that must be backslash-escaped in the
      current context (double-quoted literal vs. character class).
    """
    if arg in _CONTROL_CHAR_ESCAPES:
        return _CONTROL_CHAR_ESCAPES[arg]
    if arg in char_escapes:
        return '\\' + chr(arg)
    if arg >= 0x80:
        return '\\x%02x' % arg  # for \x80-\xff
    return chr(arg)
|
88 |
|
89 |
|
90 |
# Inside a re2c [...] character class, '-' means range, so it joins backslash
# and the closing bracket in the escape set.  (Note: re2c gives an error if we
# uselessly escape \^.)
CHAR_CLASS_META = ['\\', '-', ']']
CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]


def _CharClassLiteral(arg):
    """Like _Literal(), but with the escape set for character classes."""
    return _Literal(arg, CHAR_CLASS_META_CODES)
|
97 |
|
98 |
|
99 |
def TranslateConstant(pat):
    """Translate a fixed string into a double-quoted re2c literal."""
    escaped = [_Literal(ord(c)) for c in pat]
    return '"' + ''.join(escaped) + '"'
|
101 |
|
102 |
|
103 |
def TranslateTree(re_tree, f, in_char_class=False):
    """Write the re2c pattern for a parsed regex tree to file object `f`.

    re_tree: list of (op, operand) nodes from sre_parse.parse()
    in_char_class: True while emitting the body of a [...] class, which
      changes how literals are escaped.

    Raises RuntimeError on regex constructs we don't translate.
    """
    for node in re_tree:
        op, operand = node

        if op == 'in':  # [...] character class
            f.write('[')
            # operand is the list of literals/ranges/negate inside the class
            TranslateTree(operand, f, in_char_class=True)
            f.write(']')

        elif op == 'max_repeat':  # repetition: emit children, then a suffix
            lo, hi, kids = operand
            TranslateTree(kids, f)

            # lo == 0 means *, lo == 1 means +
            if lo == 0 and hi == 1:
                suffix = '? '
            elif hi == sre_constants.MAXREPEAT:
                if lo == 0:
                    suffix = '* '
                elif lo == 1:
                    suffix = '+ '
                else:
                    assert 0, lo
            else:
                # re2c also supports [0-7]{1,2} syntax; this may emit {2,2}
                suffix = '{%d,%d} ' % (lo, hi)
            f.write(suffix)

        elif op == 'negate':  # the ^ in [^a-z]; carries no operand
            assert operand is None
            f.write('^')

        elif op == 'literal':
            # Escaping differs inside vs. outside a character class.
            if in_char_class:
                f.write(_CharClassLiteral(operand))
            else:
                f.write('"%s" ' % _Literal(operand))

        elif op == 'not_literal':  # e.g. [^x]
            assert not in_char_class
            f.write('[^%s]' % _CharClassLiteral(operand))

        elif op == 'range':  # ASCII range inside a class
            lo_ch, hi_ch = operand
            f.write('%s-%s' %
                    (_CharClassLiteral(lo_ch), _CharClassLiteral(hi_ch)))

        elif op == 'any':  # the '.' metacharacter
            assert operand is None
            f.write('.')

        elif op == 'subpattern':  # (...) group
            _, kids = operand  # not sure what the first element is
            f.write('(')
            TranslateTree(kids, f)
            f.write(')')

        else:
            raise RuntimeError("I don't understand regex construct: %r" % op)

    # NOTE: negate and not_literal are sort of duplicated
|
173 |
|
174 |
# NOTE: negate and not_literal are sort of duplicated
|
175 |
|
176 |
|
177 |
def TranslateRegex(pat):
    """Translate the Python regex `pat` into a re2c pattern string.

    Prints a note to stderr and re-raises if the regex uses a construct
    that TranslateTree() doesn't support.
    """
    tree = sre_parse.parse(pat)
    # Debugging aid: pprint.pformat(tree) shows the parsed structure.
    buf = cStringIO.StringIO()
    try:
        TranslateTree(tree, buf)
    except RuntimeError:
        print('Error translating %r' % pat, file=sys.stderr)
        raise
    return buf.getvalue()
|
189 |
|
190 |
|
191 |
# This explains the sentinel method, which we will use.
|
192 |
# http://re2c.org/examples/example_01.html
|
193 |
#
|
194 |
# TODO: Change ParseTuple to use 's' rather than '#s' ?
|
195 |
|
196 |
# I don't think we need this YYFILL mechanism, because we lex a line at a
|
197 |
# time.
|
198 |
# http://re2c.org/examples/example_03.html
|
199 |
|
200 |
|
201 |
def TranslateSimpleLexer(func_name, lexer_def):
    """Print a C function (named func_name) for a single-mode lexer.

    lexer_def: list of (is_regex, pattern, token_id) rules.

    The generated function matches one token starting at start_pos, storing
    the token id in *id and the end position in *end_pos.  It returns early
    at the NUL terminator without advancing past it.
    """
    print(r"""
static inline void %s(const unsigned char* line, int line_len,
                      int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */

  /* Echo and History lexer apparently need this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  for (;;) {
    /*!re2c
""" % func_name)

    for is_regex, pat, id_ in lexer_def:
        if is_regex:
            re2c_pat = TranslateRegex(pat)
        else:
            re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past the NUL terminator.
    print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
        r'"\x00"')

    print("""
    */
  }
  *end_pos = p - line;  /* relative */
}
""")
|
234 |
|
235 |
|
236 |
def TranslateBracket(func_name, token_dict):
    """Print a C function (named func_name) that maps a string to a token id.

    token_dict: operator spelling -> token id (e.g. [[ test operators).

    The whole input must match exactly; otherwise the generated function
    returns id__Undefined_Tok.
    """
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  __attribute__((unused)) const unsigned char* YYMARKER;
  int id;

  for (;;) {
    /*!re2c
""" % func_name)

    # sorted() for deterministic output across runs
    for pat in sorted(token_dict):
        id_ = token_dict[pat]
        re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past other chars, including the NUL
    # terminator.
    print('      %-30s { return id__Undefined_Tok; }' % '*')

    print("""
    */
  }
  // must be an exact match
  return (p == end) ? id : id__Undefined_Tok;
}
""")
|
266 |
|
267 |
|
268 |
def StringToInt(func_name, name_def):
    """Print a C function (named func_name) that maps a string to an integer.

    name_def: list of (name, enum_value) pairs.

    *id is set to the matching enum value, or 0 when the input is not an
    exact match for any name.
    """
    print(r"""
static inline void %s(const unsigned char* s, int len, int* id) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  //fprintf(stderr, "*** s = %%s\n", s);

  for (;;) {
    /*!re2c
""" % func_name)

    for name, enum in name_def:
        re2c_pat = TranslateConstant(name)
        print('      %-30s { *id = %s; break; }' % (re2c_pat, enum))

    # Not found.  * matches anything else.
    print('      %-30s { *id = 0; return; }' % \
        r'*')

    print(r"""
    */
  }
  if (p != end) {
    //fprintf(stderr, "EXTRA CHARS\n", s);
    *id = 0;  // Not an exact match
  }
}
""")
|
297 |
|
298 |
# TODO: Check that we're at the END OF THE STRING
|
299 |
|
300 |
|
301 |
def TranslateOshLexer(lexer_def):
    """Print MatchOshToken(): a C switch over lex modes, plus shared re2c
    configuration.

    lexer_def: dict of lex_mode -> list of (is_regex, pattern, token_id).
    """
    # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
    # Has to be 'static inline' rather than 'inline', otherwise the
    # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
    # Since we reference this function in exactly one translation unit --
    # fastlex.c, the difference is moot, and we just satisfy the compiler.

    print(r"""
/* Common stuff */

/*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  re2c:yyfill:enable = 0;  // generated code doesn't ask for more input
*/

static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
                                 int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */
  //printf("p: %p q: %p\n", p, q);

  __attribute__((unused)) const unsigned char* YYMARKER;  /* why do we need this? */
  switch (lex_mode)  {
""")

    # TODO: Should be ordered by most common?  Or will profile-directed feedback
    # help?
    for state, pat_list in lexer_def.iteritems():
        # e.g. lex_mode.DQ => lex_mode__DQ
        print('  case %s:' % lex_mode_str(state).replace('.', '__'))
        print('    for (;;) {')
        print('      /*!re2c')

        for is_regex, pat, id_ in pat_list:
            if is_regex:
                re2c_pat = TranslateRegex(pat)
            else:
                re2c_pat = TranslateConstant(pat)
            id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
            print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

        # EARLY RETURN: Do NOT advance past the NUL terminator.
        print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
            r'"\x00"')

        print('      */')
        print('    }')
        print('    break;')
        print()

    # This is literal code without generation:
    """
  case lex_mode__OUTER:
    for (;;) {
      /*!re2c
      literal_chunk = [a-zA-Z0-9_/.-]+;
      var_like    = [a-zA-Z_][a-zA-Z0-9_]* "=";  // might be NAME=val
      comment     = [ \t\r]* "#" [^\000\r\n]*;
      space       = [ \t\r]+;
      nul = "\000";

      literal_chunk { *id = id__Lit_Chars; break; }
      var_like      { *id = id__Lit_VarLike; break; }

      [ \t\r]* "\n" { *id = id__Op_Newline; break; }
      space         { *id = id__WS_Space; break; }

      nul           { *id = id__Eof_Real; break; }

      // anything else
      *             { *id = id__Lit_Other; break; }

      */
    }

    *end_pos = p - line;
    break;

  case lex_mode__COMMENT:
    *id = id__Lit_Other;
    *end_pos = 6;
    break;
  """

    print("""\
  default:
    assert(0);

  }
  *end_pos = p - line;  /* relative */
}
""")
|
395 |
|
396 |
|
397 |
def TranslateRegexToPredicate(py_regex, func_name):
    """Print a C predicate (named func_name) testing if a whole string matches.

    The match is anchored on both ends: re2c anchors the left, and the
    `p == end` check anchors the right (like $).
    """
    re2c_pat = TranslateRegex(py_regex)
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  /* MatchBraceRangeToken needs this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  /*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  %-30s { return p == end; }  // Match must be anchored right, like $
  *     { return 0; }
  */
}
""" % (func_name, re2c_pat))
|
415 |
|
416 |
|
417 |
# note: use YYCURSOR and YYLIMIT
|
418 |
# limit should be the end of string
|
419 |
# line + line_len
|
420 |
def main(argv):
    """Entry point: argv[1] selects the action.

    Actions:
      c            -- print the generated C code to stdout
      print-all    -- print every lexer rule (and check regexes translate)
      print-regex  -- print each regex and its re2c translation

    Raises RuntimeError on a missing or unknown action, which the __main__
    guard turns into a clean FATAL exit.
    """
    # Defect fix: argv[1] used to raise a raw IndexError when no action was
    # given.
    if len(argv) < 2:
        raise RuntimeError('Action required (c, print-all, print-regex)')
    action = argv[1]

    if action == 'c':
        # Code is printed to stdout

        TranslateOshLexer(lexer_def.LEXER_DEF)

        TranslateSimpleLexer('MatchEchoToken', lexer_def.ECHO_E_DEF)
        TranslateSimpleLexer('MatchGlobToken', lexer_def.GLOB_DEF)
        TranslateSimpleLexer('MatchPS1Token', lexer_def.PS1_DEF)
        TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
        TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
        #TranslateSimpleLexer('MatchQsnToken', lexer_def.QSN_DEF)

        TranslateRegexToPredicate(lexer_def.VAR_NAME_RE, 'IsValidVarName')
        TranslateRegexToPredicate(lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack')
        TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_INTEGER,
                                  'LooksLikeInteger')
        TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_FLOAT, 'LooksLikeFloat')

        TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
        TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
        TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)

    elif action == 'print-all':
        # Top level is a switch statement.
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                print('\t%r -> %r' % (pat, token_id))
                if is_regex:
                    # Translate for the side effect of validating the regex.
                    _ = TranslateRegex(pat)

            print()

    elif action == 'print-regex':
        unique = set()

        num_regexes = 0
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                if is_regex:
                    print('\t' + pat)
                    print('\t' + TranslateRegex(pat))
                    print()
                    num_regexes += 1
                    unique.add(pat)
                else:
                    print('\t' + TranslateConstant(pat))

            print()

        print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))

    else:
        # Defect fix: an unknown action used to silently do nothing and
        # exit 0.
        raise RuntimeError('Invalid action %r' % action)
|
480 |
|
481 |
|
482 |
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        # Tool-level errors (e.g. an untranslatable regex) exit with status 1
        # and a message, not a traceback.
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)
|