oils/frontend/lexer_gen.py
1 |
#!/usr/bin/env python2
|
2 |
"""Lex_gen.py."""
|
3 |
from __future__ import print_function
|
4 |
|
5 |
from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
|
6 |
TEST_OTHER_LOOKUP)
|
7 |
from _devbuild.gen.id_kind_asdl import Id_str
|
8 |
from _devbuild.gen.types_asdl import lex_mode_str
|
9 |
|
10 |
import cStringIO
|
11 |
import sys
|
12 |
import sre_parse
|
13 |
import sre_constants
|
14 |
|
15 |
from frontend import lexer_def
|
16 |
|
17 |
|
18 |
def PrintTree(re_tree, depth=2):
    """Dump a parsed regex tree to stdout, for debugging.

    re_tree: list of (op, operand) nodes, as produced by sre_parse.parse()
    depth: number of tabs to indent each node at this level
    """
    out = sys.stdout.write
    for op, operand in re_tree:
        out(depth * '\t')
        out(op)
        out(' ')
        if op == 'in':  # character class: recurse into its members
            print('{')
            PrintTree(operand, depth=depth + 1)
            out(depth * '\t')
            print('}')
        elif op == 'max_repeat':  # repetition
            lo, hi, kids = operand
            # lo == 0 means *, lo == 1 means +
            assert lo in (0, 1), lo
            print(lo, hi, '{')
            PrintTree(kids, depth=depth + 1)
            out(depth * '\t')
            print()
        elif op == 'negate':  # the ^ in [^...]; carries no operand
            assert operand is None
            print()
        elif op == 'literal':  # single character code; quote \ and "
            print(repr(chr(operand)))
        elif op == 'not_literal':  # ditto
            print(repr(chr(operand)))
        elif op == 'range':  # ASCII range, e.g. a-z in a class
            lo_ch, hi_ch = operand
            print(repr(chr(lo_ch)), repr(chr(hi_ch)))
        elif op == 'any':  # the '.' metacharacter
            assert operand is None
            print()
        else:
            raise AssertionError(op)
|
55 |
|
56 |
# NOTE: negate and not_literal are sort of duplicated
|
57 |
|
58 |
|
59 |
def PrintRegex(pat):
    """Parse the Python regex `pat` and dump its tree, bracketed for
    readability."""
    print('\t\t[')
    PrintTree(sre_parse.parse(pat))
    print('\t\t]')
|
64 |
|
65 |
|
66 |
# Inside re2c double-quoted literals, only the backslash and the quote itself
# need escaping (no ^ etc.).
LITERAL_META = ['\\', '"']
LITERAL_META_CODES = [ord(c) for c in LITERAL_META]

# Fixed spellings for control characters that can't appear verbatim.
_CONTROL_CHAR_ESCAPES = {
    0: r'\x00',  # "\000"
    ord('\n'): r'\n',
    ord('\r'): r'\r',
    ord('\t'): r'\t',
}


def _Literal(arg, char_escapes=LITERAL_META_CODES):
    """Translate the character code `arg` into re2c literal syntax.

    char_escapes: character codes that must be backslash-escaped in the
      current context (double-quoted literal vs. character class).
    """
    if arg in _CONTROL_CHAR_ESCAPES:
        return _CONTROL_CHAR_ESCAPES[arg]
    if arg in char_escapes:
        return '\\' + chr(arg)
    if arg >= 0x80:
        return '\\x%02x' % arg  # for \x80-\xff
    return chr(arg)
|
88 |
|
89 |
|
90 |
# Inside a re2c [...] character class, '-' means range, so it joins backslash
# and the closing bracket in the escape set.  (Note: re2c gives an error if we
# uselessly escape \^.)
CHAR_CLASS_META = ['\\', '-', ']']
CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]


def _CharClassLiteral(arg):
    """Like _Literal(), but with the escape set for character classes."""
    return _Literal(arg, CHAR_CLASS_META_CODES)
|
97 |
|
98 |
|
99 |
def TranslateConstant(pat):
    """Translate a fixed string into a double-quoted re2c literal."""
    escaped = [_Literal(ord(c)) for c in pat]
    return '"' + ''.join(escaped) + '"'
|
101 |
|
102 |
|
103 |
def TranslateTree(re_tree, f, in_char_class=False):
    """Write the re2c pattern for a parsed regex tree to file object `f`.

    re_tree: list of (op, operand) nodes from sre_parse.parse()
    in_char_class: True while emitting the body of a [...] class, which
      changes how literals are escaped.

    Raises RuntimeError on regex constructs we don't translate.
    """
    for node in re_tree:
        op, operand = node

        if op == 'in':  # [...] character class
            f.write('[')
            # operand is the list of literals/ranges/negate inside the class
            TranslateTree(operand, f, in_char_class=True)
            f.write(']')

        elif op == 'max_repeat':  # repetition: emit children, then a suffix
            lo, hi, kids = operand
            TranslateTree(kids, f)

            # lo == 0 means *, lo == 1 means +
            if lo == 0 and hi == 1:
                suffix = '? '
            elif hi == sre_constants.MAXREPEAT:
                if lo == 0:
                    suffix = '* '
                elif lo == 1:
                    suffix = '+ '
                else:
                    assert 0, lo
            else:
                # re2c also supports [0-7]{1,2} syntax; this may emit {2,2}
                suffix = '{%d,%d} ' % (lo, hi)
            f.write(suffix)

        elif op == 'negate':  # the ^ in [^a-z]; carries no operand
            assert operand is None
            f.write('^')

        elif op == 'literal':
            # Escaping differs inside vs. outside a character class.
            if in_char_class:
                f.write(_CharClassLiteral(operand))
            else:
                f.write('"%s" ' % _Literal(operand))

        elif op == 'not_literal':  # e.g. [^x]
            assert not in_char_class
            f.write('[^%s]' % _CharClassLiteral(operand))

        elif op == 'range':  # ASCII range inside a class
            lo_ch, hi_ch = operand
            f.write('%s-%s' %
                    (_CharClassLiteral(lo_ch), _CharClassLiteral(hi_ch)))

        elif op == 'any':  # the '.' metacharacter
            assert operand is None
            f.write('.')

        elif op == 'subpattern':  # (...) group
            _, kids = operand  # not sure what the first element is
            f.write('(')
            TranslateTree(kids, f)
            f.write(')')

        else:
            raise RuntimeError("I don't understand regex construct: %r" % op)

    # NOTE: negate and not_literal are sort of duplicated
|
173 |
|
174 |
# NOTE: negate and not_literal are sort of duplicated
|
175 |
|
176 |
|
177 |
def TranslateRegex(pat):
    """Translate the Python regex `pat` into a re2c pattern string.

    Prints a note to stderr and re-raises if the regex uses a construct
    that TranslateTree() doesn't support.
    """
    tree = sre_parse.parse(pat)
    # Debugging aid: pprint.pformat(tree) shows the parsed structure.
    buf = cStringIO.StringIO()
    try:
        TranslateTree(tree, buf)
    except RuntimeError:
        print('Error translating %r' % pat, file=sys.stderr)
        raise
    return buf.getvalue()
|
189 |
|
190 |
|
191 |
# This explains the sentinel method, which we will use.
|
192 |
# http://re2c.org/examples/example_01.html
|
193 |
#
|
194 |
# TODO: Change ParseTuple to use 's' rather than '#s' ?
|
195 |
|
196 |
# I don't think we need this YYFILL mechanism, because we lex a line at a
|
197 |
# time.
|
198 |
# http://re2c.org/examples/example_03.html
|
199 |
|
200 |
|
201 |
def TranslateSimpleLexer(func_name, lexer_def):
    """Print a C function (named func_name) for a single-mode lexer.

    lexer_def: list of (is_regex, pattern, token_id) rules.

    The generated function matches one token starting at start_pos, storing
    the token id in *id and the end position in *end_pos.  It returns early
    at the NUL terminator without advancing past it.
    """
    print(r"""
static inline void %s(const unsigned char* line, int line_len,
                      int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */

  /* Echo and History lexer apparently need this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  for (;;) {
    /*!re2c
""" % func_name)

    for is_regex, pat, id_ in lexer_def:
        if is_regex:
            re2c_pat = TranslateRegex(pat)
        else:
            re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past the NUL terminator.
    print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
        r'"\x00"')

    print("""
    */
  }
  *end_pos = p - line;  /* relative */
}
""")
|
234 |
|
235 |
|
236 |
def TranslateBracket(func_name, token_dict):
    """Print a C function (named func_name) that maps a string to a token id.

    token_dict: operator spelling -> token id (e.g. [[ test operators).

    The whole input must match exactly; otherwise the generated function
    returns id__Undefined_Tok.
    """
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  __attribute__((unused)) const unsigned char* YYMARKER;
  int id;

  for (;;) {
    /*!re2c
""" % func_name)

    # sorted() for deterministic output across runs
    for pat in sorted(token_dict):
        id_ = token_dict[pat]
        re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past other chars, including the NUL
    # terminator.
    print('      %-30s { return id__Undefined_Tok; }' % '*')

    print("""
    */
  }
  // must be an exact match
  return (p == end) ? id : id__Undefined_Tok;
}
""")
|
266 |
|
267 |
|
268 |
def StringToInt(func_name, name_def):
    """Print a C function (named func_name) that maps a string to an integer.

    name_def: list of (name, enum_value) pairs.

    *id is set to the matching enum value, or 0 when the input is not an
    exact match for any name.
    """
    print(r"""
static inline void %s(const unsigned char* s, int len, int* id) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  //fprintf(stderr, "*** s = %%s\n", s);

  for (;;) {
    /*!re2c
""" % func_name)

    for name, enum in name_def:
        re2c_pat = TranslateConstant(name)
        print('      %-30s { *id = %s; break; }' % (re2c_pat, enum))

    # Not found.  * matches anything else.
    print('      %-30s { *id = 0; return; }' % \
        r'*')

    print(r"""
    */
  }
  if (p != end) {
    //fprintf(stderr, "EXTRA CHARS\n", s);
    *id = 0;  // Not an exact match
  }
}
""")
|
297 |
|
298 |
# TODO: Check that we're at the END OF THE STRING
|
299 |
|
300 |
|
301 |
def TranslateOshLexer(lexer_def):
    """Print MatchOshToken(): a C switch over lex modes, plus shared re2c
    configuration.

    lexer_def: dict of lex_mode -> list of (is_regex, pattern, token_id).
    """
    # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
    # Has to be 'static inline' rather than 'inline', otherwise the
    # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
    # Since we reference this function in exactly one translation unit --
    # fastlex.c, the difference is moot, and we just satisfy the compiler.

    print(r"""
/* Common stuff */

/*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  re2c:yyfill:enable = 0;  // generated code doesn't ask for more input
*/

static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
                                 int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */
  //printf("p: %p q: %p\n", p, q);

  __attribute__((unused)) const unsigned char* YYMARKER;  /* why do we need this? */
  switch (lex_mode)  {
""")

    # TODO: Should be ordered by most common?  Or will profile-directed feedback
    # help?
    for state, pat_list in lexer_def.iteritems():
        # e.g. lex_mode.DQ => lex_mode__DQ
        print('  case %s:' % lex_mode_str(state).replace('.', '__'))
        print('    for (;;) {')
        print('      /*!re2c')

        for is_regex, pat, id_ in pat_list:
            if is_regex:
                re2c_pat = TranslateRegex(pat)
            else:
                re2c_pat = TranslateConstant(pat)
            id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
            print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

        # EARLY RETURN: Do NOT advance past the NUL terminator.
        print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
            r'"\x00"')

        print('      */')
        print('    }')
        print('    break;')
        print()

    # This is literal code without generation:
    """
  case lex_mode__OUTER:
    for (;;) {
      /*!re2c
      literal_chunk = [a-zA-Z0-9_/.-]+;
      var_like    = [a-zA-Z_][a-zA-Z0-9_]* "=";  // might be NAME=val
      comment     = [ \t\r]* "#" [^\000\r\n]*;
      space       = [ \t\r]+;
      nul = "\000";

      literal_chunk { *id = id__Lit_Chars; break; }
      var_like      { *id = id__Lit_VarLike; break; }

      [ \t\r]* "\n" { *id = id__Op_Newline; break; }
      space         { *id = id__WS_Space; break; }

      nul           { *id = id__Eof_Real; break; }

      // anything else
      *             { *id = id__Lit_Other; break; }

      */
    }

    *end_pos = p - line;
    break;

  case lex_mode__COMMENT:
    *id = id__Lit_Other;
    *end_pos = 6;
    break;
  """

    print("""\
  default:
    assert(0);

  }
  *end_pos = p - line;  /* relative */
}
""")
|
395 |
|
396 |
|
397 |
def TranslateRegexToPredicate(py_regex, func_name):
    """Print a C predicate (named func_name) testing if a whole string matches.

    The match is anchored on both ends: re2c anchors the left, and the
    `p == end` check anchors the right (like $).
    """
    re2c_pat = TranslateRegex(py_regex)
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  /* MatchBraceRangeToken needs this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  /*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  %-30s { return p == end; }  // Match must be anchored right, like $
  *     { return 0; }
  */
}
""" % (func_name, re2c_pat))
|
415 |
|
416 |
|
417 |
# note: use YYCURSOR and YYLIMIT
|
418 |
# limit should be the end of string
|
419 |
# line + line_len
|
420 |
def main(argv):
    """Entry point: argv[1] selects the action.

    Actions:
      c            -- print the generated C code to stdout
      print-all    -- print every lexer rule (and check regexes translate)
      print-regex  -- print each regex and its re2c translation

    Raises RuntimeError on a missing or unknown action, which the __main__
    guard turns into a clean FATAL exit.
    """
    # Defect fix: argv[1] used to raise a raw IndexError when no action was
    # given.
    if len(argv) < 2:
        raise RuntimeError('Action required (c, print-all, print-regex)')
    action = argv[1]

    if action == 'c':
        # Code is printed to stdout

        TranslateOshLexer(lexer_def.LEXER_DEF)

        TranslateSimpleLexer('MatchEchoToken', lexer_def.ECHO_E_DEF)
        TranslateSimpleLexer('MatchGlobToken', lexer_def.GLOB_DEF)
        TranslateSimpleLexer('MatchPS1Token', lexer_def.PS1_DEF)
        TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
        TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
        #TranslateSimpleLexer('MatchQsnToken', lexer_def.QSN_DEF)

        TranslateRegexToPredicate(lexer_def.VAR_NAME_RE, 'IsValidVarName')
        TranslateRegexToPredicate(lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack')
        TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_INTEGER,
                                  'LooksLikeInteger')
        TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_FLOAT, 'LooksLikeFloat')

        TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
        TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
        TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)

    elif action == 'print-all':
        # Top level is a switch statement.
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                print('\t%r -> %r' % (pat, token_id))
                if is_regex:
                    # Translate for the side effect of validating the regex.
                    _ = TranslateRegex(pat)

            print()

    elif action == 'print-regex':
        unique = set()

        num_regexes = 0
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                if is_regex:
                    print('\t' + pat)
                    print('\t' + TranslateRegex(pat))
                    print()
                    num_regexes += 1
                    unique.add(pat)
                else:
                    print('\t' + TranslateConstant(pat))

            print()

        print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))

    else:
        # Defect fix: an unknown action used to silently do nothing and
        # exit 0.
        raise RuntimeError('Invalid action %r' % action)
|
480 |
|
481 |
|
482 |
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        # Tool-level errors (e.g. an untranslatable regex) exit with status 1
        # and a message, not a traceback.
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)
|