1 """
2 lexer_def.py -- A lexer for both OSH and YSH.
3
4 It consists of a series of lexer modes, each with a regex -> Id mapping.
5
6 After changing this file, run:
7
8 build/dev.sh all
9
10 or at least:
11
12 build/dev.sh fastlex
13
14 Input Handling
15 --------------
16
17 Every line is NUL terminated:
18
19 'one\n\0' 'last line\0'
20
21 which means that no regexes below should match \0. The core/lexer_gen.py code
22 generator adds and extra rule for \0.
23
24 For example, use [^'\0]+ instead of [^']+ .
25
26 If this rule isn't followed, we would read uninitialized memory past the
27 sentinel. Python's regex engine knows where the end of the input string is, so
28 it doesn't require need a sentinel like \0.
29 """

from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
from _devbuild.gen.types_asdl import lex_mode_e

from frontend import id_kind_def

from typing import Tuple

# Initialize spec that the lexer depends on.
ID_SPEC = id_kind_def.IdSpec({}, {})

id_kind_def.AddKinds(ID_SPEC)
id_kind_def.AddBoolKinds(ID_SPEC)  # must come second
id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})


def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star)"""
    return (False, pat, tok_type)


def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a regex string, e.g. R(r'\$[0-9]', VSub_Number)"""
    return (True, pat, tok_type)


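# A minimal sketch of how these rule tuples could be consumed, for readers of
# this file.  It's illustrative ONLY and not used by Oils: the fast matchers
# are generated by re2c, and Python fallbacks live in frontend/match.py.  The
# sketch assumes longest-match semantics with earlier rules winning ties,
# which is how re2c behaves.
def _DemoLongestMatch(rules, line, start_pos):
    """Return (id, end_pos) for the longest rule match at start_pos."""
    import re  # local import keeps this illustrative sketch self-contained

    longest = -1
    winner = Id.Unknown_Tok  # assumption: fall back to an "unknown" token
    for is_regex, pat, tok_type in rules:
        if is_regex:  # rule created with R()
            m = re.compile(pat).match(line, start_pos)
            match_len = m.end() - start_pos if m else -1
        else:  # constant rule created with C()
            match_len = len(pat) if line.startswith(pat, start_pos) else -1
        if match_len > longest:  # strict > means earlier rules win ties
            longest = match_len
            winner = tok_type
    return winner, start_pos + longest

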
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes the pattern is
# anchored, as if by $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
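# For example, it matches '#!/bin/sh\n...' as well as
# '#!/usr/bin/env bash\n...', since 'bash' also ends in 'sh' followed by
# whitespace.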

_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)

# Tilde expansion chars are Lit_Chars, but WITHOUT the /.  The NEXT token (if
# any) after this TildeLike token should start with a /.
#
# It would have been REALLY NICE to add an optional /? at the end of THIS
# token, but we can't do that because of ${x//~/replace}.  The third / is not
# part of the tilde sub!!!
_TILDE_LIKE = R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike)

_BACKSLASH = [
    # To be conservative, we could deny a set of chars similar to
    # _LITERAL_WHITELIST_REGEX, rather than allowing all the operator
    # characters like \( and \;.
    #
    # strict_backslash makes this stricter.
    R(r'\\[^\n\0]', Id.Lit_EscapedChar),
    C('\\\n', Id.Ignored_LineCont),
]

# Only 4 characters are backslash escaped inside "".
# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
_DQ_BACKSLASH = [
    R(r'\\[$`"\\]', Id.Lit_EscapedChar),
    C('\\', Id.Lit_BadBackslash),  # syntax error in YSH, but NOT in OSH
]

VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# All Kind.VSub
_VARS = [
    # Unbraced variables
    R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
    R(r'\$[0-9]', Id.VSub_Number),
    C(r'$!', Id.VSub_Bang),
    C(r'$@', Id.VSub_At),
    C(r'$#', Id.VSub_Pound),
    C(r'$$', Id.VSub_Dollar),
    C(r'$*', Id.VSub_Star),
    C(r'$-', Id.VSub_Hyphen),
    C(r'$?', Id.VSub_QMark),
]

# Kind.Left that are valid in double-quoted modes.
_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    C("'", Id.Left_SingleQuote),
    C('$"', Id.Left_DollarDoubleQuote),
    C("$'", Id.Left_DollarSingleQuote),
]

_LEFT_PROCSUB = [
    C('<(', Id.Left_ProcSubIn),
    C('>(', Id.Left_ProcSubOut),
]

# The regexes below are in Python syntax, but are translated to re2c syntax
# by frontend/lexer_gen.py.
#
# http://re2c.org/manual/syntax/syntax.html
# https://docs.python.org/2/library/re.html
#
# We use a limited set of constructs:
# - + and * for repetition
# - Character classes [] with simple ranges and negation
# - Escapes like \n \0

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment.  Does not match the
# newline itself.  We want to switch modes and possibly process Op_Newline
# for here docs, etc.
LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]

# A whitelist for efficiency.  The shell language says that "anything else"
# is a literal character.  In other words, a single $ \ or ! is a literal,
# not a syntax error.  It's defined negatively, but let's define positive
# runs here.
# TODO: Add + here because it's never special?  It's different for YSH
# though.

# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_/.\-]+'
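# For example (illustrative; assuming the input is UTF-8 encoded bytes, which
# is what the generated matcher sees): both bytes of 'é' (0xc3 0xa9) fall in
# the \x80-\xff range, so 'fooé' lexes as ONE Lit_Chars token instead of
# being split.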

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    _TILDE_LIKE,
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    C(';;', Id.Op_DSemi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]

# In ShCommand and DBracket states.
_EXTGLOB_BEGIN = [
    C(',(', Id.ExtGlob_Comma),  # YSH synonym for @(...)
    C('@(', Id.ExtGlob_At),
    C('*(', Id.ExtGlob_Star),
    C('+(', Id.ExtGlob_Plus),
    C('?(', Id.ExtGlob_QMark),
    C('!(', Id.ExtGlob_Bang),
]

_KEYWORDS = [
    # NOTE: { is matched elsewhere
    C('[[', Id.KW_DLeftBracket),
    C('!', Id.KW_Bang),
    C('for', Id.KW_For),
    C('while', Id.KW_While),
    C('until', Id.KW_Until),
    C('do', Id.KW_Do),
    C('done', Id.KW_Done),
    C('in', Id.KW_In),
    C('case', Id.KW_Case),
    C('esac', Id.KW_Esac),
    C('if', Id.KW_If),
    C('fi', Id.KW_Fi),
    C('then', Id.KW_Then),
    C('else', Id.KW_Else),
    C('elif', Id.KW_Elif),
    C('function', Id.KW_Function),
    C('time', Id.KW_Time),

    # YSH integration
    C('const', Id.KW_Const),
    C('var', Id.KW_Var),
    C('setvar', Id.KW_SetVar),
    C('setref', Id.KW_SetRef),
    C('setglobal', Id.KW_SetGlobal),
    C('proc', Id.KW_Proc),

    # Tea-only

    # TODO: parse_tea should enable these so we can have 'setvar x = func'
    C('func', Id.KW_Func),
    C('data', Id.KW_Data),
    C('enum', Id.KW_Enum),
    C('class', Id.KW_Class),

    # 'import' is a Python-like import for tea.  Contrast with 'use lib
    # foo.oil', which is a builtin.
    C('import', Id.KW_Import),
    # and we also need export
]

# These are treated like builtins in bash, but keywords in OSH.  However, we
# maintain compatibility with bash for the 'type' builtin.
_CONTROL_FLOW = [
    C('break', Id.ControlFlow_Break),
    C('continue', Id.ControlFlow_Continue),
    C('return', Id.ControlFlow_Return),
    C('exit', Id.ControlFlow_Exit),
]

# Used by ysh/grammar_gen.py too
EXPR_WORDS = [
    C('null', Id.Expr_Null),
    C('true', Id.Expr_True),
    C('false', Id.Expr_False),
    C('and', Id.Expr_And),
    C('or', Id.Expr_Or),
    C('not', Id.Expr_Not),
    C('for', Id.Expr_For),
    C('while', Id.Expr_While),
    C('is', Id.Expr_Is),
    C('in', Id.Expr_In),
    C('if', Id.Expr_If),
    C('else', Id.Expr_Else),

    # for function literals
    C('func', Id.Expr_Func),

    # Note: can 'virtual' just be 'override'?  What do other languages do?
    C('virtual', Id.Expr_Virtual),
    C('override', Id.Expr_Override),
    C('abstract', Id.Expr_Abstract),
    C('as', Id.Expr_As),  # use 'foo.sh' as bar

    # Tea Control Flow Operators
    C('break', Id.Expr_Break),
    C('continue', Id.Expr_Continue),
    C('return', Id.Expr_Return),
]

CONTROL_FLOW_NAMES = [name for _, name, _ in _CONTROL_FLOW]

FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'

# File descriptors can have at most two digits, like mksh; dash/zsh/etc.
# allow only one.
FD_NUM = r'[0-9]?[0-9]?'

# These must be recognized in the ShCommand state, but can't be nested
# within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('_', Id.Lit_Underscore),  # for _ f(x)
    C('@', Id.Lit_At),  # for detecting @[, @' etc.; shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    R(FD_NUM + r'>\|', Id.Redir_Clobber),
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + _KEYWORDS + _CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN

# Preprocessing before ShCommand
LEXER_DEF[lex_mode_e.Backtick] = [
    C(r'`', Id.Backtick_Right),
    # A backslash, and then $ or ` or \
    R(r'\\[$`\\]', Id.Backtick_Quoted),
    # \" treated specially, depending on whether backticks are double-quoted!
    R(r'\\"', Id.Backtick_DoubleQuote),
    R(r'[^`\\\0]+', Id.Backtick_Other),  # contiguous run of literals
    R(r'[^\0]', Id.Backtick_Other),  # anything else
]

# DBRACKET: can be like ShCommand, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBracket] = [
    C(']]', Id.Lit_DRightBracket),
    # Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
    # in addition to [[ ! a && b ]]
    C('!', Id.KW_Bang),
    C('<', Id.Op_Less),
    C('>', Id.Op_Great),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces
# and punctuation.  We also accept \, $var, ${var}, "", etc.  They can also
# be nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
    R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
    C('|', Id.Op_Pipe),
    C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
    R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]

# Notes on BASH_REGEX states
#
# From the bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully,
#   since normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function?  It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and
# compile them with regcomp.  I've only seen constant regexes.
#
# From the code: ( | ) are treated special.

LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
    # NOTE: bash accounts for spaces and non-word punctuation like ; inside
    # () and [].  We will avoid that and ask the user to extract a variable?
    R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars),  # not including period
    _TILDE_LIKE,  # bash weirdness: RHS of [[ x =~ ~ ]] is expanded
    _SIGNIFICANT_SPACE,

    # Normally, \x evaluates to x.  But quoted regex metacharacters like \*
    # should evaluate to \*.  Compare with ( | ).
    R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),

    # NOTE: ( | and ) aren't operators!
    R(r'[^\0]', Id.Lit_Other),  # Everything else is a literal
] + _BACKSLASH  # These have to come after RegexMeta

LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
    C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
    R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
    # NOTE: When parsing a here doc line, this token doesn't end it.
    C('"', Id.Right_DoubleQuote),
]

_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
    _BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + \
    _LEFT_PROCSUB + _VARS + _EXTGLOB_BEGIN + [
    _TILDE_LIKE,
    # - doesn't match < and > so it doesn't eat <()
    # - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone
    #   is not enough
    R(r'[^$`/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
    _DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
    C(r'\}', Id.Lit_EscapedChar),  # For "${var-\}}"

    R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most

    # Weird wart: even in double quoted state, double quotes are allowed
    C('"', Id.Left_DoubleQuote),

    # Another weird wart of bash/mksh: $'' is recognized but NOT ''!
    C("$'", Id.Left_DollarSingleQuote),
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ_Raw] = [
    R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
    C("'", Id.Right_SingleQuote),
]

# EXPR_CHARS is mainly used in regex literals, e.g. [a-z \t \n].
#
# In YSH expressions, Chars are code point integers, so \u{1234} is the same
# as 0x1234.  And \0 is 0x0.

# In Python:
#     chr(0x00012345) == u'\U00012345'
#
# In YSH:
#     0x00012345 == \u{12345}
#     chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'

# We choose to match QSN (Rust) rather than Python or bash.
# Technically it could be \u123456, because we're not embedded in a string,
# but it's better to be consistent.

_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)

_X_CHAR = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex)

# Stricter QSN
_X_CHAR_2 = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex)

EXPR_CHARS = [
    # This is like Rust.  We don't have the legacy C escapes like \b.

    # NOTE: \' and \" are more readable versions of '"' and "'" in regexes
    R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
    R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex),

    # Because 'a' is a string, we use the syntax #'a' for char literals.
    # We explicitly leave out #''' because it's confusing.
    # TODO: extend this to a valid utf-8 code point (rune), rather than a
    # single byte.
    R(r"#'[^'\0]'", Id.Char_Pound),
    _U_BRACED_CHAR,
]

# Shared between echo -e and $''.
_C_STRING_COMMON = [
    # \x6 is valid in bash
    _X_CHAR,
    R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4),
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),

    R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

    # Backslash that ends a line.  Note '.' doesn't match a newline
    # character.
    C('\\\n', Id.Char_Literals),

    # e.g. \A is not an escape, and \x doesn't match a hex escape.  We allow
    # it, but a lint tool could warn about it.
    C('\\', Id.Unknown_Backslash),

    # could be at the end of the line
    #R('\\[uU]', Id.Unknown_BackslashU),
]

# Used by ECHO_LEXER in core/builtin.py.
ECHO_E_DEF = _C_STRING_COMMON + [
    # Note: tokens above \0377 can either be truncated or be flagged as a
    # syntax error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
    C(r'\c', Id.Char_Stop),

    # e.g. 'foo', anything that's not a backslash escape
    R(r'[^\\\0]+', Id.Char_Literals),
]

OCTAL3_RE = r'\\[0-7]{1,3}'

# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-Prompt.html
PS1_DEF = [
    R(OCTAL3_RE, Id.PS_Octal3),
    R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
    # \D{%H:%M} strftime format
    R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
    C(r'\[', Id.PS_LBrace),  # non-printing
    C(r'\]', Id.PS_RBrace),
    R(r'[^\\\0]+', Id.PS_Literals),
    # e.g. \x is not a valid escape.
    C('\\', Id.PS_BadBackslash),
]

# NOTE: Id.Ignored_LineCont is also not supported here, even though the
# whole point of this mode is that it supports other backslash escapes like
# \n!  A backslash-newline just becomes a regular backslash.
LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
    # Silly difference!  In echo -e, the syntax is \0377, but here it's
    # $'\377', with no leading 0.
    R(OCTAL3_RE, Id.Char_Octal3),

    # ' and " are escaped in $'' mode, but not echo -e.
    C(r"\'", Id.Char_OneChar),
    C(r'\"', Id.Char_OneChar),

    # e.g. 'foo', anything that's not a backslash escape or '
    R(r"[^\\'\0]+", Id.Char_Literals),
    C("'", Id.Right_SingleQuote),

    # Backslash that ends the file!  Caught by re2c exhaustiveness check.
    # Parser will assert; should give a better syntax error.
    C('\\\0', Id.Unknown_Tok),
]

# Should match the pure Python decoder in data_lang/qsn.py
LEXER_DEF[lex_mode_e.QSN] = [
    R(r'''\\[nrt0'"\\]''', Id.Char_OneChar),
    _X_CHAR_2,  # \xff
    _U_BRACED_CHAR,  # \u{3bc}

    # Like SQ_C, but literal newlines and tabs are illegal.
    R(r"[^\\'\0\t\n]+", Id.Char_Literals),
    C("'", Id.Right_SingleQuote),
    R(r'[^\0]', Id.Unknown_Tok),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
    R(OCTAL3_RE, Id.Char_Octal3),
    R(r"[^%\\\0]+", Id.Char_Literals),
    C('%%', Id.Format_EscapedPercent),
    C('%', Id.Format_Percent),
]

# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support %d, %s, and %q.  The other types are parsed so we can
    # display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]

LEXER_DEF[lex_mode_e.VSub_1] = [
    R(VAR_NAME_RE, Id.VSub_Name),
    # ${11} is valid, compared to $11 which is $1 and then literal 1.
    R(r'[0-9]+', Id.VSub_Number),
    C('!', Id.VSub_Bang),
    C('@', Id.VSub_At),
    C('#', Id.VSub_Pound),
    C('$', Id.VSub_Dollar),
    C('*', Id.VSub_Star),
    C('-', Id.VSub_Hyphen),
    C('?', Id.VSub_QMark),
    C('.', Id.VSub_Dot),  # ${.myproc builtin sub}
    C('}', Id.Right_DollarBrace),
    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VSub_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp0) + \
    ID_SPEC.LexerPairs(Kind.VOpOil) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + \
    ID_SPEC.LexerPairs(Kind.VOp3) + [
    C('}', Id.Right_DollarBrace),

    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

_EXPR_ARITH_SHARED = [
    C('\\\n', Id.Ignored_LineCont),
    R(r'[^\0]', Id.Unknown_Tok),  # any char.  This should be a syntax error.
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.Arith] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
    # Arithmetic expressions can cross newlines.
    R(r'[ \t\r\n]+', Id.Ignored_Space),

    # Examples of arith constants:
    #     64#azAZ
    #     0xabc 0xABC
    #     0123
    # A separate digits token makes this easier to parse STATICALLY.  But
    # this doesn't help with DYNAMIC parsing.
    R(VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
    R(r'[0-9]+', Id.Lit_Digits),
    C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
    C('#', Id.Lit_Pound),  # for 64#a

    # TODO: 64#@ interferes with VS_AT.  Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED

# A lexer for the parser that converts globs to extended regexes.  Since
# we're only parsing character classes ([^[:space:][:alpha:]]) as opaque
# blobs, we don't need lexer modes here.
GLOB_DEF = [
    # These could be operators in the glob, or just literals in a char
    # class, e.g. touch '?'; echo [?].
    C('*', Id.Glob_Star),
    C('?', Id.Glob_QMark),

    # For negation.  Treated as operators inside [], but literals outside.
    C('!', Id.Glob_Bang),
    C('^', Id.Glob_Caret),

    # Character classes.
    C('[', Id.Glob_LBracket),
    C(']', Id.Glob_RBracket),

    # There is no whitelist of characters; backslashes are unconditionally
    # removed.  With libc.fnmatch(), the pattern r'\f' matches 'f' but not
    # '\\f'.  See libc_test.py.
    R(r'\\[^\0]', Id.Glob_EscapedChar),
    C('\\', Id.Glob_BadBackslash),  # Trailing single backslash

    # For efficiency, combine other characters into a single token, e.g.
    # 'py' in '*.py' or 'alpha' in '[[:alpha:]]'.
    R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals),  # no regex escaping
    R(r'[^\0]', Id.Glob_OtherLiteral),  # anything else -- examine the char
]
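# For example (illustrative, assuming the longest-match semantics described
# near the top of this file), the glob '*.[ch]' tokenizes as:
#
#     Glob_Star '*'   Glob_OtherLiteral '.'   Glob_LBracket '['
#     Glob_CleanLiterals 'ch'   Glob_RBracket ']'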

# History expansion.  We're doing this as "pre-lexing" since that's what
# bash and zsh seem to do.  Example:
#
#     $ foo=x
#     $ echo $
#     $ !!foo  # expands to echo $foo and prints x
#
# We can also reuse this in the RootCompleter to expand history
# interactively.
#
# bash note: handled in lib/readline/histexpand.c.  Quite messy and handles
# quotes AGAIN.
#
# Note: \! gets expanded to literal \! for the real lexer, but no history
# expansion occurs.

HISTORY_DEF = [
    # Common operators.
    R(r'![!*^$]', Id.History_Op),

    # By command number.
    R(r'!-?[0-9]+', Id.History_Num),

    # Search by prefix, or by substring with the optional '?'.
    # NOTE: there are no numbers allowed here!  Bash doesn't seem to support
    # it.  No hyphen since it conflicts with $-1 too.
    #
    # The required trailing whitespace avoids a conflict with [!charclass]
    # and ${!indirect}.  This is a simpler hack than the one bash has.  See
    # frontend/lex_test.py.
    R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),

    # Comment is until end of line.
    R(r"#[^\0]*", Id.History_Other),

    # Single quoted, e.g. 'a' or $'\n'.  Terminated by another single quote
    # or end of string.
    R(r"'[^'\0]*'?", Id.History_Other),

    # Runs of chars that are definitely not special.
    R(r"[^!\\'#\0]+", Id.History_Other),

    # Escaped characters.  \! disables history.
    R(r'\\[^\0]', Id.History_Other),
    # Other single chars, like a trailing \ or !
    R(r'[^\0]', Id.History_Other),
]

BRACE_RANGE_DEF = [
    R(r'-?[0-9]+', Id.Range_Int),
    R(r'[a-zA-Z]', Id.Range_Char),  # just a single character
    R(r'\.\.', Id.Range_Dots),
    R(r'[^\0]', Id.Range_Other),  # invalid
]
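# For example (illustrative), the inside of {1..5} tokenizes as
# Range_Int '1', Range_Dots '..', Range_Int '5'; the inside of {a..z} is
# Range_Char 'a', Range_Dots '..', Range_Char 'z'.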

#
# YSH lexing
#

# Valid in lex_mode_e.{Expr,DQ}
# Used by ysh/grammar_gen.py
YSH_LEFT_SUBS = [
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$[', Id.Left_DollarBracket),  # TODO: Implement $[x]
]

# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py
YSH_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),

    C('"""', Id.Left_TDoubleQuote),
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("$'''", Id.Left_DollarTSingleQuote),

    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals, unimplemented
    C('^{', Id.Left_CaretBrace),  # Unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #     name:Str  age:Int
    #     'andy c'  10
    # }
    # Significant newlines.  No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]

# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # NOTE: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]

# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    R(r'[ \t\r]+', Id.Ignored_Space),
]

# TODO: unify this with LEXER_REFINEMENTS
_SIMPLE_FLOAT_RE = r'[0-9]+(\.[0-9]*)?([eE][+\-]?[0-9]+)?'

_WHITESPACE = r'[ \t\r\n]*'  # not including legacy \f \v

# Used for YSH comparison operators > >= < <=
# An optional - is allowed.
LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _SIMPLE_FLOAT_RE + _WHITESPACE

# Ditto, used for comparison operators

# Python allows 0 to be written 00 or 0_0_0, which is weird.  But let's be
# consistent, and avoid '00' turning into a float!
_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE
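# An illustrative check ONLY, assuming Python's re module approximates the
# generated matcher closely enough for these two patterns:
#
#     import re
#     re.match(LOOKS_LIKE_INTEGER + '$', ' -42 ')  # matches
#     re.match(LOOKS_LIKE_FLOAT + '$', '1.5e3')    # matches
#     re.match(LOOKS_LIKE_INTEGER + '$', '4 2')    # None -- two digit runs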

# Python 3 float literals:
#
#     digitpart     ::=  digit (["_"] digit)*
#     fraction      ::=  "." digitpart
#     exponent      ::=  ("e" | "E") ["+" | "-"] digitpart
#     pointfloat    ::=  [digitpart] fraction | digitpart "."
#     exponentfloat ::=  (digitpart | pointfloat) exponent
#     floatnumber   ::=  pointfloat | exponentfloat
#
# This is the same as far as I can tell?

# This is a hand-written re2c rule to "refine" the _SIMPLE_FLOAT_RE token to
# include underscores: 1_000.234_567

LEXER_REFINEMENTS = {
    (lex_mode_e.Expr, Id.Expr_Float): """
digit = [0-9]
digitpart = digit ("_"? digit)*
fraction = "." digitpart
exponent = ("e" | "E") ("+" | "-")? digitpart
float = digitpart fraction? exponent? | fraction exponent?
"""
}
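# For reference, a Python-regex equivalent of the refined rule might look
# like this (an illustrative assumption; the build consumes the re2c rule
# above, not this):
#
#     _DIGITPART = r'[0-9](_?[0-9])*'
#     _FRACTION = r'(\.' + _DIGITPART + r')'
#     _EXPONENT = r'([eE][+\-]?' + _DIGITPART + r')'
#     # float = digitpart fraction? exponent? | fraction exponent?
#     _REFINED_FLOAT_RE = ('%s%s?%s?|%s%s?' %
#         (_DIGITPART, _FRACTION, _EXPONENT, _FRACTION, _EXPONENT))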

# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using
# LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [
    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    #     integer      ::=  decinteger | bininteger | octinteger | hexinteger
    #     decinteger   ::=  nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    #     bininteger   ::=  "0" ("b" | "B") (["_"] bindigit)+
    #     octinteger   ::=  "0" ("o" | "O") (["_"] octdigit)+
    #     hexinteger   ::=  "0" ("x" | "X") (["_"] hexdigit)+
    #     nonzerodigit ::=  "1"..."9"
    #     digit        ::=  "0"..."9"
    #     bindigit     ::=  "0" | "1"
    #     octdigit     ::=  "0"..."7"
    #     hexdigit     ::=  digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    # !!! This is REFINED by a hand-written re2c rule !!!
    # The dev build is slightly different than the production build.
    R(_SIMPLE_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that
    # they have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2]

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Note: Java has >>>, but not <<<

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    #     ++ --    -- needed for loops, awk?
    #     ! && ||  -- needed for find dialect
    #     = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    C('^=', Id.Arith_CaretEqual),  # xor, not exponentiation

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # attribute access (static or dynamic)
    C('::', Id.Expr_DColon),  # static namespace access
    # dynamic dict access: d->name->age instead of d['name']['age']
    C('->', Id.Expr_RArrow),
    # legacy regex end: /d+ $/ (better written as /d+ >/)
    C('$', Id.Expr_Dollar),

    # Reserved for now.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffice.
    C('<-', Id.Expr_Reserved),
    # for df => filter(age > 10), and match (x) { 1 => "one" }
    # note: other languages use |>; R/dplyr uses %>%
    C('=>', Id.Expr_RDArrow),

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED

LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok),
]