OILS / frontend / lexer_def.py

1"""
2lexer_def.py -- A lexer for both OSH and YSH.
3
4It consists of a series of lexer modes, each with a regex -> Id mapping.
5
6After changing this file, run:
7
8 build/dev.sh all
9
10or at least:
11
12 build/dev.sh fastlex
13
14Input Handling
15--------------
16
17Every line is NUL terminated:
18
19 'one\n\0' 'last line\0'
20
21which means that no regexes below should match \0. The core/lexer_gen.py code
22generator adds and extra rule for \0.
23
24For example, use [^'\0]+ instead of [^']+ .
25
26If this rule isn't followed, we would read uninitialized memory past the
27sentinel. Python's regex engine knows where the end of the input string is, so
28it doesn't require need a sentinel like \0.
29"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
from _devbuild.gen.types_asdl import lex_mode_e

from frontend import id_kind_def

from typing import Tuple

# Initialize spec that the lexer depends on.
ID_SPEC = id_kind_def.IdSpec({}, {})

id_kind_def.AddKinds(ID_SPEC)
id_kind_def.AddBoolKinds(ID_SPEC)  # must come second
id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})


def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star)"""
    return (False, pat, tok_type)


def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a regex string, e.g. R(r'\$[0-9]', VSub_Number)"""
    return (True, pat, tok_type)

# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
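
# For example, SHOULD_HIJACK_RE matches '#!/bin/sh\n' and
# '#!/usr/bin/env bash\n' ('sh' followed by whitespace can appear anywhere),
# but not '#!/usr/bin/python\n'.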

_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)

# Tilde expansion chars are Lit_Chars, but WITHOUT the /.  The NEXT token (if
# any) after this TildeLike token should start with a /.
#
# It would have been REALLY NICE to add an optional /? at the end of THIS
# token, but we can't do that because of ${x//~/replace}.  The third / is not
# part of the tilde sub!!!
_TILDE_LIKE = R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike)

_BACKSLASH = [
    # To be conservative, we could deny a set of chars similar to
    # _LITERAL_WHITELIST_REGEX, rather than allowing all the operator
    # characters like \( and \;.
    #
    # strict_backslash makes this stricter.
    R(r'\\[^\n\0]', Id.Lit_EscapedChar),
    C('\\\n', Id.Ignored_LineCont),
]

# Only 4 characters are backslash escaped inside "".
# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
_DQ_BACKSLASH = [
    R(r'\\[$`"\\]', Id.Lit_EscapedChar),
    C('\\', Id.Lit_BadBackslash),  # syntax error in YSH, but NOT in OSH
]

VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# All Kind.VSub
_VARS = [
    # Unbraced variables
    R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
    R(r'\$[0-9]', Id.VSub_Number),
    C(r'$!', Id.VSub_Bang),
    C(r'$@', Id.VSub_At),
    C(r'$#', Id.VSub_Pound),
    C(r'$$', Id.VSub_Dollar),
    C(r'$*', Id.VSub_Star),
    C(r'$-', Id.VSub_Hyphen),
    C(r'$?', Id.VSub_QMark),
]
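
# For example, in ShCommand mode, '$PATH' lexes as VSub_DollarName, '$1' as
# VSub_Number, and '$?' as VSub_QMark.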

# Kind.Left that are valid in double-quoted modes.

_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    C("'", Id.Left_SingleQuote),
    C('$"', Id.Left_DollarDoubleQuote),
    C("$'", Id.Left_DollarSingleQuote),
]

_LEFT_PROCSUB = [
    C('<(', Id.Left_ProcSubIn),
    C('>(', Id.Left_ProcSubOut),
]

# The regexes below are in Python syntax, but are translated to re2c syntax by
# frontend/lexer_gen.py.
#
# http://re2c.org/manual/syntax/syntax.html
# https://docs.python.org/2/library/re.html
#
# We use a limited set of constructs:
# - + and * for repetition
# - Character classes [] with simple ranges and negation
# - Escapes like \n \0

LEXER_DEF = {}  # TODO: Should be a list so we can enforce order.

# Anything until the end of the line is a comment.  Does not match the newline
# itself.  We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]

# A whitelist for efficiency.  The shell language says that "anything else" is
# a literal character.  In other words, a single $ \ or ! is a literal, not a
# syntax error.  It's defined negatively, but let's define positive runs here.
# TODO: Add + here because it's never special?  It's different for YSH though.

# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_/.\-]+'

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    _TILDE_LIKE,
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # for comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    C(';;', Id.Op_DSemi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
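
# For example, with these rules the line 'echo hi && ls\n' lexes as
# Lit_Chars('echo'), WS_Space, Lit_Chars('hi'), WS_Space, Op_DAmp, WS_Space,
# Lit_Chars('ls'), Op_Newline.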

# In ShCommand and DBracket states.
_EXTGLOB_BEGIN = [
    C(',(', Id.ExtGlob_Comma),  # YSH synonym for @(...)
    C('@(', Id.ExtGlob_At),
    C('*(', Id.ExtGlob_Star),
    C('+(', Id.ExtGlob_Plus),
    C('?(', Id.ExtGlob_QMark),
    C('!(', Id.ExtGlob_Bang),
]

_KEYWORDS = [
    # NOTE: { is matched elsewhere
    C('[[', Id.KW_DLeftBracket),
    C('!', Id.KW_Bang),
    C('for', Id.KW_For),
    C('while', Id.KW_While),
    C('until', Id.KW_Until),
    C('do', Id.KW_Do),
    C('done', Id.KW_Done),
    C('in', Id.KW_In),
    C('case', Id.KW_Case),
    C('esac', Id.KW_Esac),
    C('if', Id.KW_If),
    C('fi', Id.KW_Fi),
    C('then', Id.KW_Then),
    C('else', Id.KW_Else),
    C('elif', Id.KW_Elif),
    C('function', Id.KW_Function),
    C('time', Id.KW_Time),

    # YSH
    C('const', Id.KW_Const),  # maybe remove this
    C('var', Id.KW_Var),
    C('setvar', Id.KW_SetVar),
    C('setglobal', Id.KW_SetGlobal),
    C('call', Id.KW_Call),
    C('proc', Id.KW_Proc),
    C('func', Id.KW_Func),

    # for future use
    C('class', Id.KW_Class),
    C('data', Id.KW_Data),
    C('enum', Id.KW_Enum),
]

# These are treated like builtins in bash, but keywords in OSH.  However, we
# maintain compatibility with bash for the 'type' builtin.
_CONTROL_FLOW = [
    C('break', Id.ControlFlow_Break),
    C('continue', Id.ControlFlow_Continue),
    C('return', Id.ControlFlow_Return),
    C('exit', Id.ControlFlow_Exit),
]

# Used by ysh/grammar_gen.py too
EXPR_WORDS = [
    C('null', Id.Expr_Null),
    C('true', Id.Expr_True),
    C('false', Id.Expr_False),
    C('and', Id.Expr_And),
    C('or', Id.Expr_Or),
    C('not', Id.Expr_Not),
    C('for', Id.Expr_For),
    C('while', Id.Expr_While),
    C('is', Id.Expr_Is),
    C('in', Id.Expr_In),
    C('if', Id.Expr_If),
    C('else', Id.Expr_Else),

    # for function literals
    C('func', Id.Expr_Func),

    # / <capture d+/
    C('capture', Id.Expr_Capture),
    # / <capture d+ as date> /
    C('as', Id.Expr_As),

    # Tea control flow operators
    C('break', Id.Expr_Break),
    C('continue', Id.Expr_Continue),
    C('return', Id.Expr_Return),
]

CONTROL_FLOW_NAMES = [pat for _, pat, _ in _CONTROL_FLOW]

FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'

# File descriptors can have at most two digits, like mksh; dash/zsh/etc. allow
# only one.
FD_NUM = r'[0-9]?[0-9]?'
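
# For example, '2>' lexes as Redir_Great with descriptor 2, '{fd}>' as
# Redir_Great with a named descriptor, and a bare '>' as Redir_Great with no
# descriptor (1 is implied).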

# These must be recognized in the ShCommand state, but can't be nested within
# [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc., shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    R(FD_NUM + r'>\|', Id.Redir_Clobber),
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + _KEYWORDS + _CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN

# Preprocessing before ShCommand
LEXER_DEF[lex_mode_e.Backtick] = [
    C(r'`', Id.Backtick_Right),
    # A backslash, and then $ or ` or \
    R(r'\\[$`\\]', Id.Backtick_Quoted),
    # \" is treated specially, depending on whether backticks are
    # double-quoted!
    R(r'\\"', Id.Backtick_DoubleQuote),
    R(r'[^`\\\0]+', Id.Backtick_Other),  # contiguous run of literals
    R(r'[^\0]', Id.Backtick_Other),  # anything else
]

# DBRACKET: can be like ShCommand, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBracket] = [
    C(']]', Id.Lit_DRightBracket),
    # Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
    # in addition to [[ ! a && b ]]
    C('!', Id.KW_Bang),
    C('<', Id.Op_Less),
    C('>', Id.Op_Great),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  Extended globs can
# be nested, so _EXTGLOB_BEGIN appears here too.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
    R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
    C('|', Id.Op_Pipe),
    C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
    R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]

# Notes on BASH_REGEX states
#
# From the bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully,
#   since normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function?  It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp.  I've only seen constant regexes.
#
# From the code: ( | ) are treated specially.

LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [

    # NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
    # and [].  We will avoid that and ask the user to extract a variable?
    R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars),  # not including period
    _TILDE_LIKE,  # bash weirdness: RHS of [[ x =~ ~ ]] is expanded
    _SIGNIFICANT_SPACE,

    # Normally, \x evaluates to x.  But quoted regex metacharacters like \*
    # should evaluate to \*.  Compare with ( | ).
    R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),

    # NOTE: ( | and ) aren't operators!
    R(r'[^\0]', Id.Lit_Other),  # everything else is a literal
] + _BACKSLASH  # These have to come after Lit_RegexMeta

LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
    C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
    R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
    # NOTE: When parsing a here doc line, this token doesn't end it.
    C('"', Id.Right_DoubleQuote),
]

_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # for var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
    _BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
    _VARS + _EXTGLOB_BEGIN + [

    _TILDE_LIKE,
    # - doesn't match < and > so it doesn't eat <()
    # - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
    #   not enough
    R(r'[^$`/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
    _DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [

    C(r'\}', Id.Lit_EscapedChar),  # for "${var-\}}"

    R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most

    # Weird wart: even in double-quoted state, double quotes are allowed
    C('"', Id.Left_DoubleQuote),

    # Another weird wart of bash/mksh: $'' is recognized but NOT ''!
    C("$'", Id.Left_DollarSingleQuote),
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ_Raw] = [
    R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
    C("'", Id.Right_SingleQuote),
]

# The main purpose of EXPR_CHARS is regex literals, e.g. [a-z \t \n].
#
# In YSH expressions, chars are code point integers, so \u{1234} is the same
# as 0x1234.  And \0 is 0x0.

# In Python:
#   chr(0x00012345) == u'\U00012345'
#
# In YSH:
#   0x00012345 == \u{12345}
#   chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'

# We choose to match QSN (Rust) rather than Python or bash.
# Technically it could be \u123456, because we're not embedded in a string,
# but it's better to be consistent.

_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)

_X_CHAR = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex)

# Stricter QSN
_X_CHAR_2 = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex)

EXPR_CHARS = [
    # This is like Rust.  We don't have the legacy C escapes like \b.

    # NOTE: \' and \" are more readable versions of '"' and "'" in regexes
    R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
    R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex),

    # Because 'a' is a string, we use the syntax #'a' for char literals.
    # We explicitly leave out #''' because it's confusing.
    # TODO: extend this to a valid utf-8 code point (rune), rather than a
    # single byte.
    R(r"#'[^'\0]'", Id.Char_Pound),
    _U_BRACED_CHAR,
]

# Shared between echo -e and $''.
_C_STRING_COMMON = [

    # \x6 is valid in bash
    _X_CHAR,
    R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4),
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
    R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

    # Backslash that ends a line.  Note '.' doesn't match a newline character.
    C('\\\n', Id.Char_Literals),

    # e.g. \A is not an escape, and \x doesn't match a hex escape.  We allow
    # it, but a lint tool could warn about it.
    C('\\', Id.Unknown_Backslash),

    # could be at the end of the line
    #R('\\[uU]', Id.Unknown_BackslashU),
]

# Used by ECHO_LEXER in core/builtin.py.
ECHO_E_DEF = _C_STRING_COMMON + [
    # Note: tokens above \0377 can either be truncated or flagged as a syntax
    # error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
    C(r'\c', Id.Char_Stop),

    # e.g. 'foo', anything that's not a backslash escape
    R(r'[^\\\0]+', Id.Char_Literals),
]

OCTAL3_RE = r'\\[0-7]{1,3}'

# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-Prompt.html
PS1_DEF = [
    R(OCTAL3_RE, Id.PS_Octal3),
    R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
    # \D{%H:%M} strftime format
    R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
    C(r'\[', Id.PS_LBrace),  # non-printing
    C(r'\]', Id.PS_RBrace),
    R(r'[^\\\0]+', Id.PS_Literals),
    # e.g. \x is not a valid escape.
    C('\\', Id.PS_BadBackslash),
]

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of $'' is that it supports other backslash escapes like \n!  A
# backslash-newline just becomes a regular backslash.
LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
    # Silly difference!  In echo -e, the syntax is \0377, but here it's
    # $'\377', with no leading 0.
    R(OCTAL3_RE, Id.Char_Octal3),

    # ' and " are escaped in $'' mode, but not echo -e.
    C(r"\'", Id.Char_OneChar),
    C(r'\"', Id.Char_OneChar),

    # e.g. 'foo', anything that's not a backslash escape or '
    R(r"[^\\'\0]+", Id.Char_Literals),
    C("'", Id.Right_SingleQuote),

    # Backslash that ends the file!  Caught by the re2c exhaustiveness check.
    # The parser will assert; it should give a better syntax error.
    C('\\\0', Id.Unknown_Tok),
]

# Should match the pure Python decoder in data_lang/qsn.py
LEXER_DEF[lex_mode_e.QSN] = [
    R(r'''\\[nrt0'"\\]''', Id.Char_OneChar),
    _X_CHAR_2,  # \xff
    _U_BRACED_CHAR,  # \u{3bc}

    # Like SQ_C, but literal newlines and tabs are illegal.
    R(r"[^\\'\0\t\n]+", Id.Char_Literals),
    C("'", Id.Right_SingleQuote),
    R(r'[^\0]', Id.Unknown_Tok),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
    R(OCTAL3_RE, Id.Char_Octal3),
    R(r"[^%\\\0]+", Id.Char_Literals),
    C('%%', Id.Format_EscapedPercent),
    C('%', Id.Format_Percent),
]

# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support %d, %s, and %q.  The other format types are parsed only so we
    # can display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]

LEXER_DEF[lex_mode_e.VSub_1] = [
    R(VAR_NAME_RE, Id.VSub_Name),
    # ${11} is valid, compared to $11, which is $1 and then a literal 1.
    R(r'[0-9]+', Id.VSub_Number),
    C('!', Id.VSub_Bang),
    C('@', Id.VSub_At),
    C('#', Id.VSub_Pound),
    C('$', Id.VSub_Dollar),
    C('*', Id.VSub_Star),
    C('-', Id.VSub_Hyphen),
    C('?', Id.VSub_QMark),
    C('.', Id.VSub_Dot),  # ${.myproc builtin sub}
    C('}', Id.Right_DollarBrace),
    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VSub_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp0) + \
    ID_SPEC.LexerPairs(Kind.VOpOil) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + \
    ID_SPEC.LexerPairs(Kind.VOp3) + [
    C('}', Id.Right_DollarBrace),

    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

_EXPR_ARITH_SHARED = [
    C('\\\n', Id.Ignored_LineCont),
    R(r'[^\0]', Id.Unknown_Tok),  # any char; this should be a syntax error
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.Arith] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [

    # Arithmetic expressions can cross newlines.
    R(r'[ \t\r\n]+', Id.Ignored_Space),

    # Examples of arith constants:
    #   64#azAZ
    #   0xabc 0xABC
    #   0123
    # A separate digits token makes this easier to parse STATICALLY.  But this
    # doesn't help with DYNAMIC parsing.
    R(VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
    R(r'[0-9]+', Id.Lit_Digits),
    C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
    C('#', Id.Lit_Pound),  # for 64#a

    # TODO: 64#@ interferes with VS_AT.  Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED

# A lexer for the parser that converts globs to extended regexes.  Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
GLOB_DEF = [
    # These could be operators in the glob, or just literals in a char class,
    # e.g. touch '?'; echo [?].
    C('*', Id.Glob_Star),
    C('?', Id.Glob_QMark),

    # For negation.  Treated as operators inside [], but literals outside.
    C('!', Id.Glob_Bang),
    C('^', Id.Glob_Caret),

    # Character classes.
    C('[', Id.Glob_LBracket),
    C(']', Id.Glob_RBracket),

    # There is no whitelist of characters; backslashes are unconditionally
    # removed.  With libc.fnmatch(), the pattern r'\f' matches 'f' but not
    # '\\f'.  See libc_test.py.
    R(r'\\[^\0]', Id.Glob_EscapedChar),
    C('\\', Id.Glob_BadBackslash),  # trailing single backslash

    # For efficiency, combine other characters into a single token, e.g. 'py'
    # in '*.py' or 'alpha' in '[[:alpha:]]'.
    R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals),  # no regex escaping
    R(r'[^\0]', Id.Glob_OtherLiteral),  # anything else -- examine the char
]
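
# For example, '*.[ch]' lexes as Glob_Star, Glob_OtherLiteral('.'),
# Glob_LBracket, Glob_CleanLiterals('ch'), Glob_RBracket.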

# History expansion.  We're doing this as "pre-lexing" since that's what bash
# and zsh seem to do.  Example:
#
#   $ foo=x
#   $ echo $
#   $ !!foo  # expands to 'echo $foo' and prints x
#
# We can also reuse this in the RootCompleter to expand history interactively.
#
# bash note: handled in lib/readline/histexpand.c.  Quite messy, and it
# handles quotes AGAIN.
#
# Note: \! gets expanded to literal \! for the real lexer, but no history
# expansion occurs.

HISTORY_DEF = [
    # Common operators.
    R(r'![!*^$]', Id.History_Op),

    # By command number.
    R(r'!-?[0-9]+', Id.History_Num),

    # Search by prefix or substring (optional '?').
    # NOTE: no numbers are allowed here!  Bash doesn't seem to support them.
    # No hyphen either, since it conflicts with $-1 too.
    #
    # The required trailing whitespace avoids a conflict with [!charclass]
    # and ${!indirect}.  This is a simpler hack than the one bash has.  See
    # frontend/lex_test.py.
    R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),

    # Comment is until end of line.
    R(r"#[^\0]*", Id.History_Other),

    # Single quoted, e.g. 'a' or $'\n'.  Terminated by another single quote
    # or end of string.
    R(r"'[^'\0]*'?", Id.History_Other),

    # Runs of chars that are definitely not special.
    R(r"[^!\\'#\0]+", Id.History_Other),

    # Escaped characters.  \! disables history.
    R(r'\\[^\0]', Id.History_Other),
    # Other single chars, like a trailing \ or !
    R(r'[^\0]', Id.History_Other),
]

BRACE_RANGE_DEF = [
    R(r'-?[0-9]+', Id.Range_Int),
    R(r'[a-zA-Z]', Id.Range_Char),  # just a single character
    R(r'\.\.', Id.Range_Dots),
    R(r'[^\0]', Id.Range_Other),  # invalid
]
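
# For example, the range inside '{1..5}' lexes as Range_Int('1'), Range_Dots,
# Range_Int('5'); '{a..z}' uses Range_Char instead.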

#
# YSH lexing
#

# Valid in lex_mode_e.{Expr,DQ}
# Used by ysh/grammar_gen.py
YSH_LEFT_SUBS = [
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$[', Id.Left_DollarBracket),  # TODO: Implement $[x]
]

# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py

YSH_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),

    # Ditto for triple-quoted strings.
    C('"""', Id.Left_TDoubleQuote),
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("$'''", Id.Left_DollarTSingleQuote),

    C('@(', Id.Left_AtParen),  # split command sub
    C('^(', Id.Left_CaretParen),  # block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # expr literals
    C('^{', Id.Left_CaretBrace),  # unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals, e.g.
    # t = %{
    #     name:Str  age:Int
    #     'andy c'  10
    # }
    # Significant newlines.  No unquoted [], {}.

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]

# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # NOTE: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]

# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    R(r'[ \t\r]+', Id.Ignored_Space),
]

_WHITESPACE = r'[ \t\r\n]*'  # not including legacy \f \v

# Python allows 0 to be written 00 or 0_0_0, which is weird.  But let's be
# consistent, and avoid '00' turning into a float!
_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Used for YSH comparison operators > >= < <=
LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE

_FLOAT_RE = (
    _DECIMAL_INT_RE +
    # Unlike Python, the exponent can't be like 42e5_000.  There's no use for
    # it, because 1e309 is already inf.  Let's keep our code simple.
    r'(\.' + _DECIMAL_INT_RE + r')?([eE][+\-]?[0-9]+)?')

# Ditto, used for comparison operators.
# Added an optional leading -?
# Example: -3_000_000.000_001e12
LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _FLOAT_RE + _WHITESPACE
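
# A sketch of how the LOOKS_LIKE_* patterns might be used -- a hypothetical
# helper, not the real comparison code.  Note that the patterns aren't
# anchored at the end, so the caller must anchor them, e.g. with \Z
# (re.fullmatch is Python 3 only).
def _SketchLooksLikeInteger(s):
    # type: (str) -> bool
    """True if s is whitespace, an optional -, and _-separated digits."""
    import re  # local import to keep the sketch self-contained

    return re.match(LOOKS_LIKE_INTEGER + r'\Z', s) is not None

# e.g. _SketchLooksLikeInteger(' -42 ') is True, but
# _SketchLooksLikeInteger('42x') is False.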

# Python 3 float literals:
#
#   digitpart     ::= digit (["_"] digit)*
#   fraction      ::= "." digitpart
#   exponent      ::= ("e" | "E") ["+" | "-"] digitpart
#   pointfloat    ::= [digitpart] fraction | digitpart "."
#   exponentfloat ::= (digitpart | pointfloat) exponent
#   floatnumber   ::= pointfloat | exponentfloat

# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using
# LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    #   integer      ::= decinteger | bininteger | octinteger | hexinteger
    #   decinteger   ::= nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    #   bininteger   ::= "0" ("b" | "B") (["_"] bindigit)+
    #   octinteger   ::= "0" ("o" | "O") (["_"] octdigit)+
    #   hexinteger   ::= "0" ("x" | "X") (["_"] hexdigit)+
    #   nonzerodigit ::= "1"..."9"
    #   digit        ::= "0"..."9"
    #   bindigit     ::= "0" | "1"
    #   octdigit     ::= "0"..."7"
    #   hexdigit     ::= digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately; should we enforce that
    # they have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Java also has >>>, the unsigned right shift

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    #   ++ --    -- increment/decrement, needed for loops, awk?
    #   ! && ||  -- needed for the find dialect

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    C('^=', Id.Arith_CaretEqual),  # xor, like ^ above

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # for eggex negation

    C('//', Id.Expr_DSlash),  # for YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Expr_DDot),  # range 1..5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written as
                             # /d+ >/)

    # Reserving this.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- a space suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
                               # and match (x) { 1 => "one" }
    # Note: other languages use |>, and R/dplyr uses %>%.

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED
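
# For example, in expression mode, '1_000 + 0xFF' lexes as
# Expr_DecInt('1_000'), Arith_Plus, Expr_HexInt('0xFF'), with Ignored_Space
# in between.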

LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok),
]