1"""
2lexer_def.py -- A lexer for both OSH and YSH.
3
4It consists of a series of lexer modes, each with a regex -> Id mapping.
5
6After changing this file, run:
7
8 build/dev.sh all
9
10or at least:
11
12 build/dev.sh fastlex
13
14Input Handling
15--------------
16
17Every line is NUL terminated:
18
19 'one\n\0' 'last line\0'
20
21which means that no regexes below should match \0. The core/lexer_gen.py code
22generator adds and extra rule for \0.
23
24For example, use [^'\0]+ instead of [^']+ .
25
26If this rule isn't followed, we would read uninitialized memory past the
27sentinel. Python's regex engine knows where the end of the input string is, so
28it doesn't require need a sentinel like \0.
29"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
from _devbuild.gen.types_asdl import lex_mode_e

from frontend import id_kind_def

from typing import Tuple

# Initialize spec that the lexer depends on.
ID_SPEC = id_kind_def.IdSpec({}, {})

id_kind_def.AddKinds(ID_SPEC)
id_kind_def.AddBoolKinds(ID_SPEC)  # must come second
id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})


def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star)"""
    return (False, pat, tok_type)


def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a regex string, e.g. R('\$[0-9]', VSub_Number)"""
    return (True, pat, tok_type)

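# A hedged illustration (not an original rule): both helpers produce the same
# (is_regex, pattern, id) tuple shape that every lexer table below is built
# from, e.g.
#
#   C('$*', Id.VSub_Star)          # -> (False, '$*', Id.VSub_Star)
#   R(r'\$[0-9]', Id.VSub_Number)  # -> (True, r'\$[0-9]', Id.VSub_Number)
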
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'

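# Hedged examples of lines SHOULD_HIJACK_RE is meant to accept (every line
# ends with \n before the \0 sentinel):
#
#   '#!/bin/sh\n'            # matches: 'sh' followed by whitespace
#   '#!/usr/bin/env bash\n'  # matches: the 'sh' in 'bash' counts
#   '#!/usr/bin/python\n'    # no match: no 'sh' followed by whitespace
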
_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)

# Tilde expansion chars are Lit_Chars, but WITHOUT the /. The NEXT token (if
# any) after this TildeLike token should start with a /.
#
# It would have been REALLY NICE to add an optional /? at the end of THIS
# token, but we can't do that because of ${x//~/replace}. The third / is not
# part of the tilde sub!!!
_TILDE_LIKE = R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike)

_BACKSLASH = [
    # To be conservative, we could deny a set of chars similar to
    # _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
    # like \( and \;.
    #
    # strict_backslash makes this stricter.
    R(r'\\[^\n\0]', Id.Lit_EscapedChar),
    C('\\\n', Id.Ignored_LineCont),
]

# Only 4 characters are backslash escaped inside "".
# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
_DQ_BACKSLASH = [
    R(r'\\[$`"\\]', Id.Lit_EscapedChar),
    C('\\', Id.Lit_BadBackslash),  # syntax error in YSH, but NOT in OSH
]

VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# All Kind.VSub
_VARS = [
    # Unbraced variables
    R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
    R(r'\$[0-9]', Id.VSub_Number),
    C(r'$!', Id.VSub_Bang),
    C(r'$@', Id.VSub_At),
    C(r'$#', Id.VSub_Pound),
    C(r'$$', Id.VSub_Dollar),
    C(r'$*', Id.VSub_Star),
    C(r'$-', Id.VSub_Hyphen),
    C(r'$?', Id.VSub_QMark),
]

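# Hedged examples of how the unbraced rules above carve up input (illustrative
# only):
#
#   '$HOME'  -> <VSub_DollarName "$HOME">
#   '$12'    -> <VSub_Number "$1">, then a literal '2'
#   '$?'     -> <VSub_QMark "$?">
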
# Kind.Left that are valid in double-quoted modes.

_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    C("'", Id.Left_SingleQuote),
    C('$"', Id.Left_DollarDoubleQuote),
    C("$'", Id.Left_DollarSingleQuote),
]

_LEFT_PROCSUB = [
    C('<(', Id.Left_ProcSubIn),
    C('>(', Id.Left_ProcSubOut),
]

# The regexes below are in Python syntax, but are translated to re2c syntax by
# frontend/lexer_gen.py.
#
# http://re2c.org/manual/syntax/syntax.html
# https://docs.python.org/2/library/re.html
#
# We use a limited set of constructs:
# - + and * for repetition
# - Character classes [] with simple ranges and negation
# - Escapes like \n \0

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment. Does not match the newline
# itself. We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]

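# A hedged sketch of how these tables are organized (not part of the original
# file): each LEXER_DEF entry maps a lex mode to an ordered list of
# (is_regex, pattern, id) rules, e.g.
#
#   LEXER_DEF[lex_mode_e.Comment]
#   # -> [(True, r'[^\n\0]*', Id.Ignored_Comment)]
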
# A whitelist for efficiency. The shell language says that "anything else" is
# a literal character. In other words, a single $ \ or ! is a literal, not a
# syntax error. It's defined negatively, but let's define positive runs here.
# TODO: Add + here because it's never special? It's different for YSH though.

# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_/.\-]+'

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class? So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    _TILDE_LIKE,
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    C(';;', Id.Op_DSemi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]

# In ShCommand and DBracket states.
_EXTGLOB_BEGIN = [
    C(',(', Id.ExtGlob_Comma),  # YSH synonym for @(...)
    C('@(', Id.ExtGlob_At),
    C('*(', Id.ExtGlob_Star),
    C('+(', Id.ExtGlob_Plus),
    C('?(', Id.ExtGlob_QMark),
    C('!(', Id.ExtGlob_Bang),
]

_KEYWORDS = [
    # NOTE: { is matched elsewhere
    C('[[', Id.KW_DLeftBracket),
    C('!', Id.KW_Bang),
    C('for', Id.KW_For),
    C('while', Id.KW_While),
    C('until', Id.KW_Until),
    C('do', Id.KW_Do),
    C('done', Id.KW_Done),
    C('in', Id.KW_In),
    C('case', Id.KW_Case),
    C('esac', Id.KW_Esac),
    C('if', Id.KW_If),
    C('fi', Id.KW_Fi),
    C('then', Id.KW_Then),
    C('else', Id.KW_Else),
    C('elif', Id.KW_Elif),
    C('function', Id.KW_Function),
    C('time', Id.KW_Time),

    # YSH
    C('const', Id.KW_Const),  # maybe remove this
    C('var', Id.KW_Var),
    C('setvar', Id.KW_SetVar),
    C('setglobal', Id.KW_SetGlobal),
    C('call', Id.KW_Call),
    C('proc', Id.KW_Proc),
    C('func', Id.KW_Func),

    # for future use
    C('class', Id.KW_Class),
    C('data', Id.KW_Data),
    C('enum', Id.KW_Enum),
]

# These are treated like builtins in bash, but keywords in OSH. However, we
# maintain compatibility with bash for the 'type' builtin.
_CONTROL_FLOW = [
    C('break', Id.ControlFlow_Break),
    C('continue', Id.ControlFlow_Continue),
    C('return', Id.ControlFlow_Return),
    C('exit', Id.ControlFlow_Exit),
]

# Used by ysh/grammar_gen.py too
EXPR_WORDS = [
    C('null', Id.Expr_Null),
    C('true', Id.Expr_True),
    C('false', Id.Expr_False),
    C('and', Id.Expr_And),
    C('or', Id.Expr_Or),
    C('not', Id.Expr_Not),
    C('for', Id.Expr_For),
    C('while', Id.Expr_While),
    C('is', Id.Expr_Is),
    C('in', Id.Expr_In),
    C('if', Id.Expr_If),
    C('else', Id.Expr_Else),

    # for function literals
    C('func', Id.Expr_Func),

    # / <capture d+/
    C('capture', Id.Expr_Capture),
    # / <capture d+ as date> /
    C('as', Id.Expr_As),

    # Tea Control Flow Operators
    C('break', Id.Expr_Break),
    C('continue', Id.Expr_Continue),
    C('return', Id.Expr_Return),
]

CONTROL_FLOW_NAMES = [name for _, name, _ in _CONTROL_FLOW]

FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'

# file descriptors can only have two digits, like mksh
# dash/zsh/etc. can have one
FD_NUM = r'[0-9]?[0-9]?'

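# Hedged examples of the redirect prefixes these two regexes allow (see the
# Redir_* rules below):
#
#   '2>err.txt'     # FD_NUM matches '2', then Redir_Great
#   '{fd}>out.txt'  # FD_VAR_NAME matches '{fd}', then Redir_Great
#   '>out.txt'      # FD_NUM also matches the empty string
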
# These must be recognized in the ShCommand state, but can't be nested within
# [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + '\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + '\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    R(FD_NUM + r'>\|', Id.Redir_Clobber),
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + _KEYWORDS + _CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN

# Preprocessing before ShCommand
LEXER_DEF[lex_mode_e.Backtick] = [
    C(r'`', Id.Backtick_Right),
    # A backslash, and then $ or ` or \
    R(r'\\[$`\\]', Id.Backtick_Quoted),
    # \" treated specially, depending on whether backticks are double-quoted!
    R(r'\\"', Id.Backtick_DoubleQuote),
    R(r'[^`\\\0]+', Id.Backtick_Other),  # contiguous run of literals
    R(r'[^\0]', Id.Backtick_Other),  # anything else
]

# DBRACKET: can be like ShCommand, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBracket] = [
    C(']]', Id.Lit_DRightBracket),
    # Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
    # in addition to [[ ! a && b ]]
    C('!', Id.KW_Bang),
    C('<', Id.Op_Less),
    C('>', Id.Op_Great),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation. We also accept \, $var, ${var}, "", etc. They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
        R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
        C('|', Id.Op_Pipe),
        C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
        R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
#   normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated specially.

LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [

    # NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
    # and []. We will avoid that and ask the user to extract a variable?
    R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars),  # not including period
    _TILDE_LIKE,  # bash weirdness: RHS of [[ x =~ ~ ]] is expanded
    _SIGNIFICANT_SPACE,

    # Normally, \x evaluates to x. But quoted regex metacharacters like \*
    # should evaluate to \*. Compare with ( | ).
    R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),

    # NOTE: ( | and ) aren't operators!
    R(r'[^\0]', Id.Lit_Other),  # Everything else is a literal
] + _BACKSLASH  # These have to come after RegexMeta

LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
    C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
    R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
    # NOTE: When parsing here doc line, this token doesn't end it.
    C('"', Id.Right_DoubleQuote),
]

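# A hedged example of DQ mode at work (illustrative only): the double-quoted
# body of  echo "hi $name!"  would lex roughly as
#
#   <Lit_Chars "hi "> <VSub_DollarName "$name"> <Lit_Chars "!">
#   <Right_DoubleQuote '"'>
#
# Only \$ \` \" \\ are escapes here (see _DQ_BACKSLASH above).
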
_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
    _BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
    _VARS + _EXTGLOB_BEGIN + [

    _TILDE_LIKE,
    # - doesn't match < and > so it doesn't eat <()
    # - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
    #   not enough
    R(r'[^$`/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
    _DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [

    C(r'\}', Id.Lit_EscapedChar),  # For "${var-\}}"

    R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most

    # Weird wart: even in double quoted state, double quotes are allowed
    C('"', Id.Left_DoubleQuote),

    # Another weird wart of bash/mksh: $'' is recognized but NOT ''!
    C("$'", Id.Left_DollarSingleQuote),
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ_Raw] = [
    R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
    C("'", Id.Right_SingleQuote),
]

# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
#
# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
# 0x1234. And \0 is 0x0.

# In Python:
#   chr(0x00012345) == u'\U00012345'
#
# In YSH:
#   0x00012345 == \u{12345}
#   chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'

_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)

_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex)  # bash
_X_CHAR_STRICT = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex)  # YSH

_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4)  # bash

_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4)  # JSON-only

EXPR_CHARS = [
    # This is like Rust. We don't have the legacy C escapes like \b.

    # NOTE: \' and \" are more readable versions of '"' and "'" in regexes
    R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
    _X_CHAR_STRICT,

    # Because 'a' is a string, we use the syntax #'a' for char literals.
    # We explicitly leave out #''' because it's confusing.
    # Note: we're not doing utf-8 validation here.
    R(r"#'[^'\0]'", Id.Char_Pound),
    _U_BRACED_CHAR,
]

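# Hedged examples of the char literal forms accepted above (illustrative
# only):
#
#   \n         -> Id.Char_OneChar
#   \x41       -> Id.Char_Hex      (strict: exactly 2 hex digits)
#   \u{1F642}  -> Id.Char_UBraced  (1 to 6 hex digits)
#   #'a'       -> Id.Char_Pound
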
# Shared between echo -e and $''.
_C_STRING_COMMON = [

    # \x6 is valid in bash
    _X_CHAR_LOOSE,
    _U4_CHAR_LOOSE,
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
    R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

    # Backslash that ends a line. Note '.' doesn't match a newline character.
    C('\\\n', Id.Char_Literals),

    # e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
    # but a lint tool could warn about it.
    C('\\', Id.Unknown_Backslash),

    # could be at the end of the line
    #R('\\[uU]', Id.Unknown_BackslashU),
]

ECHO_E_DEF = _C_STRING_COMMON + [
    # Note: tokens above \0377 can either be truncated or be flagged as a
    # syntax error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
    C(r'\c', Id.Char_Stop),

    # e.g. 'foo', anything that's not a backslash escape
    R(r'[^\\\0]+', Id.Char_Literals),
]

# https://json.org/

# Note that [0-9] has to come second, because Python chooses the first match.
_JSON_INT = r'([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'
_JSON_EXP = r'([eE][-+]?[0-9]+)?'

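# Hedged examples of what the pieces above match (illustrative only):
#
#   _JSON_INT:       '0', '42'       but only the leading '0' of '007'
#   _JSON_FRACTION:  '.14' or ''     (optional)
#   _JSON_EXP:       'e-9', 'E+10' or ''  (optional)
#
# so the combined J8_Float rule below accepts '3.14' and '1e-9'.
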
J8_DEF = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
    C('[', Id.J8_LBracket),
    C(']', Id.J8_RBracket),
    C('{', Id.J8_LBrace),
    C('}', Id.J8_RBrace),
    C('(', Id.J8_LParen),  # TYG8 only
    C(')', Id.J8_RParen),  # TYG8 only
    C(',', Id.J8_Comma),
    C(':', Id.J8_Colon),
    C('null', Id.J8_Null),
    C('true', Id.J8_Bool),
    C('false', Id.J8_Bool),
    R(_JSON_INT, Id.J8_Int),
    R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),

    # TODO: emit Id.Ignored_Newline to count lines for error messages?
    R(r'[ \r\n\t]+', Id.Ignored_Space),

    # This will reject ASCII control chars
    R(r'[^\0]', Id.Unknown_Tok),
]

# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
# But \n has to be allowed in multi-line strings
_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)

# https://json.org list of chars, plus '
_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)

# Union of escapes that "" u'' b'' accept. Validation is separate.
J8_STR_DEF = [
    C("'", Id.Right_SingleQuote),  # end for J8
    _JSON_ONE_CHAR,
    C("\\'", Id.Char_OneChar),
    R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex),  # \yff - J8 only
    _U_BRACED_CHAR,  # \u{123456} - J8 only
    _ASCII_CONTROL,

    # Note: This will match INVALID UTF-8. UTF-8 validation is another step.
    R(r'''[^\\'\0]+''', Id.Char_Literals),
    R(r'[^\0]', Id.Unknown_Tok),
]

# For "JSON strings \" \u1234"
JSON_STR_DEF = [
    C('"', Id.Right_DoubleQuote),  # end for JSON
    _JSON_ONE_CHAR,
    _U4_CHAR_STRICT,  # \u1234 - JSON only

    # High surrogate [\uD800, \uDC00)
    # Low surrogate [\uDC00, \uE000)
    # This pattern makes it easier to decode. Unpaired surrogates become
    # Id.Char_Unicode4.
    R(
        r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
        Id.Char_SurrogatePair),
    _ASCII_CONTROL,

    # Note: This will match INVALID UTF-8. UTF-8 validation is another step.
    R(r'[^\\"\0]+', Id.Char_Literals),
    R(r'[^\0]', Id.Unknown_Tok),
]

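# A hedged example of the surrogate pair rule above (illustrative only): the
# JSON escape for U+1F642 is the pair
#
#   '\ud83d\ude42'  -> one Id.Char_SurrogatePair token
#
# while a lone '\ud83d' falls through to Id.Char_Unicode4 via _U4_CHAR_STRICT.
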
LEXER_DEF[lex_mode_e.J8_Str] = J8_STR_DEF

OCTAL3_RE = r'\\[0-7]{1,3}'

# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-Prompt.html#Controlling-the-Prompt
PS1_DEF = [
    R(OCTAL3_RE, Id.PS_Octal3),
    R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
    # \D{%H:%M} strftime format
    R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
    C(r'\[', Id.PS_LBrace),  # non-printing
    C(r'\]', Id.PS_RBrace),
    R(r'[^\\\0]+', Id.PS_Literals),
    # e.g. \x is not a valid escape.
    C('\\', Id.PS_BadBackslash),
]

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of this mode is that it supports other backslash escapes like \n! It
# just becomes a regular backslash.
LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
    # Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
    # with no leading 0.
    R(OCTAL3_RE, Id.Char_Octal3),

    # ' and " are escaped in $'' mode, but not echo -e.
    C(r"\'", Id.Char_OneChar),
    C(r'\"', Id.Char_OneChar),

    # e.g. 'foo', anything that's not a backslash escape or '
    R(r"[^\\'\0]+", Id.Char_Literals),
    C("'", Id.Right_SingleQuote),

    # Backslash that ends the file! Caught by re2c exhaustiveness check. Parser
    # will assert; should give a better syntax error.
    C('\\\0', Id.Unknown_Tok),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
    R(OCTAL3_RE, Id.Char_Octal3),
    R(r"[^%\\\0]+", Id.Char_Literals),
    C('%%', Id.Format_EscapedPercent),
    C('%', Id.Format_Percent),
]

# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq. The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    R('\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]

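# A hedged example of how a format string might tokenize with the two modes
# above (illustrative only): for  printf '%-8.3f\n'  the spec lexes roughly as
#
#   <Format_Percent '%'> <Format_Flag '-'> <Format_Num '8'> <Format_Dot '.'>
#   <Format_Num '3'> <Format_Type 'f'>
#
# and the trailing \n is handled back in PrintfOuter via _C_STRING_COMMON.
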
LEXER_DEF[lex_mode_e.VSub_1] = [
    R(VAR_NAME_RE, Id.VSub_Name),
    # ${11} is valid, compared to $11 which is $1 and then literal 1.
    R(r'[0-9]+', Id.VSub_Number),
    C('!', Id.VSub_Bang),
    C('@', Id.VSub_At),
    C('#', Id.VSub_Pound),
    C('$', Id.VSub_Dollar),
    C('*', Id.VSub_Star),
    C('-', Id.VSub_Hyphen),
    C('?', Id.VSub_QMark),
    C('.', Id.VSub_Dot),  # ${.myproc builtin sub}
    C('}', Id.Right_DollarBrace),
    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VSub_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp0) + \
    ID_SPEC.LexerPairs(Kind.VOpOil) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + \
    ID_SPEC.LexerPairs(Kind.VOp3) + [
    C('}', Id.Right_DollarBrace),

    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

_EXPR_ARITH_SHARED = [
    C('\\\n', Id.Ignored_LineCont),
    R(r'[^\0]', Id.Unknown_Tok)  # any char. This should be a syntax error.
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.Arith] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [

    # Arithmetic expressions can cross newlines.
    R(r'[ \t\r\n]+', Id.Ignored_Space),

    # Examples of arith constants:
    #   64#azAZ
    #   0xabc 0xABC
    #   0123
    # A separate digits token makes this easier to parse STATICALLY. But this
    # doesn't help with DYNAMIC parsing.
    R(VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
    R(r'[0-9]+', Id.Lit_Digits),
    C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
    C('#', Id.Lit_Pound),  # for 64#a

    # TODO: 64#@ interferes with VS_AT. Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED

# A lexer for the parser that converts globs to extended regexes. Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
GLOB_DEF = [
    # These could be operators in the glob, or just literals in a char class,
    # e.g. touch '?'; echo [?].
    C('*', Id.Glob_Star),
    C('?', Id.Glob_QMark),

    # For negation. Treated as operators inside [], but literals outside.
    C('!', Id.Glob_Bang),
    C('^', Id.Glob_Caret),

    # Character classes.
    C('[', Id.Glob_LBracket),
    C(']', Id.Glob_RBracket),

    # There is no whitelist of characters; backslashes are unconditionally
    # removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
    # See libc_test.py.
    R(r'\\[^\0]', Id.Glob_EscapedChar),
    C('\\', Id.Glob_BadBackslash),  # Trailing single backslash

    # For efficiency, combine other characters into a single token, e.g. 'py' in
    # '*.py' or 'alpha' in '[[:alpha:]]'.
    R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals),  # no regex escaping
    R(r'[^\0]', Id.Glob_OtherLiteral),  # anything else -- examine the char
]

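# A hedged example of GLOB_DEF at work (illustrative only): the pattern
# '*.[ch]' would lex as
#
#   <Glob_Star '*'> <Glob_OtherLiteral '.'> <Glob_LBracket '['>
#   <Glob_CleanLiterals 'ch'> <Glob_RBracket ']'>
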
# History expansion. We're doing this as "pre-lexing" since that's what bash
# and zsh seem to do. Example:
#
#   $ foo=x
#   $ echo $
#   $ !!foo  # expands to echo $foo and prints x
#
# We can also reuse this in the RootCompleter to expand history interactively.
#
# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
# quotes AGAIN.
#
# Note: \! gets expanded to literal \! for the real lexer, but no history
# expansion occurs.

HISTORY_DEF = [
    # Common operators.
    R(r'![!*^$]', Id.History_Op),

    # By command number.
    R(r'!-?[0-9]+', Id.History_Num),

    # Search by prefix of substring (optional '?').
    # NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
    # No hyphen since it conflicts with $-1 too.
    #
    # Required trailing whitespace is there to avoid conflict with [!charclass]
    # and ${!indirect}. This is a simpler hack than the one bash has. See
    # frontend/lex_test.py.
    R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),

    # Comment is until end of line
    R(r"#[^\0]*", Id.History_Other),

    # Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
    # end of string.
    R(r"'[^'\0]*'?", Id.History_Other),

    # Runs of chars that are definitely not special
    R(r"[^!\\'#\0]+", Id.History_Other),

    # Escaped characters. \! disables history
    R(r'\\[^\0]', Id.History_Other),
    # Other single chars, like a trailing \ or !
    R(r'[^\0]', Id.History_Other),
]

BRACE_RANGE_DEF = [
    R(r'-?[0-9]+', Id.Range_Int),
    R(r'[a-zA-Z]', Id.Range_Char),  # just a single character
    R(r'\.\.', Id.Range_Dots),
    R(r'[^\0]', Id.Range_Other),  # invalid
]

#
# YSH lexing
#

# Valid in lex_mode_e.{Expr,DQ}
# Used by ysh/grammar_gen.py
YSH_LEFT_SUBS = [
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$[', Id.Left_DollarBracket),  # TODO: Implement $[x]
]

# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py

YSH_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays.
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #   name:Str  age:Int
    #   'andy c'  10
    # }
    # Significant newlines. No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]

# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # NOTE: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]

# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    R(r'[ \t\r]+', Id.Ignored_Space),
]

_WHITESPACE = r'[ \t\r\n]*'  # not including legacy \f \v

# Python allows 0 to be written 00 or 0_0_0, which is weird. But let's be
# consistent, and avoid '00' turning into a float!
_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Used for YSH comparison operators > >= < <=
LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE

_FLOAT_RE = (
    _DECIMAL_INT_RE +
    # Unlike Python, exponent can't be like 42e5_000. There's no use because
    # 1e309 is already inf. Let's keep our code simple.
    r'(\.' + _DECIMAL_INT_RE + ')?([eE][+\-]?[0-9]+)?')

# Ditto, used for comparison operators
# Added optional -?
# Example: -3_000_000.000_001e12
LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _FLOAT_RE + _WHITESPACE

# Python 3 float literals:

# digitpart     ::= digit (["_"] digit)*
# fraction      ::= "." digitpart
# exponent      ::= ("e" | "E") ["+" | "-"] digitpart
# pointfloat    ::= [digitpart] fraction | digitpart "."
# exponentfloat ::= (digitpart | pointfloat) exponent
# floatnumber   ::= pointfloat | exponentfloat

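# Hedged examples for LOOKS_LIKE_INTEGER / LOOKS_LIKE_FLOAT (illustrative
# only); both allow surrounding whitespace and an optional leading '-':
#
#   '  42  '                 # looks like an integer (and a float)
#   '-3_000_000.000_001e12'  # looks like a float
#   '1_000_'                 # does NOT: trailing underscore isn't allowed
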
# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::= decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::= nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::= "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::= "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::= "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::= "1"..."9"
    # digit        ::= "0"..."9"
    # bindigit     ::= "0" | "1"
    # octdigit     ::= "0"..."7"
    # hexdigit     ::= digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they
    # have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    #   ++ --    -- needed for loops, awk?
    #   ! && ||  -- needed for find dialect
    #   = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    C('^=', Id.Arith_CaretEqual),  # xor

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Expr_DDot),  # range 1..5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written as /d+ >/)

    # Reserved this. Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffice.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
                               # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED

LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]