# Data types for the Oils AST, aka "Lossless Syntax Tree".
#
# Invariant: the source text can be reconstructed byte-for-byte from this
# tree.  The test/arena.sh file partially verifies this.
#
# Exceptions:
# - <<- here docs with leading tabs, since we don't want those for
#   conversion.  We don't want files with mixed tabs and spaces.
# - Distinguishing between function styles wasn't necessary:
#   - foo() { } vs function foo { }    # ksh style

# We usually try to preserve the physical order of the source in the ASDL
# fields.  One exception is the order of redirects:
#
#   echo >out.txt hi
#   # versus
#   echo hi >out.txt

# Unrepresented:
# - let arithmetic (rarely used)
# - coprocesses  # one with arg and one without
# - select block
# - case fallthrough ;& and ;;&

# Possible refactorings:
#
#   printf_part = Literal %Token | ...
#
#   # %CompoundWord as first class variant:
#   bool_expr = WordTest %CompoundWord | ...
#
#   # Can DoubleQuoted have a subset of parts compared with CompoundWord?
#   string_part = ...  # subset of word_part
#
# - Distinguish word_t with BracedTree vs. those without?  seq_word_t?
# - Remove command.NoOp?

module syntax
{
  # More efficient than the List[bool] pattern we've been using
  BoolParamBox = (bool b)
  IntParamBox = (int i)
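  # Informal note (assumption, not stated here): the box is mutated in place
  # by a callee as an "out param", instead of mutating a one-element List[bool].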

  # core/main_loop.py
  parse_result = EmptyLine | Eof | Node(command cmd)

  # 'source' represents the location of a line / token.
  source =
    Interactive
  | Headless
  | Unused(str comment)  # completion and history never show parse errors?
  | CFlag
  | Stdin(str comment)

    # TODO: if it's not the main script, it's sourced, and you could provide
    # a chain of locations back to the sourced script!
    # MainFile(str path) or SourcedFile(str path, loc location)
  | MainFile(str path)
  | SourcedFile(str path, loc location)

    # code parsed from a word
    # used for 'eval', 'trap', 'printf', 'complete -W', etc.
  | ArgvWord(str what, loc location)

    # code parsed from the value of a variable
    # used for $PS1 $PROMPT_COMMAND
  | Variable(str var_name, loc location)

    # Point to the original variable reference
  | VarRef(Token orig_tok)

    # alias expansion (location of first word)
  | Alias(str argv0, loc argv0_loc)

    # 2 kinds of reparsing: backticks, and x+1 in a[x+1]=y
    # TODO: use this for eval_unsafe_arith instead of Variable
  | Reparsed(str what, Token left_token, Token right_token)

    # For --location-str
  | Synthetic(str s)

  SourceLine = (int line_num, str content, source src)

  # Two ways to make Token smaller:
  # - remove the .tval field.  If necessary, the string value could be manually
  #   computed and attached to specific LST nodes.
  #   - or is it easier to compute on demand?
  # - get rid of span_id, and re-compute length on demand too
  #   - span_id is only used by tools/ysh_ify.py
  #   - but length is used when computing .tval
  Token = (id id, int col, int length, int span_id, SourceLine? line, str tval)
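  # Informal sketch (not part of the schema): given the byte-for-byte
  # invariant, the value could be recomputed on demand as
  #   line.content[col : col + length]
  # which is what makes dropping .tval plausible.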

  # Slight ASDL bug: CompoundWord has to be defined before using it as a shared
  # variant.  The _product_counter algorithm should be moved into a separate
  # tag-assigning pass, and shared between gen_python.py and gen_cpp.py.
  CompoundWord = (List[word_part] parts)

  # Source location for errors
  loc =
    Missing  # equivalent of runtime.NO_SPID
  | Token %Token
    # Very common case: argv arrays need original location
  | ArgWord %CompoundWord
  | WordPart(word_part p)
  | Word(word w)
  | Arith(arith_expr a)
    # e.g. for errexit blaming
  | Command(command c)

  debug_frame =
    Main(str dollar0)
    # call_tok => BASH_LINENO
    # call_tok may be None with the new --source flag?
  | Source(Token? call_tok, str source_name)
    # def_tok => BASH_SOURCE
    # call_tok may be None if invoked via RunFuncForCompletion?
  | Call(Token? call_tok, Token def_tok, str func_name)

  #
  # Shell language
  #

  bracket_op =
    WholeArray(id op_id)  # * or @
  | ArrayIndex(arith_expr expr)

  suffix_op =
    Nullary %Token  # ${x@Q} or ${!prefix@} (which also has prefix_op)
  | Unary(Token op, rhs_word arg_word)  # e.g. ${v:-default}
    # TODO: Implement YSH ${x|html} and ${x %.3f}
  | Static(Token tok, str arg)
  | PatSub(CompoundWord pat, rhs_word replace, id replace_mode, Token slash_tok)
    # begin is optional with ${array::1}
  | Slice(arith_expr? begin, arith_expr? length)
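  # Informal examples (illustration only, not part of the grammar):
  #   ${s/pat/repl}  ->  PatSub with pat 'pat' and replace 'repl'
  #   ${a:1:2}       ->  Slice with begin 1 and length 2
  #   ${a::1}        ->  Slice with no begin and length 1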

  BracedVarSub = (
      Token left,        # in dynamic ParseVarRef, same as name_tok
      Token token,       # location for the name
      str var_name,      # the name
      Token? prefix_op,  # prefix # or ! operators
      bracket_op? bracket_op,
      suffix_op? suffix_op,
      Token right        # in dynamic ParseVarRef, same as name_tok
  )
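  # Informal examples (illustration only):
  #   ${x:-default}  ->  var_name 'x', suffix_op Unary(':-', 'default')
  #   ${#x}          ->  var_name 'x', prefix_op '#'
  #   ${a[i+1]}      ->  var_name 'a', bracket_op ArrayIndex(i+1)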

  # Variants:
  # - Look at left token ID for $'' c'' vs r'' '' e.g. Id.Left_DollarSingleQuote
  # - And """ and ''' e.g. Id.Left_TDoubleQuote
  DoubleQuoted = (Token left, List[word_part] parts, Token right)
  SingleQuoted = (Token left, List[Token] tokens, Token right)
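  # Informal example (illustration only): "hi $name" would be a DoubleQuoted
  # whose parts are a Literal token for 'hi ' and a SimpleVarSub for $name.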

  SimpleVarSub = (Token left, str var_name)

  CommandSub = (Token left_token, command child, Token right)

  # - can contain word.BracedTree
  # - no 'Token right' for now, doesn't appear to be used
  ShArrayLiteral = (Token left, List[word] words, Token right)

  # Unevaluated, typed arguments for func and proc.
  # Note that ...arg is expr.Spread.
  ArgList = (
      Token left, List[expr] pos_args,
      Token? semi_tok, List[NamedArg] named_args,
      Token right
  )

  AssocPair = (CompoundWord key, CompoundWord value)

  word_part =
    ShArrayLiteral %ShArrayLiteral
  | BashAssocLiteral(Token left, List[AssocPair] pairs, Token right)
  | Literal %Token
    # escaped case is separate so the evaluator doesn't have to check token ID
  | EscapedLiteral(Token token, str ch)
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
    # For command sub and process sub: $(...) <(...) >(...)
  | CommandSub %CommandSub
    # ~ or ~bob
  | TildeSub(Token token, str? user_name)
  | ArithSub(Token left, arith_expr anode, Token right)
    # {a,b,c}
  | BracedTuple(List[CompoundWord] words)
    # {1..10} or {-5..10..2} or {01..10} (leading zeros matter)
    # {a..f} or {a..f..2} or {a..f..-2}
    # the whole range is one Token
  | BracedRange(Token blame_tok, id kind, str start, str end, int step)
    # note: optional int may need special handling in ASDL
    # extended globs are parsed statically, unlike globs
  | ExtGlob(Token op, List[CompoundWord] arms, Token right)

  # YSH word_part extensions

    # @myarray
  | Splice(Token blame_tok, str var_name)
    # $[d.key], etc.
  | ExprSub(Token left, expr child, Token right)

  # Use cases for Empty: the RHS of 'x=', and the argument in "${x:-}".
  # The latter is semantically necessary.  (See osh/word_parse.py.)
  # At runtime: the RHS of 'declare x='.
  rhs_word = Empty | Compound %CompoundWord

  word =
    # Returned from WordParser, but not generally stored in the LST
    Operator %Token
    # A Compound word can contain any word_part except the Braced*Part.
    # We could model this with another variant type, but it incurs runtime
    # overhead and seems like overkill.  Note that DoubleQuoted can't
    # contain a SingleQuoted, etc. either.
  | Compound %CompoundWord
    # For word sequences in command.Simple, ShArrayLiteral, and for_iter.Words.
    # Could be its own type.
  | BracedTree(List[word_part] parts)
    # For dynamic parsing of 'test' aka '[' -- the string is already evaluated.
  | String(id id, str s, CompoundWord? blame_loc)

  # Note: the name 'foo' is derived from the token value 'foo=' or 'foo+='
  sh_lhs =
    Name(Token left, str name)
  | IndexedName(Token left, str name, arith_expr index)
  | UnparsedIndex(Token left, str name, str index)  # for translation

  arith_expr =
    VarSub %SimpleVarSub  # e.g. $(( x ))
  | Word %CompoundWord    # e.g. $(( 123'456'$y ))

  | UnaryAssign(id op_id, arith_expr child)
  | BinaryAssign(id op_id, arith_expr left, arith_expr right)

  | Unary(id op_id, arith_expr child)
    # TODO: op should be a Token, e.g. for divide-by-zero errors
  | Binary(id op_id, arith_expr left, arith_expr right)
  | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)

  bool_expr =
    WordTest(word w)  # e.g. [[ myword ]]
  | Binary(id op_id, word left, word right)
  | Unary(id op_id, word child)
  | LogicalNot(bool_expr child)
  | LogicalAnd(bool_expr left, bool_expr right)
  | LogicalOr(bool_expr left, bool_expr right)

  redir_loc =
    Fd(int fd) | VarName(str name)

  redir_param =
    Word %CompoundWord
  | HereDoc(word here_begin,          # e.g. EOF or 'EOF'
            Token? here_end_tok,      # Token consisting of the whole line;
                                      # always filled in AFTER creation, but
                                      # optional since it's None until then
            List[word_part] stdin_parts  # one for each line
    )

  Redir = (Token op, redir_loc loc, redir_param arg)

  assign_op = Equal | PlusEqual
  AssignPair = (Token left, sh_lhs lhs, assign_op op, rhs_word rhs)
  EnvPair = (Token left, str name, rhs_word val)
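  # Informal examples (illustration only):
  #   x+=1      ->  AssignPair with lhs Name('x'), op PlusEqual, rhs '1'
  #   FOO=bar   ->  an EnvPair when it's a prefix binding, as in 'FOO=bar cmd'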

  condition =
    Shell(List[command] commands)  # if false; true; then echo hi; fi
  | YshExpr(expr e)                # if (x > 0) { echo hi }
    # TODO: add more specific blame location

  # Each arm tests one word against multiple words
  #   shell: *.cc|*.h) echo C++ ;;
  #   YSH:   *.cc|*.h { echo C++ }
  #
  # Three location tokens:
  #   1. left   - shell has ( or *.cc;   YSH has *.cc
  #   2. middle - shell has );           YSH has {
  #   3. right  - shell has optional ;;  YSH has required }
  #
  # For YSH typed case, left can be ( and /
  # And case_pat may contain more details
  CaseArm = (
      Token left, pat pattern, Token middle, List[command] action,
      Token? right
  )

  # The argument to match against in a case command.
  # In YSH-style case commands we match against an `expr`, but in sh-style
  # case commands we match against a word.
  case_arg =
    Word(word w)
  | YshExpr(expr e)

  EggexFlag = (bool negated, Token flag)

  # canonical_flags can be compared for equality.  This is needed to splice
  # eggexes correctly, e.g. / 'abc' @pat ; i /
  Eggex = (
      Token left, re regex, List[EggexFlag] flags, Token? trans_pref,
      str? canonical_flags
  )

  pat =
    Else
  | Words(List[word] words)
  | YshExprs(List[expr] exprs)
  | Eggex %Eggex

  # Each if arm starts with either an "if" or "elif" keyword.
  # In YSH, the then keyword is not used (replaced by braces {}).
  IfArm = (
      Token keyword, condition cond, Token? then_kw, List[command] action,
      List[int] spids
  )

  for_iter =
    Args                       # for x; do echo $x; done    # implicit "$@"
  | Words(List[word] words)    # for x in 'foo' *.py { echo $x }
    # like ShArrayLiteral, but no location for %(
  | YshExpr(expr e, Token blame)  # for x in (mylist) { echo $x }

  BraceGroup = (
      Token left, Token? doc_token, List[command] children,
      List[Redir] redirects, Token right
  )

  Param = (Token blame_tok, str name, TypeExpr? type, expr? default_val)
  RestParam = (Token blame_tok, str name)

  ParamGroup = (List[Param] params, RestParam? rest_of)

  # 'open' is for proc p { }; closed is for proc p () { }
  proc_sig =
    Open
  | Closed(ParamGroup? word, ParamGroup? positional, ParamGroup? named,
           Param? block_param)
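  # Informal example (illustration only, assuming YSH's ';'-separated
  # signature sections):
  #   proc p (w1, w2; pos1; named1; block) { ... }
  # would fill in the word, positional, named, and block_param groups.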

  Proc = (Token keyword, Token name, proc_sig sig, command body)

  Func = (
      Token keyword, Token name,
      ParamGroup? positional, ParamGroup? named,
      command body
  )

  # Retains references to lines
  LiteralBlock = (BraceGroup brace_group, List[SourceLine] lines)

  # Represents all these cases: s=1  s+=1  s[x]=1  ...
  ParsedAssignment = (Token? left, Token? close, int part_offset, CompoundWord w)

  command =
    NoOp
  | Simple(Token? blame_tok,  # TODO: make required (BracedTuple?)
           List[EnvPair] more_env,
           List[word] words, List[Redir] redirects,
           ArgList? typed_args, LiteralBlock? block,
           # do_fork is semantic, not syntactic
           bool do_fork)
    # This doesn't technically belong in the LST, but it's convenient for
    # execution
  | ExpandedAlias(command child, List[Redir] redirects, List[EnvPair] more_env)
  | Sentence(command child, Token terminator)
    # Represents "bare assignment"
    # Token left is redundant with pairs[0].left
  | ShAssignment(Token left, List[AssignPair] pairs, List[Redir] redirects)
  | Retval(Token keyword, expr val)
  | ControlFlow(Token keyword, word? arg_word)
    # ops are | and |&
  | Pipeline(Token? negated, List[command] children, List[Token] ops)
    # ops are && and ||
  | AndOr(List[command] children, List[Token] ops)
    # Part of for, while, until (but not if, case, ShFunction).  No redirects.
  | DoGroup(Token left, List[command] children, Token right)
    # A brace group is a compound command, with redirects.
  | BraceGroup %BraceGroup
    # Contains a single child, like CommandSub
  | Subshell(Token left, command child, Token right, List[Redir] redirects)
  | DParen(Token left, arith_expr child, Token right, List[Redir] redirects)
  | DBracket(Token left, bool_expr expr, Token right, List[Redir] redirects)
    # up to 3 iteration variables
  | ForEach(Token keyword, List[str] iter_names, for_iter iterable,
            Token? semi_tok, command body, List[Redir] redirects)
    # C-style for loop.  Any of the 3 expressions can be omitted.
    # Note: body is required, but only optional here because of initialization
    # order.
  | ForExpr(Token keyword, arith_expr? init, arith_expr? cond,
            arith_expr? update, command? body, List[Redir] redirects)
  | WhileUntil(Token keyword, condition cond, command body, List[Redir] redirects)
  | If(Token if_kw, List[IfArm] arms, Token? else_kw, List[command] else_action,
       Token? fi_kw, List[Redir] redirects)
  | Case(Token case_kw, case_arg to_match, Token arms_start, List[CaseArm] arms,
         Token arms_end, List[Redir] redirects)
    # The keyword is optional in the case of bash-style functions
    # (i.e. "foo() { ... }"), which do not have one.
  | ShFunction(Token? keyword, Token name_tok, str name, command body)
  | TimeBlock(Token keyword, command pipeline)
    # Some nodes optimize it out as List[command], but we use CommandList for:
    #   1. the top level
    #   2. ls ; ls & ls   (same line)
    #   3. CommandSub     # single child that's a CommandList
    #   4. Subshell       # single child that's a CommandList
  | CommandList(List[command] children)

  # YSH command constructs

    # var, const.
    # - Keyword is None for hay blocks
    # - RHS is None, for use with value.Place
    # - TODO: consider using BareDecl
  | VarDecl(Token? keyword, List[NameType] lhs, expr? rhs)

    # this can behave like 'var', can be desugared
  | BareDecl(Token lhs, expr rhs)

    # setvar, maybe 'auto' later
  | Mutation(Token keyword, List[y_lhs] lhs, Token op, expr rhs)
    # = keyword
  | Expr(Token keyword, expr e)
  | Proc %Proc
  | Func %Func

  #
  # Glob representation, for converting ${x//} to extended regexes.
  #

  # Example: *.[ch] is:
  #   GlobOp(<Glob_Star '*'>),
  #   GlobLit(Glob_OtherLiteral, '.'),
  #   CharClass(False, ['ch'])  # from Glob_CleanLiterals token

  glob_part =
    Literal(id id, str s)
  | Operator(id op_id)  # * or ?
  | CharClass(bool negated, List[str] strs)

  # Char classes are opaque for now.  If we ever need them:
  # - Collating symbols are [. .]
  # - Equivalence classes are [=

  printf_part =
    Literal(Token token)
    # flags are 0 hyphen space + #
    # type is 's' for %s, etc.
  | Percent(List[Token] flags, Token? width, Token? precision, Token type)
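  # Informal example (illustration only): the format spec %-10.3s would be a
  # Percent with flags ['-'], width 10, precision 3, and type 's'.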

  #
  # YSH Language
  #
  # Copied and modified from Python-3.7/Parser/Python.asdl !

  expr_context = Load | Store | Del | AugLoad | AugStore | Param

  # Type expressions: Int  List[Int]  Dict[Str, Any]
  # Do we have Func[Int, Int => Int]?  I guess we can parse that into this
  # system.
  TypeExpr = (Token tok, str name, List[TypeExpr] params)
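  # Informal example (illustration only):
  #   Dict[Str, Any]  ->  name 'Dict', params [TypeExpr 'Str', TypeExpr 'Any']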

  # LHS bindings in var/const, and eggex
  # TODO: need str name, Token blame_tok
  NameType = (Token name, TypeExpr? typ)

  # TODO: Inline this into GenExp and ListComp?  Just use a flag there?
  Comprehension = (List[NameType] lhs, expr iter, expr? cond)

  # Named arguments supplied to call.  Token is null for f(; ...named).
  NamedArg = (Token? name, expr value)

  # Subscripts are lists of expressions
  #   a[:i, n]   (we don't have matrices, but we have data frames)
  Subscript = (Token left, expr obj, expr index)

  # Attributes are obj.attr, d->key, name::scope,
  Attribute = (expr obj, Token op, Token attr, expr_context ctx)

  y_lhs =
    Var(Token name)  # TODO: add str var_name
  | Subscript %Subscript
  | Attribute %Attribute

  place_op =
    # &a[i+1]
    Subscript(Token op, expr index)
    # &d.mykey
  | Attribute(Token op, Token attr)

  expr =
    # a variable name to evaluate
    Var(Token name)  # TODO: add str var_name
    # For null, Bool, Int, Float
    # Python uses Num(object n), which doesn't respect our "LST" invariant.
  | Const(Token c)

    # read(&x)   json read (&x[0])
  | Place(Token blame_tok, str var_name, place_op* ops)

    # :| one 'two' "$three" |
  | ShArrayLiteral %ShArrayLiteral

    # / d+ ; ignorecase; %python /
  | Eggex %Eggex

  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | CommandSub %CommandSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted

  | Literal(expr inner)
  | Lambda(List[NameType] params, expr body)

  | Unary(Token op, expr child)
  | Binary(Token op, expr left, expr right)
    # x < 4 < 3   and   (x < 4) < 3
  | Compare(expr left, List[Token] ops, List[expr] comparators)
  | FuncCall(expr func, ArgList args)

    # TODO: Need a representation for method calls.  We don't just want
    # Attribute() and then Call().

  | IfExp(expr test, expr body, expr orelse)
  | Tuple(Token left, List[expr] elts, expr_context ctx)

  | List(Token left, List[expr] elts, expr_context ctx)
  | Dict(Token left, List[expr] keys, List[expr] values)
    # For the values in {n1, n2}
  | Implicit

  | ListComp(Token left, expr elt, List[Comprehension] generators)
    # not implemented
  | DictComp(Token left, expr key, expr value, List[Comprehension] generators)
  | GeneratorExp(expr elt, List[Comprehension] generators)

    # Ranges are written 1:2, with first-class expression syntax.  There is no
    # step as in Python.  Use range(0, 10, step=2) for that.
  | Range(expr lower, Token op, expr upper)

    # Slices occur within [] only.  Unlike ranges, the start/end can be
    # implicit.  Like ranges, denote a step with slice(0, 10, step=2).
    #   a[3:]   a[:i]
  | Slice(expr? lower, Token op, expr? upper)

  | Subscript %Subscript
  | Attribute %Attribute

    # Ellipsis is like 'Starred' in Python, which is valid on the LHS in
    # Python for unpacking, and within list literals for splicing.
    # (Starred is NOT used for {k:v, **a}.  That uses a blank "keys"
    # attribute.)

    # I think we can use { **pairs } like Python
  | Spread(Token left, expr child)

  #
  # Regex Language (Eggex)
  #

  # e.g. alnum digit
  PosixClass = (Token? negated, str name)
  # e.g. d w s
  PerlClass = (Token? negated, str name)

  # Note: .NET has && in character classes, making it a recursive language

  class_literal_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
    # [a-z] ~[a-z]   TODO: doesn't respect the LST invariant

  | Range(Token start, Token end)
  | CharLiteral(Token tok)

  | SingleQuoted %SingleQuoted
    # @chars
  | Splice(Token name, str var_name)

  # Char Sets and Ranges both use Char Codes
  # with u_braced == true : \u{ff}
  # with u_braced == false: \xff \\ 'a' a '0' 0
  # ERE doesn't make a distinction, but compiling to Python/PCRE can use it
  CharCode = (int i, bool u_braced, Token blame_tok)
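  # e.g. \u{ff} would be CharCode(i = 255, u_braced = true)   (illustration)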

  # evaluated version of class_literal_term (could be in runtime.asdl)
  char_class_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass

  | Range(CharCode start, CharCode end)

    # For [ \x00 \\ ]
  | CharCode %CharCode

  # NOTE: modifier is unused now, can represent L or P
  re_repeat =
    Op(Token op)
  | Num(Token times)
    # dot{1,2}
  | Range(Token? lower, Token? upper)
    # Haven't implemented the modifier, e.g. x{+ P}
    # | Num(Token times, id modifier)
    # | Range(Token? lower, Token? upper, id modifier)

  re =
    # e.g. . ^ $ %begin \u123
    Token %Token
  | PosixClass %PosixClass
  | PerlClass %PerlClass
    # syntax    [ $x \n ]
  | CharClassLiteral(bool negated, List[class_literal_term] terms)
    # evaluated [ 'abc' \n ]
  | CharClass(bool negated, List[char_class_term] terms)

    # @D
  | Splice(Token name, str var_name)

  | SingleQuoted %SingleQuoted

    # Compound:
  | Repeat(re child, re_repeat op)
  | Seq(List[re] children)
  | Alt(List[re] children)

  | Group(re child)
    # convert_func is filled in on evaluation
    # TODO: name and func_name can be expanded to strings
  | Capture(re child, Token? name, Token? func_name)
  | Backtracking(bool negated, Token name, re child)

    # These nodes are never parsed; they're part of execution.
    # Right now we do them in _EvalRegex, though many transformations could be
    # done as constant evaluation.

  | Primitive(id id)  # . dot etc.
    # String substitutions are evaluated into literals
  | LiteralChars(str s, Token blame_tok)
}