1 #!/usr/bin/env python2
2 """Builtin_printf.py."""
3 from __future__ import print_function
4
5 import time as time_ # avoid name conflict
6
7 from _devbuild.gen import arg_types
8 from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t, Kind_t
9 from _devbuild.gen.runtime_asdl import cmd_value, value, value_e
10 from _devbuild.gen.syntax_asdl import (
11 loc,
12 loc_e,
13 loc_t,
14 source,
15 Token,
16 CompoundWord,
17 printf_part,
18 printf_part_e,
19 printf_part_t,
20 )
21 from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22
23 from core import alloc
24 from core import error
25 from core.error import e_die, p_die
26 from core import state
27 from core import vm
28 from frontend import flag_spec
29 from frontend import consts
30 from frontend import lexer
31 from frontend import match
32 from frontend import reader
33 from mycpp import mylib
34 from mycpp.mylib import log
35 from osh import sh_expr_eval
36 from osh import word_compile
37 from data_lang import qsn
38
39 import posix_ as posix
40
41 from typing import Dict, List, TYPE_CHECKING, cast
42
43 if TYPE_CHECKING:
44 from core import ui
45 from core.state import Mem
46 from frontend import parse_lib
47
48 _ = log
49
50
class _FormatStringParser(object):
    """Parser that turns a printf format string into printf_part_t nodes.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    Maybe: bash also supports %(strftime)T
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # Placeholders; real values are filled in by the first _Next() call
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in the given lexer mode; cache its id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse the 'fmt' production, starting at the % token."""
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while (self.token_type == Id.Format_Flag or
               self.token_type == Id.Format_Zero):
            # allocation will be cached
            flag_str = lexer.TokenVal(self.cur_token)
            # space and + could be implemented
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      self.cur_token)

            percent.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if (self.token_type == Id.Format_Num or
                self.token_type == Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?
        if self.token_type == Id.Format_Dot:
            # The dot token itself is stored as the precision, and is replaced
            # below when a number, star or zero follows it.
            percent.precision = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if (self.token_type == Id.Format_Num or
                    self.token_type == Id.Format_Star or
                    self.token_type == Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)

        # (Type | Time), or a parse error
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif (self.token_type == Id.Format_Type or
              self.token_type == Id.Format_Time):
            percent.type = self.cur_token

            # ADDITIONAL VALIDATION outside the "grammar".
            # allocation will be cached
            conversion = lexer.TokenVal(percent.type)
            if conversion in 'eEfFgG':
                p_die("osh printf doesn't support floating point",
                      percent.type)
            # These two could be implemented.  %c needs utf-8 decoding.
            if conversion == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      percent.type)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string and return its parts in order."""
        result = []  # type: List[printf_part_t]

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            tok_id = self.token_type

            if (self.token_kind == Kind.Char or
                    tok_id == Id.Format_EscapedPercent or
                    tok_id == Id.Unknown_Backslash):
                # Note: like in echo -e, we don't fail with Unknown_Backslash
                # here when shopt -u parse_backslash because it's at runtime
                # rather than parse time.
                # Users should use $'' or the future static printf ${x %.3f}.
                result.append(printf_part.Literal(self.cur_token))

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id == Id.Eof_Real or tok_id == Id.Eol_Tok:
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(tok_id)

            self._Next(lex_mode_e.PrintfOuter)

        return result
157
158
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]

    Format strings are parsed once with _FormatStringParser and the parsed
    parts are cached by format string in self.parse_cache.
    """

    def __init__(self, mem, parse_ctx, unsafe_arith, errfmt):
        # type: (Mem, parse_lib.ParseContext, sh_expr_eval.UnsafeArith, ui.ErrorFormatter) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # format string -> parsed parts
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # Used for the %(...)T argument -2.
        self.shell_start_time = time_.time(
        )  # this object initialized in main()

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Args:
          parts: parsed format string
          varargs: the arguments after the format string
          locs: locations parallel to varargs, used to blame errors
          out: list that formatted string pieces are appended to

        Returns:
          0 on success, or 1 after printing an error for an invalid width,
          precision, or integer argument.

        The format is re-applied ("recycled") as long as arguments remain.
        A \\c in a %b argument stops all further output.
        """
        arg_index = 0
        num_args = len(varargs)
        backslash_c = False

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(printf_part.Literal, UP_part)
                    token = part.token
                    if token.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(token)
                    out.append(s)

                elif part.tag() == printf_part_e.Percent:
                    # Note: This case is very long, but hard to refactor
                    # because of the error cases and "recycling" of args!
                    # (arg_index, return 1, etc.)
                    part = cast(printf_part.Percent, UP_part)

                    # TODO: These calculations are independent of the data, so
                    # could be cached
                    flags = []  # type: List[str]
                    for flag_token in part.flags:
                        flags.append(lexer.TokenVal(flag_token))

                    width = -1  # nonexistent
                    if part.width:
                        if part.width.id in (Id.Format_Num, Id.Format_Zero):
                            width_str = lexer.TokenVal(part.width)
                            width_loc = part.width  # type: loc_t
                        elif part.width.id == Id.Format_Star:
                            # '*' takes the width from the next argument
                            if arg_index < num_args:
                                width_str = varargs[arg_index]
                                width_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                width_str = ''  # invalid
                                width_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            width = int(width_str)
                        except ValueError:
                            # No argument to blame, so blame the '*' token
                            if width_loc.tag() == loc_e.Missing:
                                width_loc = part.width
                            self.errfmt.Print_("printf got invalid width %r" %
                                               width_str,
                                               blame_loc=width_loc)
                            return 1

                    precision = -1  # nonexistent
                    if part.precision:
                        if part.precision.id == Id.Format_Dot:
                            # A bare '.' means precision 0
                            precision_str = '0'
                            precision_loc = part.precision  # type: loc_t
                        elif part.precision.id in (Id.Format_Num,
                                                   Id.Format_Zero):
                            precision_str = lexer.TokenVal(part.precision)
                            precision_loc = part.precision
                        elif part.precision.id == Id.Format_Star:
                            # '*' takes the precision from the next argument
                            if arg_index < num_args:
                                precision_str = varargs[arg_index]
                                precision_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                precision_str = ''
                                precision_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            precision = int(precision_str)
                        except ValueError:
                            if precision_loc.tag() == loc_e.Missing:
                                precision_loc = part.precision
                            self.errfmt.Print_(
                                'printf got invalid precision %r' %
                                precision_str,
                                blame_loc=precision_loc)
                            return 1

                    # The argument to be formatted, or '' when we ran out
                    if arg_index < num_args:
                        s = varargs[arg_index]
                        word_loc = locs[arg_index]  # type: loc_t
                        arg_index += 1
                        has_arg = True
                    else:
                        s = ''
                        word_loc = loc.Missing
                        has_arg = False

                    # Note: %s could be lexed into Id.Percent_S.  Although
                    # small string optimization would remove the allocation as
                    # well.
                    typ = lexer.TokenVal(part.type)
                    if typ == 's':
                        if precision >= 0:
                            s = s[:precision]  # truncate

                    elif typ == 'q':
                        # TODO: most shells give \' for single quote, while OSH
                        # gives $'\''.  This could matter when SSH'ing
                        s = qsn.maybe_shell_encode(s)

                    elif typ == 'b':
                        # Process just like echo -e, except \c handling is
                        # simpler.
                        c_parts = []  # type: List[str]
                        lex = match.EchoLexer(s)
                        while True:
                            id_, tok_val = lex.Next()
                            if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                                break

                            # Note: DummyToken is OK because EvalCStringToken()
                            # doesn't have any syntax errors.
                            tok = lexer.DummyToken(id_, tok_val)
                            p = word_compile.EvalCStringToken(tok)

                            # Unusual behavior: '\c' aborts processing!
                            if p is None:
                                backslash_c = True
                                break

                            c_parts.append(p)
                        s = ''.join(c_parts)

                    elif part.type.id == Id.Format_Time or typ in 'diouxX':
                        # %(...)T and %d share this complex integer conversion
                        # logic
                        try:
                            # note: spaces like ' -42 ' accepted and normalized
                            d = int(s)

                        except ValueError:
                            # 'a is interpreted as the ASCII value of 'a'
                            if len(s) >= 1 and s[0] in '\'"':
                                # TODO: utf-8 decode s[1:] to be more correct.
                                # Probably depends on issue #366, a utf-8
                                # library.
                                # Note: len(s) == 1 means there is a NUL (0)
                                # after the quote..
                                d = ord(s[1]) if len(s) >= 2 else 0

                            # No argument means -1 for %(...)T as in Bash
                            # Reference Manual 4.2 "If no argument is
                            # specified, conversion behaves as if -1 had been
                            # given."
                            elif not has_arg and part.type.id == Id.Format_Time:
                                d = -1

                            else:
                                if has_arg:
                                    blame_loc = word_loc  # type: loc_t
                                else:
                                    blame_loc = part.type
                                self.errfmt.Print_(
                                    'printf expected an integer, got %r' % s,
                                    blame_loc)
                                return 1

                        if part.type.id == Id.Format_Time:
                            # Initialize timezone:
                            #   `localtime' uses the current timezone
                            #   information initialized by `tzset'.  The
                            #   function `tzset' refers to the environment
                            #   variable `TZ'.  When the exported variable `TZ'
                            #   is present, its value should be reflected in
                            #   the real environment variable `TZ' before call
                            #   of `tzset'.
                            #
                            # Note: unlike LANG, TZ doesn't seem to change
                            # behavior if it's not exported.
                            #
                            # TODO: In YSH, provide an API that doesn't rely on
                            # libc's global state.

                            tzcell = self.mem.GetCell('TZ')
                            if (tzcell and tzcell.exported and
                                    tzcell.val.tag() == value_e.Str):
                                tzval = cast(value.Str, tzcell.val)
                                posix.putenv('TZ', tzval.s)

                            time_.tzset()

                            # Handle special values:
                            #   User can specify two special values -1 and -2
                            #   as in Bash Reference Manual 4.2: "Two special
                            #   argument values may be used: -1 represents the
                            #   current time, and -2 represents the time the
                            #   shell was invoked." from
                            #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                            if d == -1:  # the current time
                                ts = time_.time()
                            elif d == -2:  # the shell start time
                                ts = self.shell_start_time
                            else:
                                ts = d

                            # typ looks like '(FORMAT)T'; strip '(' and ')T'
                            s = time_.strftime(typ[1:-2], time_.localtime(ts))
                            if precision >= 0:
                                s = s[:precision]  # truncate

                        else:  # typ in 'diouxX'
                            # Disallowed because it depends on 32- or 64- bit
                            if d < 0 and typ in 'ouxX':
                                e_die(
                                    "Can't format negative number %d with %%%s"
                                    % (d, typ), part.type)

                            if typ == 'o':
                                s = mylib.octal(d)
                            elif typ == 'x':
                                s = mylib.hex_lower(d)
                            elif typ == 'X':
                                s = mylib.hex_upper(d)
                            else:  # diu
                                s = str(d)  # without spaces like ' -42 '

                            # Split off the sign so that padding decisions are
                            # made on the digits alone.
                            negative = s.startswith('-')
                            if negative:
                                sign = '-'
                                digits = s[1:]
                            else:
                                sign = ''
                                digits = s

                            # There are TWO different ways to ZERO PAD, and
                            # they differ on the negative sign!  See
                            # spec/builtin-printf
                            if width >= 0 and '0' in flags:
                                # Style 1: the width includes the sign.
                                # [%06d] -42 becomes [-00042] (6 TOTAL)
                                n = width - 1 if negative else width
                                s = sign + digits.rjust(n, '0')
                            elif precision > 0 and len(digits) < precision:
                                # Style 2: the precision is a minimum number of
                                # DIGITS, so the sign is not counted.
                                # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                                s = sign + digits.rjust(precision, '0')

                    else:
                        raise AssertionError()

                    if width >= 0:
                        if '-' in flags:
                            s = s.ljust(width, ' ')
                        else:
                            s = s.rjust(width, ' ')

                    out.append(s)

                else:
                    raise AssertionError()

                if backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if backslash_c:
                # \c stops ALL output, so don't recycle the format for any
                # remaining args either.
                break

            if arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if arg_index >= num_args:
                # We printed all args
                break
            # There are more arg: Implement the 'arg recycling' behavior.

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns 0 on success, 2 if the format string fails to parse, or the
        non-zero status from _Format().  The result is written to stdout, or
        assigned to the variable named by -v.
        """
        attrs, arg_r = flag_spec.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Not named 'lexer', to avoid shadowing the imported module
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed formats are cached
            self.parse_cache[fmt] = parts

        if 0:  # debugging aid: dump the parsed format
            print()
            for part in parts:
                part.PrettyPrint()
            print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0