osh/builtin_printf.py

OILS / osh / builtin_printf.py

1	#!/usr/bin/env python2
2	"""Builtin_printf.py."""
3	from __future__ import print_function
4
5	import time as time_ # avoid name conflict
6
7	from _devbuild.gen import arg_types
8	from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t, Kind_t
9	from _devbuild.gen.runtime_asdl import cmd_value, value, value_e
10	from _devbuild.gen.syntax_asdl import (
11	loc,
12	loc_e,
13	loc_t,
14	source,
15	Token,
16	CompoundWord,
17	printf_part,
18	printf_part_e,
19	printf_part_t,
20	)
21	from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22
23	from core import alloc
24	from core import error
25	from core.error import e_die, p_die
26	from core import state
27	from core import vm
28	from frontend import flag_spec
29	from frontend import consts
30	from frontend import lexer
31	from frontend import match
32	from frontend import reader
33	from mycpp import mylib
34	from mycpp.mylib import log
35	from osh import sh_expr_eval
36	from osh import word_compile
37	from data_lang import qsn
38
39	import posix_ as posix
40
41	from typing import Dict, List, TYPE_CHECKING, cast
42
43	if TYPE_CHECKING:
44	from core import ui
45	from core.state import Mem
46	from frontend import parse_lib
47
48	_ = log
49
50
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of printf_part_t.

    Tokens are read from the main lexer in the PrintfOuter and PrintfPercent
    lexer modes.  The grammar is:

        width         = Num | Star
        precision     = Dot (Num | Star | Zero)?
        fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
        part          = Char_* | Format_EscapedPercent | fmt
        printf_format = part* Eof_Real

    The Time alternative is the bash-style %(strftime)T conversion.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        """Remember the lexer; no token is read until _Next() is called."""
        # Placeholders that the first _Next() call overwrites.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

        self.lexer = lexer

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in lex_mode and refresh the cached id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse one 'fmt' production, starting with the % as current token.

        On return the current token is still the Type/Time token; the caller
        advances past it.  Unsupported or malformed specifiers are reported
        through p_die().
        """
        mode = lex_mode_e.PrintfPercent
        self._Next(mode)  # step over the %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while (self.token_type == Id.Format_Flag or
               self.token_type == Id.Format_Zero):
            flag_tok = self.cur_token
            # The space and + flags could be implemented later.
            flag_str = lexer.TokenVal(flag_tok)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      flag_tok)

            percent.flags.append(flag_tok)
            self._Next(mode)

        # width?
        if (self.token_type == Id.Format_Num or
                self.token_type == Id.Format_Star):
            percent.width = self.cur_token
            self._Next(mode)

        # precision?  A lone dot is recorded as the precision token and is
        # replaced by the number / star / zero token if one follows it.
        if self.token_type == Id.Format_Dot:
            percent.precision = self.cur_token
            self._Next(mode)  # step over the dot
            if (self.token_type == Id.Format_Num or
                    self.token_type == Id.Format_Star or
                    self.token_type == Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(mode)

        # (Type | Time)
        if (self.token_type == Id.Format_Type or
                self.token_type == Id.Format_Time):
            type_tok = self.cur_token
            percent.type = type_tok

            # Extra validation that is not part of the grammar.
            type_str = lexer.TokenVal(type_tok)  # allocation will be cached
            if type_str in 'eEfFgG':
                p_die("osh printf doesn't support floating point", type_tok)
            # %c could be implemented, but it needs utf-8 decoding.
            if type_str == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      type_tok)
        else:
            if self.token_type == Id.Unknown_Tok:
                msg = 'Invalid printf format character'
            else:
                msg = 'Expected a printf format character'
            p_die(msg, self.cur_token)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string and return its parts in order."""
        outer = lex_mode_e.PrintfOuter
        result = []  # type: List[printf_part_t]

        self._Next(outer)
        while True:
            tok_id = self.token_type

            if (self.token_kind == Kind.Char or
                    tok_id == Id.Format_EscapedPercent or
                    tok_id == Id.Unknown_Backslash):
                # As with echo -e, Unknown_Backslash is not an error here even
                # under shopt -u parse_backslash: the format string is only
                # seen at runtime, not at parse time.  Users should prefer
                # $'' or the future static printf ${x %.3f}.
                result.append(printf_part.Literal(self.cur_token))

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id == Id.Eof_Real or tok_id == Id.Eol_Tok:
                # Id.Eol_Tok covers the special case of a '\x00' format string.
                break

            else:
                raise AssertionError(tok_id)

            self._Next(outer)

        return result
157
158
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]"""

    def __init__(self, mem, parse_ctx, unsafe_arith, errfmt):
        # type: (Mem, parse_lib.ParseContext, sh_expr_eval.UnsafeArith, ui.ErrorFormatter) -> None
        """Store collaborators and record the time of construction.

        Args:
          mem: consulted for the TZ variable and used to assign for -v.
          parse_ctx: supplies the arena and the lexer for format strings.
          unsafe_arith: parses the -v argument into an lvalue.
          errfmt: prints runtime and parse errors.
        """
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt

        # Parsed format strings, keyed by the format string itself.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # Used for %(...)T with argument -2.  This object is initialized in
        # main(), so this approximates the shell start time.
        self.shell_start_time = time_.time()

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Format varargs according to parts, appending the pieces to out.

        The format is re-applied ("arg recycling") until all arguments are
        consumed.  A \\c escape inside a %b argument stops all further
        processing.

        Args:
          parts: parsed format string, as returned by _FormatStringParser.
          varargs: the arguments after the format string.
          locs: locations parallel to varargs, used to blame errors.
          out: list that receives the output fragments.

        Returns:
          0 on success; 1 after printing an error for an invalid width,
          precision, or integer argument.  Formatting a negative number with
          %o %u %x %X is reported through e_die().
        """
        arg_index = 0
        num_args = len(varargs)
        backslash_c = False

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(printf_part.Literal, UP_part)
                    token = part.token
                    if token.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(token)
                    out.append(s)

                elif part.tag() == printf_part_e.Percent:
                    # Note: This case is very long, but hard to refactor
                    # because of the error cases and "recycling" of args!
                    # (arg_index, return 1, etc.)
                    part = cast(printf_part.Percent, UP_part)

                    # TODO: These calculations are independent of the data,
                    # so could be cached
                    flags = []  # type: List[str]
                    for flag_token in part.flags:
                        flags.append(lexer.TokenVal(flag_token))

                    width = -1  # nonexistent
                    if part.width:
                        if part.width.id in (Id.Format_Num, Id.Format_Zero):
                            width_str = lexer.TokenVal(part.width)
                            width_loc = part.width  # type: loc_t
                        elif part.width.id == Id.Format_Star:
                            # '*' takes the width from the next argument
                            if arg_index < num_args:
                                width_str = varargs[arg_index]
                                width_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                width_str = ''  # invalid
                                width_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            width = int(width_str)
                        except ValueError:
                            # No argument to blame: blame the '*' instead
                            if width_loc.tag() == loc_e.Missing:
                                width_loc = part.width
                            self.errfmt.Print_("printf got invalid width %r" %
                                               width_str,
                                               blame_loc=width_loc)
                            return 1

                    precision = -1  # nonexistent
                    if part.precision:
                        if part.precision.id == Id.Format_Dot:
                            # A lone dot means precision 0
                            precision_str = '0'
                            precision_loc = part.precision  # type: loc_t
                        elif part.precision.id in (Id.Format_Num,
                                                   Id.Format_Zero):
                            precision_str = lexer.TokenVal(part.precision)
                            precision_loc = part.precision
                        elif part.precision.id == Id.Format_Star:
                            # '*' takes the precision from the next argument
                            if arg_index < num_args:
                                precision_str = varargs[arg_index]
                                precision_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                precision_str = ''
                                precision_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            precision = int(precision_str)
                        except ValueError:
                            if precision_loc.tag() == loc_e.Missing:
                                precision_loc = part.precision
                            self.errfmt.Print_(
                                'printf got invalid precision %r' %
                                precision_str,
                                blame_loc=precision_loc)
                            return 1

                    # The value to format; a missing argument is ''.
                    if arg_index < num_args:
                        s = varargs[arg_index]
                        word_loc = locs[arg_index]  # type: loc_t
                        arg_index += 1
                        has_arg = True
                    else:
                        s = ''
                        word_loc = loc.Missing
                        has_arg = False

                    # Note: %s could be lexed into Id.Percent_S. Although
                    # small string optimization would remove the allocation
                    # as well.
                    typ = lexer.TokenVal(part.type)
                    if typ == 's':
                        if precision >= 0:
                            s = s[:precision]  # truncate

                    elif typ == 'q':
                        # TODO: most shells give \' for single quote, while
                        # OSH gives $'\'' this could matter when SSH'ing
                        s = qsn.maybe_shell_encode(s)

                    elif typ == 'b':
                        # Process just like echo -e, except \c handling is
                        # simpler.
                        c_parts = []  # type: List[str]
                        lex = match.EchoLexer(s)
                        while True:
                            id_, tok_val = lex.Next()
                            if id_ == Id.Eol_Tok:
                                # Note: This is really a NUL terminator
                                break

                            # Note: DummyToken is OK because
                            # EvalCStringToken() doesn't have any syntax
                            # errors.
                            tok = lexer.DummyToken(id_, tok_val)
                            p = word_compile.EvalCStringToken(tok)

                            # Unusual behavior: '\c' aborts processing!
                            if p is None:
                                backslash_c = True
                                break

                            c_parts.append(p)
                        s = ''.join(c_parts)

                    elif part.type.id == Id.Format_Time or typ in 'diouxX':
                        # %(...)T and %d share this complex integer
                        # conversion logic
                        try:
                            # note: spaces like ' -42 ' accepted and
                            # normalized
                            d = int(s)

                        except ValueError:
                            # 'a is interpreted as the ASCII value of 'a'
                            if len(s) >= 1 and s[0] in '\'"':
                                # TODO: utf-8 decode s[1:] to be more
                                # correct. Probably depends on issue #366, a
                                # utf-8 library.
                                # Note: len(s) == 1 means there is a NUL (0)
                                # after the quote.
                                d = ord(s[1]) if len(s) >= 2 else 0

                            # No argument means -1 for %(...)T as in Bash
                            # Reference Manual 4.2 "If no argument is
                            # specified, conversion behaves as if -1 had been
                            # given."
                            elif (not has_arg and
                                  part.type.id == Id.Format_Time):
                                d = -1

                            else:
                                if has_arg:
                                    blame_loc = word_loc  # type: loc_t
                                else:
                                    blame_loc = part.type
                                self.errfmt.Print_(
                                    'printf expected an integer, got %r' % s,
                                    blame_loc)
                                return 1

                        if part.type.id == Id.Format_Time:
                            # Initialize timezone:
                            #   `localtime' uses the current timezone
                            #   information initialized by `tzset'.  The
                            #   function `tzset' refers to the environment
                            #   variable `TZ'.  When the exported variable
                            #   `TZ' is present, its value should be
                            #   reflected in the real environment variable
                            #   `TZ' before call of `tzset'.
                            #
                            # Note: unlike LANG, TZ doesn't seem to change
                            # behavior if it's not exported.
                            #
                            # TODO: In YSH, provide an API that doesn't rely
                            # on libc's global state.
                            tzcell = self.mem.GetCell('TZ')
                            if (tzcell and tzcell.exported and
                                    tzcell.val.tag() == value_e.Str):
                                tzval = cast(value.Str, tzcell.val)
                                posix.putenv('TZ', tzval.s)

                            time_.tzset()

                            # Handle special values:
                            #   User can specify two special values -1 and -2
                            #   as in Bash Reference Manual 4.2: "Two special
                            #   argument values may be used: -1 represents
                            #   the current time, and -2 represents the time
                            #   the shell was invoked." from
                            #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                            if d == -1:  # the current time
                                ts = time_.time()
                            elif d == -2:  # the shell start time
                                ts = self.shell_start_time
                            else:
                                ts = d

                            # typ looks like '(FORMAT)T'; the slice keeps
                            # only FORMAT.
                            s = time_.strftime(typ[1:-2],
                                               time_.localtime(ts))
                            if precision >= 0:
                                s = s[:precision]  # truncate

                        else:  # typ in 'diouxX'
                            # Disallowed because it depends on 32- or 64- bit
                            if d < 0 and typ in 'ouxX':
                                e_die(
                                    "Can't format negative number %d with %%%s"
                                    % (d, typ), part.type)

                            if typ == 'o':
                                s = mylib.octal(d)
                            elif typ == 'x':
                                s = mylib.hex_lower(d)
                            elif typ == 'X':
                                s = mylib.hex_upper(d)
                            else:  # diu
                                s = str(d)  # without spaces like ' -42 '

                            # Split off the sign so that padding decisions
                            # are made on the digits alone.
                            if s[0] == '-':
                                sign = '-'
                                digits = s[1:]
                            else:
                                sign = ''
                                digits = s

                            # There are TWO different ways to ZERO PAD, and
                            # they differ on the negative sign!  See
                            # spec/builtin-printf
                            zero_pad = 0  # no zero padding
                            if (width >= 0 and '0' in flags and
                                    '-' not in flags):
                                # The 0 flag is ignored when - is also given,
                                # as in C printf.
                                zero_pad = 1  # style 1
                            elif precision > 0 and len(digits) < precision:
                                # Precision is a minimum number of DIGITS, so
                                # the sign must not be counted: [%.3d] -42
                                # becomes [-042].
                                zero_pad = 2  # style 2

                            if zero_pad != 0:
                                if zero_pad == 1:
                                    # [%06d] -42 becomes [-00042] (6 TOTAL)
                                    n = width - len(sign)
                                else:
                                    # [%6.6d] -42 becomes [-000042]
                                    # (1 for '-' + 6)
                                    n = precision
                                s = sign + digits.rjust(n, '0')

                    else:
                        raise AssertionError()

                    if width >= 0:
                        if '-' in flags:
                            s = s.ljust(width, ' ')
                        else:
                            s = s.rjust(width, ' ')

                    out.append(s)

                else:
                    raise AssertionError()

                if backslash_c:
                    # 'printf %b a\cb xx' - \c terminates processing!  Skip
                    # the rest of the format string.
                    break

            if backslash_c:
                # ... and do not recycle the format for remaining arguments.
                break
            if arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if arg_index >= num_args:
                # We printed all args
                break
            # There are more arg: Implement the 'arg recycling' behavior.

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Parses the format string (results are cached per format string),
        formats the arguments, and then either assigns the result to the -v
        variable or writes it to stdout.

        Returns:
          0 on success, 2 if the format string fails to parse, otherwise the
          non-zero status from _Format().
        """
        attrs, arg_r = flag_spec.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so the imported lexer module is not shadowed.
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            self.parse_cache[fmt] = parts

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0