Line data Source code
1 :
2 : /* Parser implementation */
3 :
4 : /* For a description, see the comments at end of this file */
5 :
6 : /* XXX To do: error recovery */
7 :
8 : #include "Python.h"
9 : #include "pgenheaders.h"
10 : #include "token.h"
11 : #include "grammar.h"
12 : #include "node.h"
13 : #include "parser.h"
14 : #include "errcode.h"
15 :
16 :
17 : #ifdef Py_DEBUG
18 : extern int Py_DebugFlag;
19 : #define D(x) if (!Py_DebugFlag); else x
20 : #else
21 : #define D(x)
22 : #endif
23 :
24 :
25 : /* STACK DATA TYPE */
26 :
27 : static void s_reset(stack *);
28 :
29 : static void
30 63 : s_reset(stack *s)
31 : {
32 63 : s->s_top = &s->s_base[MAXSTACK];
33 63 : }
34 :
35 : #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36 :
37 : static int
38 308032 : s_push(register stack *s, dfa *d, node *parent)
39 : {
40 : register stackentry *top;
41 308032 : if (s->s_top == s->s_base) {
42 0 : fprintf(stderr, "s_push: parser stack overflow\n");
43 0 : return E_NOMEM;
44 : }
45 308032 : top = --s->s_top;
46 308032 : top->s_dfa = d;
47 308032 : top->s_parent = parent;
48 308032 : top->s_state = 0;
49 308032 : return 0;
50 : }
51 :
52 : #ifdef Py_DEBUG
53 :
54 : static void
55 : s_pop(register stack *s)
56 : {
57 : if (s_empty(s))
58 : Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 : s->s_top++;
60 : }
61 :
62 : #else /* !Py_DEBUG */
63 :
64 : #define s_pop(s) (s)->s_top++
65 :
66 : #endif
67 :
68 :
69 : /* PARSER CREATION */
70 :
71 : parser_state *
72 63 : PyParser_New(grammar *g, int start)
73 : {
74 : parser_state *ps;
75 :
76 63 : if (!g->g_accel)
77 3 : PyGrammar_AddAccelerators(g);
78 63 : ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 63 : if (ps == NULL)
80 0 : return NULL;
81 63 : ps->p_grammar = g;
82 : #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83 63 : ps->p_flags = 0;
84 : #endif
85 63 : ps->p_tree = PyNode_New(start);
86 63 : if (ps->p_tree == NULL) {
87 0 : PyMem_FREE(ps);
88 0 : return NULL;
89 : }
90 63 : s_reset(&ps->p_stack);
91 63 : (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 63 : return ps;
93 : }
94 :
95 : void
96 63 : PyParser_Delete(parser_state *ps)
97 : {
98 : /* NB If you want to save the parse tree,
99 : you must set p_tree to NULL before calling delparser! */
100 63 : PyNode_Free(ps->p_tree);
101 63 : PyMem_FREE(ps);
102 63 : }
103 :
104 :
105 : /* PARSER STACK OPERATIONS */
106 :
107 : static int
108 74459 : shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
109 : {
110 : int err;
111 : assert(!s_empty(s));
112 74459 : err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113 74459 : if (err)
114 0 : return err;
115 74459 : s->s_top->s_state = newstate;
116 74459 : return 0;
117 : }
118 :
119 : static int
120 307969 : push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
121 : {
122 : int err;
123 : register node *n;
124 307969 : n = s->s_top->s_parent;
125 : assert(!s_empty(s));
126 307969 : err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
127 307969 : if (err)
128 0 : return err;
129 307969 : s->s_top->s_state = newstate;
130 307969 : return s_push(s, d, CHILD(n, NCH(n)-1));
131 : }
132 :
133 :
134 : /* PARSER PROPER */
135 :
136 : static int
137 74459 : classify(parser_state *ps, int type, char *str)
138 : {
139 74459 : grammar *g = ps->p_grammar;
140 74459 : register int n = g->g_ll.ll_nlabels;
141 :
142 74459 : if (type == NAME) {
143 29200 : register char *s = str;
144 29200 : register label *l = g->g_ll.ll_label;
145 : register int i;
146 8897556 : for (i = n; i > 0; i--, l++) {
147 5225644 : if (l->lb_type != NAME || l->lb_str == NULL ||
148 833338 : l->lb_str[0] != s[0] ||
149 32929 : strcmp(l->lb_str, s) != 0)
150 4419578 : continue;
151 : #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
152 8223 : if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
153 2663 : s[0] == 'p' && strcmp(s, "print") == 0) {
154 60 : break; /* no longer a keyword */
155 : }
156 : #endif
157 : D(printf("It's a keyword\n"));
158 5597 : return n - i;
159 : }
160 : }
161 :
162 : {
163 68862 : register label *l = g->g_ll.ll_label;
164 : register int i;
165 2825490 : for (i = n; i > 0; i--, l++) {
166 2825490 : if (l->lb_type == type && l->lb_str == NULL) {
167 : D(printf("It's a token we know\n"));
168 68862 : return n - i;
169 : }
170 : }
171 : }
172 :
173 : D(printf("Illegal token\n"));
174 0 : return -1;
175 : }
176 :
177 : #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
178 : static void
179 245 : future_hack(parser_state *ps)
180 : {
181 245 : node *n = ps->p_stack.s_top->s_parent;
182 : node *ch, *cch;
183 : int i;
184 :
185 : /* from __future__ import ..., must have at least 4 children */
186 245 : n = CHILD(n, 0);
187 245 : if (NCH(n) < 4)
188 69 : return;
189 176 : ch = CHILD(n, 0);
190 176 : if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
191 0 : return;
192 176 : ch = CHILD(n, 1);
193 317 : if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
194 141 : strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
195 129 : return;
196 47 : ch = CHILD(n, 3);
197 : /* ch can be a star, a parenthesis or import_as_names */
198 47 : if (TYPE(ch) == STAR)
199 0 : return;
200 47 : if (TYPE(ch) == LPAR)
201 1 : ch = CHILD(n, 4);
202 :
203 128 : for (i = 0; i < NCH(ch); i += 2) {
204 81 : cch = CHILD(ch, i);
205 81 : if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
206 81 : char *str_ch = STR(CHILD(cch, 0));
207 81 : if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
208 0 : ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
209 81 : } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
210 12 : ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
211 69 : } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
212 0 : ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
213 : }
214 : }
215 : }
216 : }
217 : #endif /* future keyword */
218 :
219 : int
220 74459 : PyParser_AddToken(register parser_state *ps, register int type, char *str,
221 : int lineno, int col_offset, int *expected_ret)
222 : {
223 : register int ilabel;
224 : int err;
225 :
226 : D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
227 :
228 : /* Find out which label this token is */
229 74459 : ilabel = classify(ps, type, str);
230 74459 : if (ilabel < 0)
231 0 : return E_SYNTAX;
232 :
233 : /* Loop until the token is shifted or an error occurred */
234 : for (;;) {
235 : /* Fetch the current dfa and state */
236 642913 : register dfa *d = ps->p_stack.s_top->s_dfa;
237 642913 : register state *s = &d->d_state[ps->p_stack.s_top->s_state];
238 :
239 : D(printf(" DFA '%s', state %d:",
240 : d->d_name, ps->p_stack.s_top->s_state));
241 :
242 : /* Check accelerator */
243 642913 : if (s->s_lower <= ilabel && ilabel < s->s_upper) {
244 399103 : register int x = s->s_accel[ilabel - s->s_lower];
245 399103 : if (x != -1) {
246 382428 : if (x & (1<<7)) {
247 : /* Push non-terminal */
248 307969 : int nt = (x >> 8) + NT_OFFSET;
249 307969 : int arrow = x & ((1<<7)-1);
250 307969 : dfa *d1 = PyGrammar_FindDFA(
251 : ps->p_grammar, nt);
252 307969 : if ((err = push(&ps->p_stack, nt, d1,
253 : arrow, lineno, col_offset)) > 0) {
254 : D(printf(" MemError: push\n"));
255 0 : return err;
256 : }
257 : D(printf(" Push ...\n"));
258 307969 : continue;
259 : }
260 :
261 : /* Shift the token */
262 74459 : if ((err = shift(&ps->p_stack, type, str,
263 : x, lineno, col_offset)) > 0) {
264 : D(printf(" MemError: shift.\n"));
265 0 : return err;
266 : }
267 : D(printf(" Shift.\n"));
268 : /* Pop while we are in an accept-only state */
269 562231 : while (s = &d->d_state
270 243886 : [ps->p_stack.s_top->s_state],
271 121943 : s->s_accept && s->s_narcs == 1) {
272 : D(printf(" DFA '%s', state %d: "
273 : "Direct pop.\n",
274 : d->d_name,
275 : ps->p_stack.s_top->s_state));
276 : #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
277 47854 : if (d->d_name[0] == 'i' &&
278 307 : strcmp(d->d_name,
279 : "import_stmt") == 0)
280 1 : future_hack(ps);
281 : #endif
282 47547 : s_pop(&ps->p_stack);
283 47547 : if (s_empty(&ps->p_stack)) {
284 : D(printf(" ACCEPT.\n"));
285 63 : return E_DONE;
286 : }
287 47484 : d = ps->p_stack.s_top->s_dfa;
288 : }
289 74396 : return E_OK;
290 : }
291 : }
292 :
293 260485 : if (s->s_accept) {
294 : #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
295 262024 : if (d->d_name[0] == 'i' &&
296 1539 : strcmp(d->d_name, "import_stmt") == 0)
297 244 : future_hack(ps);
298 : #endif
299 : /* Pop this dfa and try again */
300 260485 : s_pop(&ps->p_stack);
301 : D(printf(" Pop ...\n"));
302 260485 : if (s_empty(&ps->p_stack)) {
303 : D(printf(" Error: bottom of stack.\n"));
304 0 : return E_SYNTAX;
305 : }
306 260485 : continue;
307 : }
308 :
309 : /* Stuck, report syntax error */
310 : D(printf(" Error.\n"));
311 0 : if (expected_ret) {
312 0 : if (s->s_lower == s->s_upper - 1) {
313 : /* Only one possible expected token */
314 0 : *expected_ret = ps->p_grammar->
315 0 : g_ll.ll_label[s->s_lower].lb_type;
316 : }
317 : else
318 0 : *expected_ret = -1;
319 : }
320 0 : return E_SYNTAX;
321 568454 : }
322 : }
323 :
324 :
325 : #ifdef Py_DEBUG
326 :
327 : /* DEBUG OUTPUT */
328 :
329 : void
330 : dumptree(grammar *g, node *n)
331 : {
332 : int i;
333 :
334 : if (n == NULL)
335 : printf("NIL");
336 : else {
337 : label l;
338 : l.lb_type = TYPE(n);
339 : l.lb_str = STR(n);
340 : printf("%s", PyGrammar_LabelRepr(&l));
341 : if (ISNONTERMINAL(TYPE(n))) {
342 : printf("(");
343 : for (i = 0; i < NCH(n); i++) {
344 : if (i > 0)
345 : printf(",");
346 : dumptree(g, CHILD(n, i));
347 : }
348 : printf(")");
349 : }
350 : }
351 : }
352 :
353 : void
354 : showtree(grammar *g, node *n)
355 : {
356 : int i;
357 :
358 : if (n == NULL)
359 : return;
360 : if (ISNONTERMINAL(TYPE(n))) {
361 : for (i = 0; i < NCH(n); i++)
362 : showtree(g, CHILD(n, i));
363 : }
364 : else if (ISTERMINAL(TYPE(n))) {
365 : printf("%s", _PyParser_TokenNames[TYPE(n)]);
366 : if (TYPE(n) == NUMBER || TYPE(n) == NAME)
367 : printf("(%s)", STR(n));
368 : printf(" ");
369 : }
370 : else
371 : printf("? ");
372 : }
373 :
374 : void
375 : printtree(parser_state *ps)
376 : {
377 : if (Py_DebugFlag) {
378 : printf("Parse tree:\n");
379 : dumptree(ps->p_grammar, ps->p_tree);
380 : printf("\n");
381 : printf("Tokens:\n");
382 : showtree(ps->p_grammar, ps->p_tree);
383 : printf("\n");
384 : }
385 : printf("Listing:\n");
386 : PyNode_ListTree(ps->p_tree);
387 : printf("\n");
388 : }
389 :
390 : #endif /* Py_DEBUG */
391 :
392 : /*
393 :
394 : Description
395 : -----------
396 :
397 : The parser's interface is different than usual: the function addtoken()
398 : must be called for each token in the input. This makes it possible to
399 : turn it into an incremental parsing system later. The parsing system
400 : constructs a parse tree as it goes.
401 :
402 : A parsing rule is represented as a Deterministic Finite-state Automaton
403 : (DFA). A node in a DFA represents a state of the parser; an arc represents
404 : a transition. Transitions are either labeled with terminal symbols or
405 : with non-terminals. When the parser decides to follow an arc labeled
406 : with a non-terminal, it is invoked recursively with the DFA representing
407 : the parsing rule for that as its initial state; when that DFA accepts,
408 : the parser that invoked it continues. The parse tree constructed by the
409 : recursively called parser is inserted as a child in the current parse tree.
410 :
411 : The DFA's can be constructed automatically from a more conventional
412 : language description. An extended LL(1) grammar (ELL(1)) is suitable.
413 : Certain restrictions make the parser's life easier: rules that can produce
414 : the empty string should be outlawed (there are other ways to put loops
415 : or optional parts in the language). To avoid the need to construct
416 : FIRST sets, we can require that all but the last alternative of a rule
417 : (really: arc going out of a DFA's state) must begin with a terminal
418 : symbol.
419 :
420 : As an example, consider this grammar:
421 :
422 : expr: term (OP term)*
423 : term: CONSTANT | '(' expr ')'
424 :
425 : The DFA corresponding to the rule for expr is:
426 :
427 : ------->.---term-->.------->
428 : ^ |
429 : | |
430 : \----OP----/
431 :
432 : The parse tree generated for the input a+b is:
433 :
434 : (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
435 :
436 : */
|