Rietveld Code Review Tool

Side by Side Diff: Parser/tokenizer.c

Issue 3353: make built-in tokenizer available via Python C API
Patch Set: Created 4 years, 10 months ago
OLD | NEW
1 1
2 /* Tokenizer implementation */ 2 /* Tokenizer implementation */
3 3
4 #include "Python.h" 4 #include "Python.h"
5 #include "pgenheaders.h" 5 #include "pgenheaders.h"
6 6
7 #include <ctype.h> 7 #include <ctype.h>
8 #include <assert.h> 8 #include <assert.h>
9 9
10 #include "tokenizer.h" 10 #include "tokenizer.h"
(...skipping 22 matching lines...)
33 33
34 extern char *PyOS_Readline(FILE *, FILE *, const char *); 34 extern char *PyOS_Readline(FILE *, FILE *, const char *);
35 /* Return malloc'ed string including trailing \n; 35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF; 36 empty malloc'ed string for EOF;
37 NULL if interrupted */ 37 NULL if interrupted */
38 38
39 /* Don't ever change this -- it would break the portability of Python code */ 39 /* Don't ever change this -- it would break the portability of Python code */
40 #define TABSIZE 8 40 #define TABSIZE 8
41 41
42 /* Forward */ 42 /* Forward */
43 static struct tok_state *tok_new(void); 43 static PyTokenizer_State *tok_new(void);
44 static int tok_nextc(struct tok_state *tok); 44 static int tok_nextc(PyTokenizer_State *tok);
45 static void tok_backup(struct tok_state *tok, int c); 45 static void tok_backup(PyTokenizer_State *tok, int c);
46 46
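
For orientation: the substance of this patch is a mechanical renaming that turns the tokenizer's private spellings into a namespaced public C API, as the forward declarations above already show. The pattern, applied throughout the file:

    /* Old spelling (left column)            New spelling (right column)
       struct tok_state                 ->   PyTokenizer_State
       STATE_INIT / _RAW / _NORMAL      ->   PYTOKENIZER_STATE_INIT / _RAW / _NORMAL
       MAXINDENT                        ->   PYTOKENIZER_MAXINDENT
       LPAR, NAME, OP, ERRORTOKEN, ...  ->   PYTOK_LPAR, PYTOK_NAME, PYTOK_OP,
                                             PYTOK_ERRORTOKEN, ...               */
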
47 47
48 /* Token names */ 48 /* Token names */
49 49
50 const char *_PyParser_TokenNames[] = { 50 const char *_PyParser_TokenNames[] = {
51 "ENDMARKER", 51 "ENDMARKER",
52 "NAME", 52 "NAME",
53 "NUMBER", 53 "NUMBER",
54 "STRING", 54 "STRING",
55 "NEWLINE", 55 "NEWLINE",
(...skipping 47 matching lines...)
103 "ELLIPSIS", 103 "ELLIPSIS",
104 /* This table must match the #defines in token.h! */ 104 /* This table must match the #defines in token.h! */
105 "OP", 105 "OP",
106 "<ERRORTOKEN>", 106 "<ERRORTOKEN>",
107 "<N_TOKENS>" 107 "<N_TOKENS>"
108 }; 108 };
109 109
110 110
111 /* Create and initialize a new tok_state structure */ 111 /* Create and initialize a new tok_state structure */
112 112
113 static struct tok_state * 113 static PyTokenizer_State *
114 tok_new(void) 114 tok_new(void)
115 { 115 {
116 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 116 PyTokenizer_State *tok = (PyTokenizer_State *)PyMem_MALLOC(
117 sizeof(struct tok_state)); 117 sizeof(PyTokenizer_State));
118 if (tok == NULL) 118 if (tok == NULL)
119 return NULL; 119 return NULL;
120 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 120 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
121 tok->done = E_OK; 121 tok->done = E_OK;
122 tok->fp = NULL; 122 tok->fp = NULL;
123 tok->input = NULL; 123 tok->input = NULL;
124 tok->tabsize = TABSIZE; 124 tok->tabsize = TABSIZE;
125 tok->indent = 0; 125 tok->indent = 0;
126 tok->indstack[0] = 0; 126 tok->indstack[0] = 0;
127 tok->atbol = 1; 127 tok->atbol = 1;
128 tok->pendin = 0; 128 tok->pendin = 0;
129 tok->prompt = tok->nextprompt = NULL; 129 tok->prompt = tok->nextprompt = NULL;
130 tok->lineno = 0; 130 tok->lineno = 0;
131 tok->level = 0; 131 tok->level = 0;
132 tok->altwarning = 1; 132 tok->altwarning = 1;
133 tok->alterror = 1; 133 tok->alterror = 1;
134 tok->alttabsize = 1; 134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0; 135 tok->altindstack[0] = 0;
136 tok->decoding_state = STATE_INIT; 136 tok->decoding_state = PYTOKENIZER_STATE_INIT;
137 tok->decoding_erred = 0; 137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0; 138 tok->read_coding_spec = 0;
139 tok->enc = NULL; 139 tok->enc = NULL;
140 tok->encoding = NULL; 140 tok->encoding = NULL;
141 tok->cont_line = 0; 141 tok->cont_line = 0;
142 #ifndef PGEN 142 #ifndef PGEN
143 tok->filename = NULL; 143 tok->filename = NULL;
144 tok->decoding_readline = NULL; 144 tok->decoding_readline = NULL;
145 tok->decoding_buffer = NULL; 145 tok->decoding_buffer = NULL;
146 #endif 146 #endif
147 return tok; 147 return tok;
148 } 148 }
149 149
150 static char * 150 static char *
151 new_string(const char *s, Py_ssize_t len, struct tok_state *tok) 151 new_string(const char *s, Py_ssize_t len, PyTokenizer_State *tok)
152 { 152 {
153 char* result = (char *)PyMem_MALLOC(len + 1); 153 char* result = (char *)PyMem_MALLOC(len + 1);
154 if (!result) { 154 if (!result) {
155 tok->done = E_NOMEM; 155 tok->done = E_NOMEM;
156 return NULL; 156 return NULL;
157 } 157 }
158 memcpy(result, s, len); 158 memcpy(result, s, len);
159 result[len] = '\0'; 159 result[len] = '\0';
160 return result; 160 return result;
161 } 161 }
162 162
163 #ifdef PGEN 163 #ifdef PGEN
164 164
165 static char * 165 static char *
166 decoding_fgets(char *s, int size, struct tok_state *tok) 166 decoding_fgets(char *s, int size, PyTokenizer_State *tok)
167 { 167 {
168 return fgets(s, size, tok->fp); 168 return fgets(s, size, tok->fp);
169 } 169 }
170 170
171 static int 171 static int
172 decoding_feof(struct tok_state *tok) 172 decoding_feof(PyTokenizer_State *tok)
173 { 173 {
174 return feof(tok->fp); 174 return feof(tok->fp);
175 } 175 }
176 176
177 static char * 177 static char *
178 decode_str(const char *str, int exec_input, struct tok_state *tok) 178 decode_str(const char *str, int exec_input, PyTokenizer_State *tok)
179 { 179 {
180 return new_string(str, strlen(str), tok); 180 return new_string(str, strlen(str), tok);
181 } 181 }
182 182
183 #else /* PGEN */ 183 #else /* PGEN */
184 184
185 static char * 185 static char *
186 error_ret(struct tok_state *tok) /* XXX */ 186 error_ret(PyTokenizer_State *tok) /* XXX */
187 { 187 {
188 tok->decoding_erred = 1; 188 tok->decoding_erred = 1;
189 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 189 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
190 PyMem_FREE(tok->buf); 190 PyMem_FREE(tok->buf);
191 tok->buf = NULL; 191 tok->buf = NULL;
192 return NULL; /* as if it were EOF */ 192 return NULL; /* as if it were EOF */
193 } 193 }
194 194
195 195
196 static char * 196 static char *
(...skipping 21 matching lines...)
218 strncmp(buf, "iso-8859-1-", 11) == 0 || 218 strncmp(buf, "iso-8859-1-", 11) == 0 ||
219 strncmp(buf, "iso-latin-1-", 12) == 0) 219 strncmp(buf, "iso-latin-1-", 12) == 0)
220 return "iso-8859-1"; 220 return "iso-8859-1";
221 else 221 else
222 return s; 222 return s;
223 } 223 }
224 224
225 /* Find the coding spec in S and store it in *spec (NULL if none is found); return 1 on success, 0 on failure. */ 225 /* Find the coding spec in S and store it in *spec (NULL if none is found); return 1 on success, 0 on failure. */
226 226
227 static int 227 static int
228 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) 228 get_coding_spec(const char *s, char **spec, Py_ssize_t size, PyTokenizer_State *tok)
229 { 229 {
230 Py_ssize_t i; 230 Py_ssize_t i;
231 *spec = NULL; 231 *spec = NULL;
232 /* Coding spec must be in a comment, and that comment must be 232 /* Coding spec must be in a comment, and that comment must be
233 * the only statement on the source code line. */ 233 * the only statement on the source code line. */
234 for (i = 0; i < size - 6; i++) { 234 for (i = 0; i < size - 6; i++) {
235 if (s[i] == '#') 235 if (s[i] == '#')
236 break; 236 break;
237 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 237 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
238 return 1; 238 return 1;
(...skipping 32 matching lines...)
271 } 271 }
272 return 1; 272 return 1;
273 } 273 }
274 274
275 /* Check whether the line contains a coding spec. If it does, 275 /* Check whether the line contains a coding spec. If it does,
276 invoke the set_readline function for the new encoding. 276 invoke the set_readline function for the new encoding.
277 This function receives the tok_state and the new encoding. 277 This function receives the tok_state and the new encoding.
278 Return 1 on success, 0 on failure. */ 278 Return 1 on success, 0 on failure. */
279 279
280 static int 280 static int
281 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 281 check_coding_spec(const char* line, Py_ssize_t size, PyTokenizer_State *tok,
282 int set_readline(struct tok_state *, const char *)) 282 int set_readline(PyTokenizer_State *, const char *))
283 { 283 {
284 char *cs; 284 char *cs;
285 int r = 1; 285 int r = 1;
286 286
287 if (tok->cont_line) { 287 if (tok->cont_line) {
288 /* It's a continuation line, so it can't be a coding spec. */ 288 /* It's a continuation line, so it can't be a coding spec. */
289 tok->read_coding_spec = 1; 289 tok->read_coding_spec = 1;
290 return 1; 290 return 1;
291 } 291 }
292 if (!get_coding_spec(line, &cs, size, tok)) 292 if (!get_coding_spec(line, &cs, size, tok))
293 return 0; 293 return 0;
294 if (!cs) { 294 if (!cs) {
295 Py_ssize_t i; 295 Py_ssize_t i;
296 for (i = 0; i < size; i++) { 296 for (i = 0; i < size; i++) {
297 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') 297 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
298 break; 298 break;
299 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { 299 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
300 /* Stop checking coding spec after a line containing 300 /* Stop checking coding spec after a line containing
301 * anything except a comment. */ 301 * anything except a comment. */
302 tok->read_coding_spec = 1; 302 tok->read_coding_spec = 1;
303 break; 303 break;
304 } 304 }
305 } 305 }
306 return 1; 306 return 1;
307 } 307 }
308 tok->read_coding_spec = 1; 308 tok->read_coding_spec = 1;
309 if (tok->encoding == NULL) { 309 if (tok->encoding == NULL) {
310 assert(tok->decoding_state == STATE_RAW); 310 assert(tok->decoding_state == PYTOKENIZER_STATE_RAW);
311 if (strcmp(cs, "utf-8") == 0) { 311 if (strcmp(cs, "utf-8") == 0) {
312 tok->encoding = cs; 312 tok->encoding = cs;
313 } else { 313 } else {
314 r = set_readline(tok, cs); 314 r = set_readline(tok, cs);
315 if (r) { 315 if (r) {
316 tok->encoding = cs; 316 tok->encoding = cs;
317 tok->decoding_state = STATE_NORMAL; 317 tok->decoding_state = PYTOKENIZER_STATE_NORMAL;
318 } 318 }
319 else { 319 else {
320 PyErr_Format(PyExc_SyntaxError, 320 PyErr_Format(PyExc_SyntaxError,
321 "encoding problem: %s", cs); 321 "encoding problem: %s", cs);
322 PyMem_FREE(cs); 322 PyMem_FREE(cs);
323 } 323 }
324 } 324 }
325 } else { /* then, compare cs with BOM */ 325 } else { /* then, compare cs with BOM */
326 r = (strcmp(tok->encoding, cs) == 0); 326 r = (strcmp(tok->encoding, cs) == 0);
327 if (!r) 327 if (!r)
328 PyErr_Format(PyExc_SyntaxError, 328 PyErr_Format(PyExc_SyntaxError,
329 "encoding problem: %s with BOM", cs); 329 "encoding problem: %s with BOM", cs);
330 PyMem_FREE(cs); 330 PyMem_FREE(cs);
331 } 331 }
332 return r; 332 return r;
333 } 333 }
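
For readers unfamiliar with PEP 263, the cookie that check_coding_spec hunts for is a comment that is the only content on one of the first two lines. A hedged sketch of inputs (standard PEP 263 forms; the exact matcher lives in the elided body of get_coding_spec):

    /* Recognized -- the comment is the whole line: */
    const char *emacs_style = "# -*- coding: iso-8859-1 -*-\n";
    const char *vim_style   = "# vim: set fileencoding=utf-8 :\n";
    /* Not recognized -- code precedes the comment, so per the loop above
       coding-spec scanning stops for the rest of the file: */
    const char *not_a_cookie = "x = 1  # coding: utf-8\n";
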
334 334
335 /* See whether the file starts with a BOM. If it does, 335 /* See whether the file starts with a BOM. If it does,
336 invoke the set_readline function with the new encoding. 336 invoke the set_readline function with the new encoding.
337 Return 1 on success, 0 on failure. */ 337 Return 1 on success, 0 on failure. */
338 338
339 static int 339 static int
340 check_bom(int get_char(struct tok_state *), 340 check_bom(int get_char(PyTokenizer_State *),
341 void unget_char(int, struct tok_state *), 341 void unget_char(int, PyTokenizer_State *),
342 int set_readline(struct tok_state *, const char *), 342 int set_readline(PyTokenizer_State *, const char *),
343 struct tok_state *tok) 343 PyTokenizer_State *tok)
344 { 344 {
345 int ch1, ch2, ch3; 345 int ch1, ch2, ch3;
346 ch1 = get_char(tok); 346 ch1 = get_char(tok);
347 tok->decoding_state = STATE_RAW; 347 tok->decoding_state = PYTOKENIZER_STATE_RAW;
348 if (ch1 == EOF) { 348 if (ch1 == EOF) {
349 return 1; 349 return 1;
350 } else if (ch1 == 0xEF) { 350 } else if (ch1 == 0xEF) {
351 ch2 = get_char(tok); 351 ch2 = get_char(tok);
352 if (ch2 != 0xBB) { 352 if (ch2 != 0xBB) {
353 unget_char(ch2, tok); 353 unget_char(ch2, tok);
354 unget_char(ch1, tok); 354 unget_char(ch1, tok);
355 return 1; 355 return 1;
356 } 356 }
357 ch3 = get_char(tok); 357 ch3 = get_char(tok);
358 if (ch3 != 0xBF) { 358 if (ch3 != 0xBF) {
359 unget_char(ch3, tok); 359 unget_char(ch3, tok);
360 unget_char(ch2, tok); 360 unget_char(ch2, tok);
361 unget_char(ch1, tok); 361 unget_char(ch1, tok);
362 return 1; 362 return 1;
363 } 363 }
364 #if 0 364 #if 0
365 /* Disable support for UTF-16 BOMs until a decision 365 /* Disable support for UTF-16 BOMs until a decision
366 is made whether this needs to be supported. */ 366 is made whether this needs to be supported. */
367 } else if (ch1 == 0xFE) { 367 } else if (ch1 == 0xFE) {
368 ch2 = get_char(tok); 368 ch2 = get_char(tok);
369 if (ch2 != 0xFF) { 369 if (ch2 != 0xFF) {
370 unget_char(ch2, tok); 370 unget_char(ch2, tok);
371 unget_char(ch1, tok); 371 unget_char(ch1, tok);
372 return 1; 372 return 1;
373 } 373 }
374 if (!set_readline(tok, "utf-16-be")) 374 if (!set_readline(tok, "utf-16-be"))
375 return 0; 375 return 0;
376 tok->decoding_state = STATE_NORMAL; 376 tok->decoding_state = PYTOKENIZER_STATE_NORMAL;
377 } else if (ch1 == 0xFF) { 377 } else if (ch1 == 0xFF) {
378 ch2 = get_char(tok); 378 ch2 = get_char(tok);
379 if (ch2 != 0xFE) { 379 if (ch2 != 0xFE) {
380 unget_char(ch2, tok); 380 unget_char(ch2, tok);
381 unget_char(ch1, tok); 381 unget_char(ch1, tok);
382 return 1; 382 return 1;
383 } 383 }
384 if (!set_readline(tok, "utf-16-le")) 384 if (!set_readline(tok, "utf-16-le"))
385 return 0; 385 return 0;
386 tok->decoding_state = STATE_NORMAL; 386 tok->decoding_state = PYTOKENIZER_STATE_NORMAL;
387 #endif 387 #endif
388 } else { 388 } else {
389 unget_char(ch1, tok); 389 unget_char(ch1, tok);
390 return 1; 390 return 1;
391 } 391 }
392 if (tok->encoding != NULL) 392 if (tok->encoding != NULL)
393 PyMem_FREE(tok->encoding); 393 PyMem_FREE(tok->encoding);
394 tok->encoding = new_string("utf-8", 5, tok); 394 tok->encoding = new_string("utf-8", 5, tok);
395 if (!tok->encoding) 395 if (!tok->encoding)
396 return 0; 396 return 0;
(...skipping 10 matching lines...)
407 stored the result in tok->decoding_buffer 407 stored the result in tok->decoding_buffer
408 3) PyByteArrayObject *: previous call to fp_readl did not have enough room 408 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
409 (in the s buffer) to copy entire contents of the line read 409 (in the s buffer) to copy entire contents of the line read
410 by tok->decoding_readline. tok->decoding_buffer has the overflow. 410 by tok->decoding_readline. tok->decoding_buffer has the overflow.
411 In this case, fp_readl is called in a loop (with an expanded buffer) 411 In this case, fp_readl is called in a loop (with an expanded buffer)
412 until the buffer ends with a '\n' (or until the end of the file is 412 until the buffer ends with a '\n' (or until the end of the file is
413 reached): see tok_nextc and its calls to decoding_fgets. 413 reached): see tok_nextc and its calls to decoding_fgets.
414 */ 414 */
415 415
416 static char * 416 static char *
417 fp_readl(char *s, int size, struct tok_state *tok) 417 fp_readl(char *s, int size, PyTokenizer_State *tok)
418 { 418 {
419 PyObject* bufobj; 419 PyObject* bufobj;
420 const char *buf; 420 const char *buf;
421 Py_ssize_t buflen; 421 Py_ssize_t buflen;
422 422
423 /* Ask for one less byte so we can terminate it */ 423 /* Ask for one less byte so we can terminate it */
424 assert(size > 0); 424 assert(size > 0);
425 size--; 425 size--;
426 426
427 if (tok->decoding_buffer) { 427 if (tok->decoding_buffer) {
(...skipping 50 matching lines...)
478 readline function. The StreamReader is named ENC. 478 readline function. The StreamReader is named ENC.
479 479
480 This function is called from check_bom and check_coding_spec. 480 This function is called from check_bom and check_coding_spec.
481 481
482 ENC is usually identical to the future value of tok->encoding, 482 ENC is usually identical to the future value of tok->encoding,
483 except for the (currently unsupported) case of UTF-16. 483 except for the (currently unsupported) case of UTF-16.
484 484
485 Return 1 on success, 0 on failure. */ 485 Return 1 on success, 0 on failure. */
486 486
487 static int 487 static int
488 fp_setreadl(struct tok_state *tok, const char* enc) 488 fp_setreadl(PyTokenizer_State *tok, const char* enc)
489 { 489 {
490 PyObject *readline = NULL, *stream = NULL, *io = NULL; 490 PyObject *readline = NULL, *stream = NULL, *io = NULL;
491 _Py_IDENTIFIER(open); 491 _Py_IDENTIFIER(open);
492 _Py_IDENTIFIER(readline); 492 _Py_IDENTIFIER(readline);
493 int fd; 493 int fd;
494 long pos; 494 long pos;
495 495
496 io = PyImport_ImportModuleNoBlock("io"); 496 io = PyImport_ImportModuleNoBlock("io");
497 if (io == NULL) 497 if (io == NULL)
498 goto cleanup; 498 goto cleanup;
(...skipping 27 matching lines...)
526 } 526 }
527 527
528 cleanup: 528 cleanup:
529 Py_XDECREF(stream); 529 Py_XDECREF(stream);
530 Py_XDECREF(io); 530 Py_XDECREF(io);
531 return readline != NULL; 531 return readline != NULL;
532 } 532 }
533 533
534 /* Fetch the next byte from TOK. */ 534 /* Fetch the next byte from TOK. */
535 535
536 static int fp_getc(struct tok_state *tok) { 536 static int fp_getc(PyTokenizer_State *tok) {
537 return getc(tok->fp); 537 return getc(tok->fp);
538 } 538 }
539 539
540 /* Unfetch the last byte back into TOK. */ 540 /* Unfetch the last byte back into TOK. */
541 541
542 static void fp_ungetc(int c, struct tok_state *tok) { 542 static void fp_ungetc(int c, PyTokenizer_State *tok) {
543 ungetc(c, tok->fp); 543 ungetc(c, tok->fp);
544 } 544 }
545 545
546 /* Check whether the characters at s start a valid 546 /* Check whether the characters at s start a valid
547 UTF-8 sequence. Return the number of characters forming 547 UTF-8 sequence. Return the number of characters forming
548 the sequence if yes, 0 if not. */ 548 the sequence if yes, 0 if not. */
549 static int valid_utf8(const unsigned char* s) 549 static int valid_utf8(const unsigned char* s)
550 { 550 {
551 int expected = 0; 551 int expected = 0;
552 int length; 552 int length;
(...skipping 15 matching lines...)
568 for (; expected; expected--) 568 for (; expected; expected--)
569 if (s[expected] < 0x80 || s[expected] >= 0xC0) 569 if (s[expected] < 0x80 || s[expected] >= 0xC0)
570 return 0; 570 return 0;
571 return length; 571 return length;
572 } 572 }
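
The length computation itself sits in the elided lines, but the contract is visible from the comment above: the return value is the byte length of a well-formed sequence, 0 otherwise. A small illustrative check:

    const unsigned char e_acute[] = { 0xC3, 0xA9, 0 };  /* U+00E9 in UTF-8 */
    const unsigned char stray[]   = { 0xFF, 0 };        /* never valid UTF-8 */
    assert(valid_utf8(e_acute) == 2);   /* two-byte sequence accepted */
    assert(valid_utf8(stray) == 0);     /* rejected */
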
573 573
574 /* Read a line of input from TOK. Determine encoding 574 /* Read a line of input from TOK. Determine encoding
575 if necessary. */ 575 if necessary. */
576 576
577 static char * 577 static char *
578 decoding_fgets(char *s, int size, struct tok_state *tok) 578 decoding_fgets(char *s, int size, PyTokenizer_State *tok)
579 { 579 {
580 char *line = NULL; 580 char *line = NULL;
581 int badchar = 0; 581 int badchar = 0;
582 for (;;) { 582 for (;;) {
583 if (tok->decoding_state == STATE_NORMAL) { 583 if (tok->decoding_state == PYTOKENIZER_STATE_NORMAL) {
584 /* We already have a codec associated with 584 /* We already have a codec associated with
585 this input. */ 585 this input. */
586 line = fp_readl(s, size, tok); 586 line = fp_readl(s, size, tok);
587 break; 587 break;
588 } else if (tok->decoding_state == STATE_RAW) { 588 } else if (tok->decoding_state == PYTOKENIZER_STATE_RAW) {
589 /* We want a 'raw' read. */ 589 /* We want a 'raw' read. */
590 line = Py_UniversalNewlineFgets(s, size, 590 line = Py_UniversalNewlineFgets(s, size,
591 tok->fp, NULL); 591 tok->fp, NULL);
592 break; 592 break;
593 } else { 593 } else {
594 /* We have not yet determined the encoding. 594 /* We have not yet determined the encoding.
595 If an encoding is found, use the file-pointer 595 If an encoding is found, use the file-pointer
596 reader functions from now on. */ 596 reader functions from now on. */
597 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 597 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
598 return error_ret(tok); 598 return error_ret(tok);
599 assert(tok->decoding_state != STATE_INIT); 599 assert(tok->decoding_state != PYTOKENIZER_STATE_INIT);
600 } 600 }
601 } 601 }
602 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 602 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
603 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 603 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
604 return error_ret(tok); 604 return error_ret(tok);
605 } 605 }
606 } 606 }
607 #ifndef PGEN 607 #ifndef PGEN
608 /* The default encoding is UTF-8, so make sure we don't have any 608 /* The default encoding is UTF-8, so make sure we don't have any
609 non-UTF-8 sequences in it. */ 609 non-UTF-8 sequences in it. */
(...skipping 15 matching lines...)
625 "but no encoding declared; " 625 "but no encoding declared; "
626 "see http://python.org/dev/peps/pep-0263/ for details", 626 "see http://python.org/dev/peps/pep-0263/ for details",
627 badchar, tok->filename, tok->lineno + 1); 627 badchar, tok->filename, tok->lineno + 1);
628 return error_ret(tok); 628 return error_ret(tok);
629 } 629 }
630 #endif 630 #endif
631 return line; 631 return line;
632 } 632 }
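
decoding_fgets dispatches on the three-way decoding state renamed by this patch. The constants are declared in Parser/tokenizer.h rather than here; a sketch of their assumed shape, inferred from the branches above:

    /* Assumed declaration (the real one lives in Parser/tokenizer.h): */
    enum decoding_state {
        PYTOKENIZER_STATE_INIT,   /* encoding not determined yet */
        PYTOKENIZER_STATE_RAW,    /* raw reads via Py_UniversalNewlineFgets */
        PYTOKENIZER_STATE_NORMAL  /* decode through tok->decoding_readline */
    };
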
633 633
634 static int 634 static int
635 decoding_feof(struct tok_state *tok) 635 decoding_feof(PyTokenizer_State *tok)
636 { 636 {
637 if (tok->decoding_state != STATE_NORMAL) { 637 if (tok->decoding_state != PYTOKENIZER_STATE_NORMAL) {
638 return feof(tok->fp); 638 return feof(tok->fp);
639 } else { 639 } else {
640 PyObject* buf = tok->decoding_buffer; 640 PyObject* buf = tok->decoding_buffer;
641 if (buf == NULL) { 641 if (buf == NULL) {
642 buf = PyObject_CallObject(tok->decoding_readline, NULL); 642 buf = PyObject_CallObject(tok->decoding_readline, NULL);
643 if (buf == NULL) { 643 if (buf == NULL) {
644 error_ret(tok); 644 error_ret(tok);
645 return 1; 645 return 1;
646 } else { 646 } else {
647 tok->decoding_buffer = buf; 647 tok->decoding_buffer = buf;
648 } 648 }
649 } 649 }
650 return PyObject_Length(buf) == 0; 650 return PyObject_Length(buf) == 0;
651 } 651 }
652 } 652 }
653 653
654 /* Fetch a byte from TOK, using the string buffer. */ 654 /* Fetch a byte from TOK, using the string buffer. */
655 655
656 static int 656 static int
657 buf_getc(struct tok_state *tok) { 657 buf_getc(PyTokenizer_State *tok) {
658 return Py_CHARMASK(*tok->str++); 658 return Py_CHARMASK(*tok->str++);
659 } 659 }
660 660
661 /* Unfetch a byte from TOK, using the string buffer. */ 661 /* Unfetch a byte from TOK, using the string buffer. */
662 662
663 static void 663 static void
664 buf_ungetc(int c, struct tok_state *tok) { 664 buf_ungetc(int c, PyTokenizer_State *tok) {
665 tok->str--; 665 tok->str--;
666 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 666 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
667 } 667 }
668 668
669 /* Set the readline function for TOK to ENC. For the string-based 669 /* Set the readline function for TOK to ENC. For the string-based
670 tokenizer, this means to just record the encoding. */ 670 tokenizer, this means to just record the encoding. */
671 671
672 static int 672 static int
673 buf_setreadl(struct tok_state *tok, const char* enc) { 673 buf_setreadl(PyTokenizer_State *tok, const char* enc) {
674 tok->enc = enc; 674 tok->enc = enc;
675 return 1; 675 return 1;
676 } 676 }
677 677
678 /* Return a UTF-8 encoding Python string object from the 678 /* Return a UTF-8 encoding Python string object from the
679 C byte string STR, which is encoded with ENC. */ 679 C byte string STR, which is encoded with ENC. */
680 680
681 static PyObject * 681 static PyObject *
682 translate_into_utf8(const char* str, const char* enc) { 682 translate_into_utf8(const char* str, const char* enc) {
683 PyObject *utf8; 683 PyObject *utf8;
684 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 684 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
685 if (buf == NULL) 685 if (buf == NULL)
686 return NULL; 686 return NULL;
687 utf8 = PyUnicode_AsUTF8String(buf); 687 utf8 = PyUnicode_AsUTF8String(buf);
688 Py_DECREF(buf); 688 Py_DECREF(buf);
689 return utf8; 689 return utf8;
690 } 690 }
691 691
692 692
693 static char * 693 static char *
694 translate_newlines(const char *s, int exec_input, struct tok_state *tok) { 694 translate_newlines(const char *s, int exec_input, PyTokenizer_State *tok) {
695 int skip_next_lf = 0; 695 int skip_next_lf = 0;
696 size_t needed_length = strlen(s) + 2, final_length; 696 size_t needed_length = strlen(s) + 2, final_length;
697 char *buf, *current; 697 char *buf, *current;
698 char c = '\0'; 698 char c = '\0';
699 buf = PyMem_MALLOC(needed_length); 699 buf = PyMem_MALLOC(needed_length);
700 if (buf == NULL) { 700 if (buf == NULL) {
701 tok->done = E_NOMEM; 701 tok->done = E_NOMEM;
702 return NULL; 702 return NULL;
703 } 703 }
704 for (current = buf; *s; s++, current++) { 704 for (current = buf; *s; s++, current++) {
(...skipping 24 matching lines...)
729 /* should never fail */ 729 /* should never fail */
730 buf = PyMem_REALLOC(buf, final_length); 730 buf = PyMem_REALLOC(buf, final_length);
731 return buf; 731 return buf;
732 } 732 }
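
Most of the loop body is elided above, but the intent (suggested by the name and the skip_next_lf flag) is newline normalization, with exec_input additionally guaranteeing a trailing newline. A sketch of the expected mapping:

    /* Input                      Output (exec_input != 0)
       "a = 1\r\nb = 2\r"    ->   "a = 1\nb = 2\n"
       "x = 3"               ->   "x = 3\n"  (newline appended)  */
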
733 733
734 /* Decode a byte string STR for use as the buffer of TOK. 734 /* Decode a byte string STR for use as the buffer of TOK.
735 Look for encoding declarations inside STR, and record them 735 Look for encoding declarations inside STR, and record them
736 inside TOK. */ 736 inside TOK. */
737 737
738 static const char * 738 static const char *
739 decode_str(const char *input, int single, struct tok_state *tok) 739 decode_str(const char *input, int single, PyTokenizer_State *tok)
740 { 740 {
741 PyObject* utf8 = NULL; 741 PyObject* utf8 = NULL;
742 const char *str; 742 const char *str;
743 const char *s; 743 const char *s;
744 const char *newl[2] = {NULL, NULL}; 744 const char *newl[2] = {NULL, NULL};
745 int lineno = 0; 745 int lineno = 0;
746 tok->input = str = translate_newlines(input, single, tok); 746 tok->input = str = translate_newlines(input, single, tok);
747 if (str == NULL) 747 if (str == NULL)
748 return NULL; 748 return NULL;
749 tok->enc = NULL; 749 tok->enc = NULL;
(...skipping 38 matching lines...)
788 } 788 }
789 assert(tok->decoding_buffer == NULL); 789 assert(tok->decoding_buffer == NULL);
790 tok->decoding_buffer = utf8; /* CAUTION */ 790 tok->decoding_buffer = utf8; /* CAUTION */
791 return str; 791 return str;
792 } 792 }
793 793
794 #endif /* PGEN */ 794 #endif /* PGEN */
795 795
796 /* Set up tokenizer for string */ 796 /* Set up tokenizer for string */
797 797
798 struct tok_state * 798 PyTokenizer_State *
799 PyTokenizer_FromString(const char *str, int exec_input) 799 PyTokenizer_FromString(const char *str, int exec_input)
800 { 800 {
801 struct tok_state *tok = tok_new(); 801 PyTokenizer_State *tok = tok_new();
802 if (tok == NULL) 802 if (tok == NULL)
803 return NULL; 803 return NULL;
804 str = decode_str(str, exec_input, tok); 804 str = decode_str(str, exec_input, tok);
805 if (str == NULL) { 805 if (str == NULL) {
806 PyTokenizer_Free(tok); 806 PyTokenizer_Free(tok);
807 return NULL; 807 return NULL;
808 } 808 }
809 809
810 /* XXX: constify members. */ 810 /* XXX: constify members. */
811 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 811 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
812 return tok; 812 return tok;
813 } 813 }
814 814
815 struct tok_state * 815 PyTokenizer_State *
816 PyTokenizer_FromUTF8(const char *str, int exec_input) 816 PyTokenizer_FromUTF8(const char *str, int exec_input)
817 { 817 {
818 struct tok_state *tok = tok_new(); 818 PyTokenizer_State *tok = tok_new();
819 if (tok == NULL) 819 if (tok == NULL)
820 return NULL; 820 return NULL;
821 #ifndef PGEN 821 #ifndef PGEN
822 tok->input = str = translate_newlines(str, exec_input, tok); 822 tok->input = str = translate_newlines(str, exec_input, tok);
823 #endif 823 #endif
824 if (str == NULL) { 824 if (str == NULL) {
825 PyTokenizer_Free(tok); 825 PyTokenizer_Free(tok);
826 return NULL; 826 return NULL;
827 } 827 }
828 tok->decoding_state = STATE_RAW; 828 tok->decoding_state = PYTOKENIZER_STATE_RAW;
829 tok->read_coding_spec = 1; 829 tok->read_coding_spec = 1;
830 tok->enc = NULL; 830 tok->enc = NULL;
831 tok->str = str; 831 tok->str = str;
832 tok->encoding = (char *)PyMem_MALLOC(6); 832 tok->encoding = (char *)PyMem_MALLOC(6);
833 if (!tok->encoding) { 833 if (!tok->encoding) {
834 PyTokenizer_Free(tok); 834 PyTokenizer_Free(tok);
835 return NULL; 835 return NULL;
836 } 836 }
837 strcpy(tok->encoding, "utf-8"); 837 strcpy(tok->encoding, "utf-8");
838 838
839 /* XXX: constify members. */ 839 /* XXX: constify members. */
840 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 840 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
841 return tok; 841 return tok;
842 } 842 }
843 843
844 /* Set up tokenizer for file */ 844 /* Set up tokenizer for file */
845 845
846 struct tok_state * 846 PyTokenizer_State *
847 PyTokenizer_FromFile(FILE *fp, const char* enc, 847 PyTokenizer_FromFile(FILE *fp, const char* enc,
848 const char *ps1, const char *ps2) 848 const char *ps1, const char *ps2)
849 { 849 {
850 struct tok_state *tok = tok_new(); 850 PyTokenizer_State *tok = tok_new();
851 if (tok == NULL) 851 if (tok == NULL)
852 return NULL; 852 return NULL;
853 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { 853 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
854 PyTokenizer_Free(tok); 854 PyTokenizer_Free(tok);
855 return NULL; 855 return NULL;
856 } 856 }
857 tok->cur = tok->inp = tok->buf; 857 tok->cur = tok->inp = tok->buf;
858 tok->end = tok->buf + BUFSIZ; 858 tok->end = tok->buf + BUFSIZ;
859 tok->fp = fp; 859 tok->fp = fp;
860 tok->prompt = ps1; 860 tok->prompt = ps1;
861 tok->nextprompt = ps2; 861 tok->nextprompt = ps2;
862 if (enc != NULL) { 862 if (enc != NULL) {
863 /* Must copy encoding declaration since it 863 /* Must copy encoding declaration since it
864 gets copied into the parse tree. */ 864 gets copied into the parse tree. */
865 tok->encoding = PyMem_MALLOC(strlen(enc)+1); 865 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
866 if (!tok->encoding) { 866 if (!tok->encoding) {
867 PyTokenizer_Free(tok); 867 PyTokenizer_Free(tok);
868 return NULL; 868 return NULL;
869 } 869 }
870 strcpy(tok->encoding, enc); 870 strcpy(tok->encoding, enc);
871 tok->decoding_state = STATE_NORMAL; 871 tok->decoding_state = PYTOKENIZER_STATE_NORMAL;
872 } 872 }
873 return tok; 873 return tok;
874 } 874 }
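
A minimal sketch of file-based use under the renamed API (error handling elided; ps1/ps2 are only consulted for interactive prompting, so NULL suffices for a regular file; "example.py" is a hypothetical path):

    FILE *fp = fopen("example.py", "rb");
    PyTokenizer_State *tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    /* ... drive PyTokenizer_Get() as sketched near the end of this file ... */
    PyTokenizer_Free(tok);
    fclose(fp);
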
875 875
876 876
877 /* Free a tok_state structure */ 877 /* Free a tok_state structure */
878 878
879 void 879 void
880 PyTokenizer_Free(struct tok_state *tok) 880 PyTokenizer_Free(PyTokenizer_State *tok)
881 { 881 {
882 if (tok->encoding != NULL) 882 if (tok->encoding != NULL)
883 PyMem_FREE(tok->encoding); 883 PyMem_FREE(tok->encoding);
884 #ifndef PGEN 884 #ifndef PGEN
885 Py_XDECREF(tok->decoding_readline); 885 Py_XDECREF(tok->decoding_readline);
886 Py_XDECREF(tok->decoding_buffer); 886 Py_XDECREF(tok->decoding_buffer);
887 Py_XDECREF(tok->filename); 887 Py_XDECREF(tok->filename);
888 #endif 888 #endif
889 if (tok->fp != NULL && tok->buf != NULL) 889 if (tok->fp != NULL && tok->buf != NULL)
890 PyMem_FREE(tok->buf); 890 PyMem_FREE(tok->buf);
891 if (tok->input) 891 if (tok->input)
892 PyMem_FREE((char *)tok->input); 892 PyMem_FREE((char *)tok->input);
893 PyMem_FREE(tok); 893 PyMem_FREE(tok);
894 } 894 }
895 895
896 /* Get next char, updating state; error code goes into tok->done */ 896 /* Get next char, updating state; error code goes into tok->done */
897 897
898 static int 898 static int
899 tok_nextc(struct tok_state *tok) 899 tok_nextc(PyTokenizer_State *tok)
900 { 900 {
901 for (;;) { 901 for (;;) {
902 if (tok->cur != tok->inp) { 902 if (tok->cur != tok->inp) {
903 return Py_CHARMASK(*tok->cur++); /* Fast path */ 903 return Py_CHARMASK(*tok->cur++); /* Fast path */
904 } 904 }
905 if (tok->done != E_OK) 905 if (tok->done != E_OK)
906 return EOF; 906 return EOF;
907 if (tok->fp == NULL) { 907 if (tok->fp == NULL) {
908 char *end = strchr(tok->inp, '\n'); 908 char *end = strchr(tok->inp, '\n');
909 if (end != NULL) 909 if (end != NULL)
(...skipping 176 matching lines...)
1086 return EOF; 1086 return EOF;
1087 } 1087 }
1088 } 1088 }
1089 /*NOTREACHED*/ 1089 /*NOTREACHED*/
1090 } 1090 }
1091 1091
1092 1092
1093 /* Back-up one character */ 1093 /* Back-up one character */
1094 1094
1095 static void 1095 static void
1096 tok_backup(struct tok_state *tok, int c) 1096 tok_backup(PyTokenizer_State *tok, int c)
1097 { 1097 {
1098 if (c != EOF) { 1098 if (c != EOF) {
1099 if (--tok->cur < tok->buf) 1099 if (--tok->cur < tok->buf)
1100 Py_FatalError("tok_backup: beginning of buffer"); 1100 Py_FatalError("tok_backup: beginning of buffer");
1101 if (*tok->cur != c) 1101 if (*tok->cur != c)
1102 *tok->cur = c; 1102 *tok->cur = c;
1103 } 1103 }
1104 } 1104 }
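
tok_nextc and tok_backup form the usual one-character lookahead pair; tok_get relies on this peek-then-decide idiom throughout:

    int c = tok_nextc(tok);     /* peek at the next character */
    if (c != '=')
        tok_backup(tok, c);     /* not part of this token: push it back */
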
1105 1105
1106 1106
1107 /* Return the token corresponding to a single character */ 1107 /* Return the token corresponding to a single character */
1108 1108
1109 int 1109 int
1110 PyToken_OneChar(int c) 1110 PyToken_OneChar(int c)
1111 { 1111 {
1112 switch (c) { 1112 switch (c) {
1113 case '(': return LPAR; 1113 case '(': return PYTOK_LPAR;
1114 case ')': return RPAR; 1114 case ')': return PYTOK_RPAR;
1115 case '[': return LSQB; 1115 case '[': return PYTOK_LSQB;
1116 case ']': return RSQB; 1116 case ']': return PYTOK_RSQB;
1117 case ':': return COLON; 1117 case ':': return PYTOK_COLON;
1118 case ',': return COMMA; 1118 case ',': return PYTOK_COMMA;
1119 case ';': return SEMI; 1119 case ';': return PYTOK_SEMI;
1120 case '+': return PLUS; 1120 case '+': return PYTOK_PLUS;
1121 case '-': return MINUS; 1121 case '-': return PYTOK_MINUS;
1122 case '*': return STAR; 1122 case '*': return PYTOK_STAR;
1123 case '/': return SLASH; 1123 case '/': return PYTOK_SLASH;
1124 case '|': return VBAR; 1124 case '|': return PYTOK_VBAR;
1125 case '&': return AMPER; 1125 case '&': return PYTOK_AMPER;
1126 case '<': return LESS; 1126 case '<': return PYTOK_LESS;
1127 case '>': return GREATER; 1127 case '>': return PYTOK_GREATER;
1128 case '=': return EQUAL; 1128 case '=': return PYTOK_EQUAL;
1129 case '.': return DOT; 1129 case '.': return PYTOK_DOT;
1130 case '%': return PERCENT; 1130 case '%': return PYTOK_PERCENT;
1131 case '{': return LBRACE; 1131 case '{': return PYTOK_LBRACE;
1132 case '}': return RBRACE; 1132 case '}': return PYTOK_RBRACE;
1133 case '^': return CIRCUMFLEX; 1133 case '^': return PYTOK_CIRCUMFLEX;
1134 case '~': return TILDE; 1134 case '~': return PYTOK_TILDE;
1135 case '@': return AT; 1135 case '@': return PYTOK_AT;
1136 default: return OP; 1136 default: return PYTOK_OP;
1137 } 1137 }
1138 } 1138 }
1139 1139
1140 1140
1141 int 1141 int
1142 PyToken_TwoChars(int c1, int c2) 1142 PyToken_TwoChars(int c1, int c2)
1143 { 1143 {
1144 switch (c1) { 1144 switch (c1) {
1145 case '=': 1145 case '=':
1146 switch (c2) { 1146 switch (c2) {
1147 case '=': return EQEQUAL; 1147 case '=': return PYTOK_EQEQUAL;
1148 } 1148 }
1149 break; 1149 break;
1150 case '!': 1150 case '!':
1151 switch (c2) { 1151 switch (c2) {
1152 case '=': return NOTEQUAL; 1152 case '=': return PYTOK_NOTEQUAL;
1153 } 1153 }
1154 break; 1154 break;
1155 case '<': 1155 case '<':
1156 switch (c2) { 1156 switch (c2) {
1157 case '>': return NOTEQUAL; 1157 case '>': return PYTOK_NOTEQUAL;
1158 case '=': return LESSEQUAL; 1158 case '=': return PYTOK_LESSEQUAL;
1159 case '<': return LEFTSHIFT; 1159 case '<': return PYTOK_LEFTSHIFT;
1160 } 1160 }
1161 break; 1161 break;
1162 case '>': 1162 case '>':
1163 switch (c2) { 1163 switch (c2) {
1164 case '=': return GREATEREQUAL; 1164 case '=': return PYTOK_GREATEREQUAL;
1165 case '>': return RIGHTSHIFT; 1165 case '>': return PYTOK_RIGHTSHIFT;
1166 } 1166 }
1167 break; 1167 break;
1168 case '+': 1168 case '+':
1169 switch (c2) { 1169 switch (c2) {
1170 case '=': return PLUSEQUAL; 1170 case '=': return PYTOK_PLUSEQUAL;
1171 } 1171 }
1172 break; 1172 break;
1173 case '-': 1173 case '-':
1174 switch (c2) { 1174 switch (c2) {
1175 case '=': return MINEQUAL; 1175 case '=': return PYTOK_MINEQUAL;
1176 case '>': return RARROW; 1176 case '>': return PYTOK_RARROW;
1177 } 1177 }
1178 break; 1178 break;
1179 case '*': 1179 case '*':
1180 switch (c2) { 1180 switch (c2) {
1181 case '*': return DOUBLESTAR; 1181 case '*': return PYTOK_DOUBLESTAR;
1182 case '=': return STAREQUAL; 1182 case '=': return PYTOK_STAREQUAL;
1183 } 1183 }
1184 break; 1184 break;
1185 case '/': 1185 case '/':
1186 switch (c2) { 1186 switch (c2) {
1187 case '/': return DOUBLESLASH; 1187 case '/': return PYTOK_DOUBLESLASH;
1188 case '=': return SLASHEQUAL; 1188 case '=': return PYTOK_SLASHEQUAL;
1189 } 1189 }
1190 break; 1190 break;
1191 case '|': 1191 case '|':
1192 switch (c2) { 1192 switch (c2) {
1193 case '=': return VBAREQUAL; 1193 case '=': return PYTOK_VBAREQUAL;
1194 } 1194 }
1195 break; 1195 break;
1196 case '%': 1196 case '%':
1197 switch (c2) { 1197 switch (c2) {
1198 case '=': return PERCENTEQUAL; 1198 case '=': return PYTOK_PERCENTEQUAL;
1199 } 1199 }
1200 break; 1200 break;
1201 case '&': 1201 case '&':
1202 switch (c2) { 1202 switch (c2) {
1203 case '=': return AMPEREQUAL; 1203 case '=': return PYTOK_AMPEREQUAL;
1204 } 1204 }
1205 break; 1205 break;
1206 case '^': 1206 case '^':
1207 switch (c2) { 1207 switch (c2) {
1208 case '=': return CIRCUMFLEXEQUAL; 1208 case '=': return PYTOK_CIRCUMFLEXEQUAL;
1209 } 1209 }
1210 break; 1210 break;
1211 case '@': 1211 case '@':
1212 switch (c2) { 1212 switch (c2) {
1213 case '=': return ATEQUAL; 1213 case '=': return PYTOK_ATEQUAL;
1214 } 1214 }
1215 break; 1215 break;
1216 } 1216 }
1217 return OP; 1217 return PYTOK_OP;
1218 } 1218 }
1219 1219
1220 int 1220 int
1221 PyToken_ThreeChars(int c1, int c2, int c3) 1221 PyToken_ThreeChars(int c1, int c2, int c3)
1222 { 1222 {
1223 switch (c1) { 1223 switch (c1) {
1224 case '<': 1224 case '<':
1225 switch (c2) { 1225 switch (c2) {
1226 case '<': 1226 case '<':
1227 switch (c3) { 1227 switch (c3) {
1228 case '=': 1228 case '=':
1229 return LEFTSHIFTEQUAL; 1229 return PYTOK_LEFTSHIFTEQUAL;
1230 } 1230 }
1231 break; 1231 break;
1232 } 1232 }
1233 break; 1233 break;
1234 case '>': 1234 case '>':
1235 switch (c2) { 1235 switch (c2) {
1236 case '>': 1236 case '>':
1237 switch (c3) { 1237 switch (c3) {
1238 case '=': 1238 case '=':
1239 return RIGHTSHIFTEQUAL; 1239 return PYTOK_RIGHTSHIFTEQUAL;
1240 } 1240 }
1241 break; 1241 break;
1242 } 1242 }
1243 break; 1243 break;
1244 case '*': 1244 case '*':
1245 switch (c2) { 1245 switch (c2) {
1246 case '*': 1246 case '*':
1247 switch (c3) { 1247 switch (c3) {
1248 case '=': 1248 case '=':
1249 return DOUBLESTAREQUAL; 1249 return PYTOK_DOUBLESTAREQUAL;
1250 } 1250 }
1251 break; 1251 break;
1252 } 1252 }
1253 break; 1253 break;
1254 case '/': 1254 case '/':
1255 switch (c2) { 1255 switch (c2) {
1256 case '/': 1256 case '/':
1257 switch (c3) { 1257 switch (c3) {
1258 case '=': 1258 case '=':
1259 return DOUBLESLASHEQUAL; 1259 return PYTOK_DOUBLESLASHEQUAL;
1260 } 1260 }
1261 break; 1261 break;
1262 } 1262 }
1263 break; 1263 break;
1264 case '.': 1264 case '.':
1265 switch (c2) { 1265 switch (c2) {
1266 case '.': 1266 case '.':
1267 switch (c3) { 1267 switch (c3) {
1268 case '.': 1268 case '.':
1269 return ELLIPSIS; 1269 return PYTOK_ELLIPSIS;
1270 } 1270 }
1271 break; 1271 break;
1272 } 1272 }
1273 break; 1273 break;
1274 } 1274 }
1275 return OP; 1275 return PYTOK_OP;
1276 } 1276 }
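
These three lookup functions compose longest-match-first in tok_get (see the two-character check further down). For instance, with the renamed constants:

    assert(PyToken_OneChar('*') == PYTOK_STAR);
    assert(PyToken_TwoChars('*', '=') == PYTOK_STAREQUAL);
    assert(PyToken_ThreeChars('*', '*', '=') == PYTOK_DOUBLESTAREQUAL);
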
1277 1277
1278 static int 1278 static int
1279 indenterror(struct tok_state *tok) 1279 indenterror(PyTokenizer_State *tok)
1280 { 1280 {
1281 if (tok->alterror) { 1281 if (tok->alterror) {
1282 tok->done = E_TABSPACE; 1282 tok->done = E_TABSPACE;
1283 tok->cur = tok->inp; 1283 tok->cur = tok->inp;
1284 return 1; 1284 return 1;
1285 } 1285 }
1286 if (tok->altwarning) { 1286 if (tok->altwarning) {
1287 #ifdef PGEN 1287 #ifdef PGEN
1288 PySys_WriteStderr("inconsistent use of tabs and spaces " 1288 PySys_WriteStderr("inconsistent use of tabs and spaces "
1289 "in indentation\n"); 1289 "in indentation\n");
1290 #else 1290 #else
1291 PySys_FormatStderr("%U: inconsistent use of tabs and spaces " 1291 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1292 "in indentation\n", tok->filename); 1292 "in indentation\n", tok->filename);
1293 #endif 1293 #endif
1294 tok->altwarning = 0; 1294 tok->altwarning = 0;
1295 } 1295 }
1296 return 0; 1296 return 0;
1297 } 1297 }
1298 1298
1299 #ifdef PGEN 1299 #ifdef PGEN
1300 #define verify_identifier(tok) 1 1300 #define verify_identifier(tok) 1
1301 #else 1301 #else
1302 /* Verify that the identifier follows PEP 3131. 1302 /* Verify that the identifier follows PEP 3131.
1303 All identifier strings are guaranteed to be "ready" unicode objects. 1303 All identifier strings are guaranteed to be "ready" unicode objects.
1304 */ 1304 */
1305 static int 1305 static int
1306 verify_identifier(struct tok_state *tok) 1306 verify_identifier(PyTokenizer_State *tok)
1307 { 1307 {
1308 PyObject *s; 1308 PyObject *s;
1309 int result; 1309 int result;
1310 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); 1310 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1311 if (s == NULL || PyUnicode_READY(s) == -1) { 1311 if (s == NULL || PyUnicode_READY(s) == -1) {
1312 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 1312 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1313 PyErr_Clear(); 1313 PyErr_Clear();
1314 tok->done = E_IDENTIFIER; 1314 tok->done = E_IDENTIFIER;
1315 } else { 1315 } else {
1316 tok->done = E_ERROR; 1316 tok->done = E_ERROR;
1317 } 1317 }
1318 return 0; 1318 return 0;
1319 } 1319 }
1320 result = PyUnicode_IsIdentifier(s); 1320 result = PyUnicode_IsIdentifier(s);
1321 Py_DECREF(s); 1321 Py_DECREF(s);
1322 if (result == 0) 1322 if (result == 0)
1323 tok->done = E_IDENTIFIER; 1323 tok->done = E_IDENTIFIER;
1324 return result; 1324 return result;
1325 } 1325 }
1326 #endif 1326 #endif
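
verify_identifier is only reached for names containing non-ASCII bytes; PyUnicode_IsIdentifier then applies the PEP 3131 character classes. An illustrative rejection (UTF-8 input, as the tokenizer guarantees at this point):

    PyObject *s = PyUnicode_FromString("a\xe2\x82\xacb");  /* "a€b" */
    assert(PyUnicode_IsIdentifier(s) == 0);  /* U+20AC is not XID_Continue */
    Py_DECREF(s);
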
1327 1327
1328 /* Get next token, after space stripping etc. */ 1328 /* Get next token, after space stripping etc. */
1329 1329
1330 static int 1330 static int
1331 tok_get(struct tok_state *tok, char **p_start, char **p_end) 1331 tok_get(PyTokenizer_State *tok, char **p_start, char **p_end)
1332 { 1332 {
1333 int c; 1333 int c;
1334 int blankline, nonascii; 1334 int blankline, nonascii;
1335 1335
1336 *p_start = *p_end = NULL; 1336 *p_start = *p_end = NULL;
1337 nextline: 1337 nextline:
1338 tok->start = NULL; 1338 tok->start = NULL;
1339 blankline = 0; 1339 blankline = 0;
1340 1340
1341 /* Get indentation level */ 1341 /* Get indentation level */
(...skipping 12 matching lines...)
1354 } 1354 }
1355 else if (c == '\014') /* Control-L (formfeed) */ 1355 else if (c == '\014') /* Control-L (formfeed) */
1356 col = altcol = 0; /* For Emacs users */ 1356 col = altcol = 0; /* For Emacs users */
1357 else 1357 else
1358 break; 1358 break;
1359 } 1359 }
1360 tok_backup(tok, c); 1360 tok_backup(tok, c);
1361 if (c == '#' || c == '\n') { 1361 if (c == '#' || c == '\n') {
1362 /* Lines with only whitespace and/or comments 1362 /* Lines with only whitespace and/or comments
1363 shouldn't affect the indentation and are 1363 shouldn't affect the indentation and are
1364 not passed to the parser as NEWLINE tokens, 1364 not passed to the parser as PYTOK_NEWLINE tokens,
1365 except *totally* empty lines in interactive 1365 except *totally* empty lines in interactive
1366 mode, which signal the end of a command group. */ 1366 mode, which signal the end of a command group. */
1367 if (col == 0 && c == '\n' && tok->prompt != NULL) 1367 if (col == 0 && c == '\n' && tok->prompt != NULL)
1368 blankline = 0; /* Let it through */ 1368 blankline = 0; /* Let it through */
1369 else 1369 else
1370 blankline = 1; /* Ignore completely */ 1370 blankline = 1; /* Ignore completely */
1371 /* We can't jump back right here since we still 1371 /* We can't jump back right here since we still
1372 may need to skip to the end of a comment */ 1372 may need to skip to the end of a comment */
1373 } 1373 }
1374 if (!blankline && tok->level == 0) { 1374 if (!blankline && tok->level == 0) {
1375 if (col == tok->indstack[tok->indent]) { 1375 if (col == tok->indstack[tok->indent]) {
1376 /* No change */ 1376 /* No change */
1377 if (altcol != tok->altindstack[tok->indent]) { 1377 if (altcol != tok->altindstack[tok->indent]) {
1378 if (indenterror(tok)) 1378 if (indenterror(tok))
1379 return ERRORTOKEN; 1379 return PYTOK_ERRORTOKEN;
1380 } 1380 }
1381 } 1381 }
1382 else if (col > tok->indstack[tok->indent]) { 1382 else if (col > tok->indstack[tok->indent]) {
1383 /* Indent -- always one */ 1383 /* Indent -- always one */
1384 if (tok->indent+1 >= MAXINDENT) { 1384 if (tok->indent+1 >= PYTOKENIZER_MAXINDENT) {
1385 tok->done = E_TOODEEP; 1385 tok->done = E_TOODEEP;
1386 tok->cur = tok->inp; 1386 tok->cur = tok->inp;
1387 return ERRORTOKEN; 1387 return PYTOK_ERRORTOKEN;
1388 } 1388 }
1389 if (altcol <= tok->altindstack[tok->indent]) { 1389 if (altcol <= tok->altindstack[tok->indent]) {
1390 if (indenterror(tok)) 1390 if (indenterror(tok))
1391 return ERRORTOKEN; 1391 return PYTOK_ERRORTOKEN;
1392 } 1392 }
1393 tok->pendin++; 1393 tok->pendin++;
1394 tok->indstack[++tok->indent] = col; 1394 tok->indstack[++tok->indent] = col;
1395 tok->altindstack[tok->indent] = altcol; 1395 tok->altindstack[tok->indent] = altcol;
1396 } 1396 }
1397 else /* col < tok->indstack[tok->indent] */ { 1397 else /* col < tok->indstack[tok->indent] */ {
1398 /* Dedent -- any number, must be consistent */ 1398 /* Dedent -- any number, must be consistent */
1399 while (tok->indent > 0 && 1399 while (tok->indent > 0 &&
1400 col < tok->indstack[tok->indent]) { 1400 col < tok->indstack[tok->indent]) {
1401 tok->pendin--; 1401 tok->pendin--;
1402 tok->indent--; 1402 tok->indent--;
1403 } 1403 }
1404 if (col != tok->indstack[tok->indent]) { 1404 if (col != tok->indstack[tok->indent]) {
1405 tok->done = E_DEDENT; 1405 tok->done = E_DEDENT;
1406 tok->cur = tok->inp; 1406 tok->cur = tok->inp;
1407 return ERRORTOKEN; 1407 return PYTOK_ERRORTOKEN;
1408 } 1408 }
1409 if (altcol != tok->altindstack[tok->indent]) { 1409 if (altcol != tok->altindstack[tok->indent]) {
1410 if (indenterror(tok)) 1410 if (indenterror(tok))
1411 return ERRORTOKEN; 1411 return PYTOK_ERRORTOKEN;
1412 } 1412 }
1413 } 1413 }
1414 } 1414 }
1415 } 1415 }
1416 1416
1417 tok->start = tok->cur; 1417 tok->start = tok->cur;
1418 1418
1419 /* Return pending indents/dedents */ 1419 /* Return pending indents/dedents */
1420 if (tok->pendin != 0) { 1420 if (tok->pendin != 0) {
1421 if (tok->pendin < 0) { 1421 if (tok->pendin < 0) {
1422 tok->pendin++; 1422 tok->pendin++;
1423 return DEDENT; 1423 return PYTOK_DEDENT;
1424 } 1424 }
1425 else { 1425 else {
1426 tok->pendin--; 1426 tok->pendin--;
1427 return INDENT; 1427 return PYTOK_INDENT;
1428 } 1428 }
1429 } 1429 }
1430 1430
1431 again: 1431 again:
1432 tok->start = NULL; 1432 tok->start = NULL;
1433 /* Skip spaces */ 1433 /* Skip spaces */
1434 do { 1434 do {
1435 c = tok_nextc(tok); 1435 c = tok_nextc(tok);
1436 } while (c == ' ' || c == '\t' || c == '\014'); 1436 } while (c == ' ' || c == '\t' || c == '\014');
1437 1437
1438 /* Set start of current token */ 1438 /* Set start of current token */
1439 tok->start = tok->cur - 1; 1439 tok->start = tok->cur - 1;
1440 1440
1441 /* Skip comment */ 1441 /* Skip comment */
1442 if (c == '#') 1442 if (c == '#')
1443 while (c != EOF && c != '\n') 1443 while (c != EOF && c != '\n')
1444 c = tok_nextc(tok); 1444 c = tok_nextc(tok);
1445 1445
1446 /* Check for EOF and errors now */ 1446 /* Check for EOF and errors now */
1447 if (c == EOF) { 1447 if (c == EOF) {
1448 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1448 return tok->done == E_EOF ? PYTOK_ENDMARKER : PYTOK_ERRORTOKEN;
1449 } 1449 }
1450 1450
1451 /* Identifier (most frequent token!) */ 1451 /* Identifier (most frequent token!) */
1452 nonascii = 0; 1452 nonascii = 0;
1453 if (is_potential_identifier_start(c)) { 1453 if (is_potential_identifier_start(c)) {
1454 /* Process b"", r"", u"", br"" and rb"" */ 1454 /* Process b"", r"", u"", br"" and rb"" */
1455 int saw_b = 0, saw_r = 0, saw_u = 0; 1455 int saw_b = 0, saw_r = 0, saw_u = 0;
1456 while (1) { 1456 while (1) {
1457 if (!(saw_b || saw_u) && (c == 'b' || c == 'B')) 1457 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
1458 saw_b = 1; 1458 saw_b = 1;
(...skipping 12 matching lines...) Expand all
1471 } 1471 }
1472 while (is_potential_identifier_char(c)) { 1472 while (is_potential_identifier_char(c)) {
1473 if (c >= 128) 1473 if (c >= 128)
1474 nonascii = 1; 1474 nonascii = 1;
1475 c = tok_nextc(tok); 1475 c = tok_nextc(tok);
1476 } 1476 }
1477 tok_backup(tok, c); 1477 tok_backup(tok, c);
1478 if (nonascii && 1478 if (nonascii &&
1479 !verify_identifier(tok)) { 1479 !verify_identifier(tok)) {
1480 tok->done = E_IDENTIFIER; 1480 tok->done = E_IDENTIFIER;
1481 return ERRORTOKEN; 1481 return PYTOK_ERRORTOKEN;
1482 } 1482 }
1483 *p_start = tok->start; 1483 *p_start = tok->start;
1484 *p_end = tok->cur; 1484 *p_end = tok->cur;
1485 return NAME; 1485 return PYTOK_NAME;
1486 } 1486 }
1487 1487
1488 /* Newline */ 1488 /* Newline */
1489 if (c == '\n') { 1489 if (c == '\n') {
1490 tok->atbol = 1; 1490 tok->atbol = 1;
1491 if (blankline || tok->level > 0) 1491 if (blankline || tok->level > 0)
1492 goto nextline; 1492 goto nextline;
1493 *p_start = tok->start; 1493 *p_start = tok->start;
1494 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1494 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1495 tok->cont_line = 0; 1495 tok->cont_line = 0;
1496 return NEWLINE; 1496 return PYTOK_NEWLINE;
1497 } 1497 }
1498 1498
1499 /* Period or number starting with period? */ 1499 /* Period or number starting with period? */
1500 if (c == '.') { 1500 if (c == '.') {
1501 c = tok_nextc(tok); 1501 c = tok_nextc(tok);
1502 if (isdigit(c)) { 1502 if (isdigit(c)) {
1503 goto fraction; 1503 goto fraction;
1504 } else if (c == '.') { 1504 } else if (c == '.') {
1505 c = tok_nextc(tok); 1505 c = tok_nextc(tok);
1506 if (c == '.') { 1506 if (c == '.') {
1507 *p_start = tok->start; 1507 *p_start = tok->start;
1508 *p_end = tok->cur; 1508 *p_end = tok->cur;
1509 return ELLIPSIS; 1509 return PYTOK_ELLIPSIS;
1510 } else { 1510 } else {
1511 tok_backup(tok, c); 1511 tok_backup(tok, c);
1512 } 1512 }
1513 tok_backup(tok, '.'); 1513 tok_backup(tok, '.');
1514 } else { 1514 } else {
1515 tok_backup(tok, c); 1515 tok_backup(tok, c);
1516 } 1516 }
1517 *p_start = tok->start; 1517 *p_start = tok->start;
1518 *p_end = tok->cur; 1518 *p_end = tok->cur;
1519 return DOT; 1519 return PYTOK_DOT;
1520 } 1520 }
1521 1521
1522 /* Number */ 1522 /* Number */
1523 if (isdigit(c)) { 1523 if (isdigit(c)) {
1524 if (c == '0') { 1524 if (c == '0') {
1525 /* Hex, octal or binary -- maybe. */ 1525 /* Hex, octal or binary -- maybe. */
1526 c = tok_nextc(tok); 1526 c = tok_nextc(tok);
1527 if (c == '.') 1527 if (c == '.')
1528 goto fraction; 1528 goto fraction;
1529 if (c == 'j' || c == 'J') 1529 if (c == 'j' || c == 'J')
1530 goto imaginary; 1530 goto imaginary;
1531 if (c == 'x' || c == 'X') { 1531 if (c == 'x' || c == 'X') {
1532 1532
1533 /* Hex */ 1533 /* Hex */
1534 c = tok_nextc(tok); 1534 c = tok_nextc(tok);
1535 if (!isxdigit(c)) { 1535 if (!isxdigit(c)) {
1536 tok->done = E_TOKEN; 1536 tok->done = E_TOKEN;
1537 tok_backup(tok, c); 1537 tok_backup(tok, c);
1538 return ERRORTOKEN; 1538 return PYTOK_ERRORTOKEN;
1539 } 1539 }
1540 do { 1540 do {
1541 c = tok_nextc(tok); 1541 c = tok_nextc(tok);
1542 } while (isxdigit(c)); 1542 } while (isxdigit(c));
1543 } 1543 }
1544 else if (c == 'o' || c == 'O') { 1544 else if (c == 'o' || c == 'O') {
1545 /* Octal */ 1545 /* Octal */
1546 c = tok_nextc(tok); 1546 c = tok_nextc(tok);
1547 if (c < '0' || c >= '8') { 1547 if (c < '0' || c >= '8') {
1548 tok->done = E_TOKEN; 1548 tok->done = E_TOKEN;
1549 tok_backup(tok, c); 1549 tok_backup(tok, c);
1550 return ERRORTOKEN; 1550 return PYTOK_ERRORTOKEN;
1551 } 1551 }
1552 do { 1552 do {
1553 c = tok_nextc(tok); 1553 c = tok_nextc(tok);
1554 } while ('0' <= c && c < '8'); 1554 } while ('0' <= c && c < '8');
1555 } 1555 }
1556 else if (c == 'b' || c == 'B') { 1556 else if (c == 'b' || c == 'B') {
1557 /* Binary */ 1557 /* Binary */
1558 c = tok_nextc(tok); 1558 c = tok_nextc(tok);
1559 if (c != '0' && c != '1') { 1559 if (c != '0' && c != '1') {
1560 tok->done = E_TOKEN; 1560 tok->done = E_TOKEN;
1561 tok_backup(tok, c); 1561 tok_backup(tok, c);
1562 return ERRORTOKEN; 1562 return PYTOK_ERRORTOKEN;
1563 } 1563 }
1564 do { 1564 do {
1565 c = tok_nextc(tok); 1565 c = tok_nextc(tok);
1566 } while (c == '0' || c == '1'); 1566 } while (c == '0' || c == '1');
1567 } 1567 }
1568 else { 1568 else {
1569 int nonzero = 0; 1569 int nonzero = 0;
1570 /* maybe old-style octal; c is first char of it */ 1570 /* maybe old-style octal; c is first char of it */
1571 /* in any case, allow '0' as a literal */ 1571 /* in any case, allow '0' as a literal */
1572 while (c == '0') 1572 while (c == '0')
1573 c = tok_nextc(tok); 1573 c = tok_nextc(tok);
1574 while (isdigit(c)) { 1574 while (isdigit(c)) {
1575 nonzero = 1; 1575 nonzero = 1;
1576 c = tok_nextc(tok); 1576 c = tok_nextc(tok);
1577 } 1577 }
1578 if (c == '.') 1578 if (c == '.')
1579 goto fraction; 1579 goto fraction;
1580 else if (c == 'e' || c == 'E') 1580 else if (c == 'e' || c == 'E')
1581 goto exponent; 1581 goto exponent;
1582 else if (c == 'j' || c == 'J') 1582 else if (c == 'j' || c == 'J')
1583 goto imaginary; 1583 goto imaginary;
1584 else if (nonzero) { 1584 else if (nonzero) {
1585 tok->done = E_TOKEN; 1585 tok->done = E_TOKEN;
1586 tok_backup(tok, c); 1586 tok_backup(tok, c);
1587 return ERRORTOKEN; 1587 return PYTOK_ERRORTOKEN;
1588 } 1588 }
1589 } 1589 }
1590 } 1590 }
1591 else { 1591 else {
1592 /* Decimal */ 1592 /* Decimal */
1593 do { 1593 do {
1594 c = tok_nextc(tok); 1594 c = tok_nextc(tok);
1595 } while (isdigit(c)); 1595 } while (isdigit(c));
1596 { 1596 {
1597 /* Accept floating point numbers. */ 1597 /* Accept floating point numbers. */
1598 if (c == '.') { 1598 if (c == '.') {
1599 fraction: 1599 fraction:
1600 /* Fraction */ 1600 /* Fraction */
1601 do { 1601 do {
1602 c = tok_nextc(tok); 1602 c = tok_nextc(tok);
1603 } while (isdigit(c)); 1603 } while (isdigit(c));
1604 } 1604 }
1605 if (c == 'e' || c == 'E') { 1605 if (c == 'e' || c == 'E') {
1606 int e; 1606 int e;
1607 exponent: 1607 exponent:
1608 e = c; 1608 e = c;
1609 /* Exponent part */ 1609 /* Exponent part */
1610 c = tok_nextc(tok); 1610 c = tok_nextc(tok);
1611 if (c == '+' || c == '-') { 1611 if (c == '+' || c == '-') {
1612 c = tok_nextc(tok); 1612 c = tok_nextc(tok);
1613 if (!isdigit(c)) { 1613 if (!isdigit(c)) {
1614 tok->done = E_TOKEN; 1614 tok->done = E_TOKEN;
1615 tok_backup(tok, c); 1615 tok_backup(tok, c);
1616 return ERRORTOKEN; 1616 return PYTOK_ERRORTOKEN;
1617 } 1617 }
1618 } else if (!isdigit(c)) { 1618 } else if (!isdigit(c)) {
1619 tok_backup(tok, c); 1619 tok_backup(tok, c);
1620 tok_backup(tok, e); 1620 tok_backup(tok, e);
1621 *p_start = tok->start; 1621 *p_start = tok->start;
1622 *p_end = tok->cur; 1622 *p_end = tok->cur;
1623 return NUMBER; 1623 return PYTOK_NUMBER;
1624 } 1624 }
1625 do { 1625 do {
1626 c = tok_nextc(tok); 1626 c = tok_nextc(tok);
1627 } while (isdigit(c)); 1627 } while (isdigit(c));
1628 } 1628 }
1629 if (c == 'j' || c == 'J') 1629 if (c == 'j' || c == 'J')
1630 /* Imaginary part */ 1630 /* Imaginary part */
1631 imaginary: 1631 imaginary:
1632 c = tok_nextc(tok); 1632 c = tok_nextc(tok);
1633 } 1633 }
1634 } 1634 }
1635 tok_backup(tok, c); 1635 tok_backup(tok, c);
1636 *p_start = tok->start; 1636 *p_start = tok->start;
1637 *p_end = tok->cur; 1637 *p_end = tok->cur;
1638 return NUMBER; 1638 return PYTOK_NUMBER;
1639 } 1639 }
1640 1640
1641 letter_quote: 1641 letter_quote:
1642 /* String */ 1642 /* String */
1643 if (c == '\'' || c == '"') { 1643 if (c == '\'' || c == '"') {
1644 int quote = c; 1644 int quote = c;
1645 int quote_size = 1; /* 1 or 3 */ 1645 int quote_size = 1; /* 1 or 3 */
1646 int end_quote_size = 0; 1646 int end_quote_size = 0;
1647 1647
1648 /* Find the quote size and start of string */ 1648 /* Find the quote size and start of string */
(...skipping 10 matching lines...)
1659 1659
1660 /* Get rest of string */ 1660 /* Get rest of string */
1661 while (end_quote_size != quote_size) { 1661 while (end_quote_size != quote_size) {
1662 c = tok_nextc(tok); 1662 c = tok_nextc(tok);
1663 if (c == EOF) { 1663 if (c == EOF) {
1664 if (quote_size == 3) 1664 if (quote_size == 3)
1665 tok->done = E_EOFS; 1665 tok->done = E_EOFS;
1666 else 1666 else
1667 tok->done = E_EOLS; 1667 tok->done = E_EOLS;
1668 tok->cur = tok->inp; 1668 tok->cur = tok->inp;
1669 return ERRORTOKEN; 1669 return PYTOK_ERRORTOKEN;
1670 } 1670 }
1671 if (quote_size == 1 && c == '\n') { 1671 if (quote_size == 1 && c == '\n') {
1672 tok->done = E_EOLS; 1672 tok->done = E_EOLS;
1673 tok->cur = tok->inp; 1673 tok->cur = tok->inp;
1674 return ERRORTOKEN; 1674 return PYTOK_ERRORTOKEN;
1675 } 1675 }
1676 if (c == quote) 1676 if (c == quote)
1677 end_quote_size += 1; 1677 end_quote_size += 1;
1678 else { 1678 else {
1679 end_quote_size = 0; 1679 end_quote_size = 0;
1680 if (c == '\\') 1680 if (c == '\\')
1681 c = tok_nextc(tok); /* skip escaped char */ 1681 c = tok_nextc(tok); /* skip escaped char */
1682 } 1682 }
1683 } 1683 }
1684 1684
1685 *p_start = tok->start; 1685 *p_start = tok->start;
1686 *p_end = tok->cur; 1686 *p_end = tok->cur;
1687 return STRING; 1687 return PYTOK_STRING;
1688 } 1688 }
1689 1689
1690 /* Line continuation */ 1690 /* Line continuation */
1691 if (c == '\\') { 1691 if (c == '\\') {
1692 c = tok_nextc(tok); 1692 c = tok_nextc(tok);
1693 if (c != '\n') { 1693 if (c != '\n') {
1694 tok->done = E_LINECONT; 1694 tok->done = E_LINECONT;
1695 tok->cur = tok->inp; 1695 tok->cur = tok->inp;
1696 return ERRORTOKEN; 1696 return PYTOK_ERRORTOKEN;
1697 } 1697 }
1698 tok->cont_line = 1; 1698 tok->cont_line = 1;
1699 goto again; /* Read next line */ 1699 goto again; /* Read next line */
1700 } 1700 }
1701 1701
1702 /* Check for two-character token */ 1702 /* Check for two-character token */
1703 { 1703 {
1704 int c2 = tok_nextc(tok); 1704 int c2 = tok_nextc(tok);
1705 int token = PyToken_TwoChars(c, c2); 1705 int token = PyToken_TwoChars(c, c2);
1706 if (token != OP) { 1706 if (token != PYTOK_OP) {
1707 int c3 = tok_nextc(tok); 1707 int c3 = tok_nextc(tok);
1708 int token3 = PyToken_ThreeChars(c, c2, c3); 1708 int token3 = PyToken_ThreeChars(c, c2, c3);
1709 if (token3 != OP) { 1709 if (token3 != PYTOK_OP) {
1710 token = token3; 1710 token = token3;
1711 } else { 1711 } else {
1712 tok_backup(tok, c3); 1712 tok_backup(tok, c3);
1713 } 1713 }
1714 *p_start = tok->start; 1714 *p_start = tok->start;
1715 *p_end = tok->cur; 1715 *p_end = tok->cur;
1716 return token; 1716 return token;
1717 } 1717 }
1718 tok_backup(tok, c2); 1718 tok_backup(tok, c2);
1719 } 1719 }
(...skipping 12 matching lines...)
1732 break; 1732 break;
1733 } 1733 }
1734 1734
1735 /* Punctuation character */ 1735 /* Punctuation character */
1736 *p_start = tok->start; 1736 *p_start = tok->start;
1737 *p_end = tok->cur; 1737 *p_end = tok->cur;
1738 return PyToken_OneChar(c); 1738 return PyToken_OneChar(c);
1739 } 1739 }
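
As a compact trace of the indentation machinery above, the two-line source "if x:\n    y\n" tokenizes as:

    /* NAME("if") NAME("x") COLON NEWLINE
       INDENT NAME("y") NEWLINE DEDENT ENDMARKER
       (PYTOK_ prefixes omitted; 'if' is a plain NAME here -- keyword
       classification happens later, in the parser). */
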
1740 1740
1741 int 1741 int
1742 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1742 PyTokenizer_Get(PyTokenizer_State *tok, char **p_start, char **p_end)
1743 { 1743 {
1744 int result = tok_get(tok, p_start, p_end); 1744 int result = tok_get(tok, p_start, p_end);
1745 if (tok->decoding_erred) { 1745 if (tok->decoding_erred) {
1746 result = ERRORTOKEN; 1746 result = PYTOK_ERRORTOKEN;
1747 tok->done = E_DECODE; 1747 tok->done = E_DECODE;
1748 } 1748 }
1749 return result; 1749 return result;
1750 } 1750 }
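
Putting the public surface together, a minimal end-to-end sketch of the API this patch exposes (names as introduced in this file; error handling kept to a bare minimum):

    #include "Python.h"
    #include "tokenizer.h"
    #include <stdio.h>

    static void
    dump_tokens(const char *source)
    {
        PyTokenizer_State *tok = PyTokenizer_FromString(source, 1);
        char *start, *end;
        int type;
        if (tok == NULL)
            return;
        while ((type = PyTokenizer_Get(tok, &start, &end)) != PYTOK_ENDMARKER) {
            if (type == PYTOK_ERRORTOKEN)
                break;              /* tok->done holds the error code */
            if (start != NULL && end != NULL)
                printf("%s: %.*s\n", _PyParser_TokenNames[type],
                       (int)(end - start), start);
            else
                printf("%s\n", _PyParser_TokenNames[type]);  /* INDENT/DEDENT */
        }
        PyTokenizer_Free(tok);
    }
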
1751 1751
1752 /* Get the encoding of a Python file. Check for the coding cookie and check if 1752 /* Get the encoding of a Python file. Check for the coding cookie and check if
1753 the file starts with a BOM. 1753 the file starts with a BOM.
1754 1754
1755 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the 1755 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1756 encoding in the first or second line of the file (in which case the encoding 1756 encoding in the first or second line of the file (in which case the encoding
1757 should be assumed to be UTF-8). 1757 should be assumed to be UTF-8).
1758 1758
1759 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed 1759 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1760 by the caller. */ 1760 by the caller. */
1761 1761
1762 char * 1762 char *
1763 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) 1763 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1764 { 1764 {
1765 struct tok_state *tok; 1765 PyTokenizer_State *tok;
1766 FILE *fp; 1766 FILE *fp;
1767 char *p_start =NULL , *p_end =NULL , *encoding = NULL; 1767 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1768 1768
1769 #ifndef PGEN 1769 #ifndef PGEN
1770 fd = _Py_dup(fd); 1770 fd = _Py_dup(fd);
1771 #else 1771 #else
1772 fd = dup(fd); 1772 fd = dup(fd);
1773 #endif 1773 #endif
1774 if (fd < 0) { 1774 if (fd < 0) {
1775 return NULL; 1775 return NULL;
(...skipping 40 matching lines...)
1816 { 1816 {
1817 return PyTokenizer_FindEncodingFilename(fd, NULL); 1817 return PyTokenizer_FindEncodingFilename(fd, NULL);
1818 } 1818 }
1819 1819
1820 #ifdef Py_DEBUG 1820 #ifdef Py_DEBUG
1821 1821
1822 void 1822 void
1823 tok_dump(int type, char *start, char *end) 1823 tok_dump(int type, char *start, char *end)
1824 { 1824 {
1825 printf("%s", _PyParser_TokenNames[type]); 1825 printf("%s", _PyParser_TokenNames[type]);
1826 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1826 if (type == PYTOK_NAME || type == PYTOK_NUMBER || type == PYTOK_STRING || type == PYTOK_OP)
1827 printf("(%.*s)", (int)(end - start), start); 1827 printf("(%.*s)", (int)(end - start), start);
1828 } 1828 }
1829 1829
1830 #endif 1830 #endif