diff --git a/Doc/c-api/tokenizer.rst b/Doc/c-api/tokenizer.rst new file mode 100644 --- /dev/null +++ b/Doc/c-api/tokenizer.rst @@ -0,0 +1,61 @@ +.. highlightlang:: c + +.. _tokenizer: + +Tokenizing Python Code +====================== + +.. sectionauthor:: Dustin J. Mitchell + +.. index:: + tokenizer + +These routines allow C code to break Python code into a stream of tokens. +The token constants match those defined in :mod:`token`. + +.. c:type:: PyTokenizer_State + + The C structure used to represent the state of a tokenizer. + +.. c:function:: PyTokenizer_State *PyTokenizer_FromString(const char *string, int exec_input) + + :param string: string to convert to tokens + :param exec_input: true if the input is from an ``exec`` call + + Initialize a tokenizer to read from a C string. + If ``exec_input`` is true, then an implicit newline will be added to the end of the string. + +.. c:function:: PyTokenizer_State *PyTokenizer_FromUTF8(const char *string, int exec_input) + + :param string: UTF-8 encoded string to convert to tokens + :param exec_input: true if the input is from an ``exec`` call + + Initialize a tokenizer to read from a UTF-8 encoded C string. + If ``exec_input`` is true, then an implicit newline will be added to the end of the string. + +.. c:function:: PyTokenizer_State *PyTokenizer_FromFile(FILE *fp, const char *encoding, const char *ps1, const char *ps2) + + :param fp: file to tokenize + :param encoding: encoding of the file contents + :param ps1: initial-line interactive prompt + :param ps2: subsequent-line interactive prompt + + Initialize a tokenizer to read from a file. + The file data is decoded using ``encoding``, if given. + If ``ps1`` and ``ps2`` are not NULL, the tokenizer will operate in interactive mode. + +.. c:function:: void PyTokenizer_Free(PyTokenizer_State *state) + + :param state: tokenizer state + + Free the given tokenizer. + +.. 
c:function:: int PyTokenizer_Get(PyTokenizer_State *state, char **p_start, char **p_end) + + :param state: tokenizer state + :param p_start: (output) first character of the returned token + :param p_end: (output) first character following the returned token + :return: token + + Get the next token from the tokenizer. + The ``p_start`` and ``p_end`` output parameters give the boundaries of the returned token. diff --git a/Doc/c-api/utilities.rst b/Doc/c-api/utilities.rst --- a/Doc/c-api/utilities.rst +++ b/Doc/c-api/utilities.rst @@ -19,3 +19,4 @@ and parsing function arguments and const conversion.rst reflection.rst codec.rst + tokenizer.rst diff --git a/Include/tokenizer.h b/Include/tokenizer.h new file mode 100644 --- /dev/null +++ b/Include/tokenizer.h @@ -0,0 +1,83 @@ +#ifndef Py_TOKENIZER_H +#define Py_TOKENIZER_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "object.h" + +/* Tokenizer interface */ + +#include "token.h" /* For token types */ + +typedef struct PyTokenizer_State PyTokenizer_State; + +#ifndef Py_LIMITED_API +#define MAXINDENT 100 /* Max indentation level */ + +enum decoding_state { + STATE_INIT, + STATE_RAW, + STATE_NORMAL /* have a codec associated with input */ +}; + +struct PyTokenizer_State { + /* Input state; buf <= cur <= inp <= end */ + /* NB an entire line is held in the buffer */ + char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ + char *cur; /* Next character in buffer */ + char *inp; /* End of data in buffer */ + char *end; /* End of input buffer if buf != NULL */ + char *start; /* Start of current token if not NULL */ + int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ + /* NB If done != E_OK, cur must be == inp!!! 
*/ + FILE *fp; /* Rest of input; NULL if tokenizing a string */ + int tabsize; /* Tab spacing */ + int indent; /* Current indentation index */ + int indstack[MAXINDENT]; /* Stack of indents */ + int atbol; /* Nonzero if at begin of new line */ + int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ + const char *prompt, *nextprompt; /* For interactive prompting */ + int lineno; /* Current line number */ + int level; /* () [] {} Parentheses nesting level */ + /* Used to allow free continuations inside them */ + /* Stuff for checking on different tab sizes */ +#ifndef PGEN + /* pgen doesn't have access to Python codecs, it cannot decode the input + filename. The bytes filename might be kept, but it is only used by + indenterror() and it is not really needed: pgen only compiles one file + (Grammar/Grammar). */ + PyObject *filename; +#endif + int altwarning; /* Issue warning if alternate tabs don't match */ + int alterror; /* Issue error if alternate tabs don't match */ + int alttabsize; /* Alternate tab spacing */ + int altindstack[MAXINDENT]; /* Stack of alternate indents */ + /* Stuff for PEP 0263 */ + enum decoding_state decoding_state; + int decoding_erred; /* whether erred in decoding */ + int read_coding_spec; /* whether 'coding:...' has been read */ + char *encoding; /* Source encoding. */ + int cont_line; /* whether we are in a continuation line. */ + const char* line_start; /* pointer to start of current line */ +#ifndef PGEN + PyObject *decoding_readline; /* open(...).readline */ + PyObject *decoding_buffer; +#endif + const char* enc; /* Encoding for the current str. */ + const char* str; + const char* input; /* Tokenizer's newline translated copy of the string. 
*/ +}; +#endif + +PyAPI_FUNC(PyTokenizer_State *)PyTokenizer_FromString(const char *, int); +PyAPI_FUNC(PyTokenizer_State *)PyTokenizer_FromUTF8(const char *, int); +PyAPI_FUNC(PyTokenizer_State *)PyTokenizer_FromFile(FILE *, const char*, + const char *, const char *); +PyAPI_FUNC(void) PyTokenizer_Free(PyTokenizer_State *); +PyAPI_FUNC(int) PyTokenizer_Get(PyTokenizer_State *, char **, char **); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_TOKENIZER_H */ diff --git a/Makefile.pre.in b/Makefile.pre.in --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -322,7 +322,7 @@ PGOBJS= \ PARSER_HEADERS= \ $(srcdir)/Parser/parser.h \ $(srcdir)/Include/parsetok.h \ - $(srcdir)/Parser/tokenizer.h + $(srcdir)/Include/tokenizer.h PGENSRCS= $(PSRCS) $(PGSRCS) PGENOBJS= $(POBJS) $(PGOBJS) diff --git a/Parser/parsetok.c b/Parser/parsetok.c --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -12,7 +12,7 @@ /* Forward */ -static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *); +static node *parsetok(PyTokenizer_State *, grammar *, int, perrdetail *, int *); static int initerr(perrdetail *err_ret, PyObject * filename); /* Parse input coming from a string. Return error code, print some errors. */ @@ -45,7 +45,7 @@ PyParser_ParseStringObject(const char *s grammar *g, int start, perrdetail *err_ret, int *flags) { - struct tok_state *tok; + PyTokenizer_State *tok; int exec_input = start == file_input; if (initerr(err_ret, filename) < 0) @@ -118,7 +118,7 @@ PyParser_ParseFileObject(FILE *fp, PyObj const char *ps1, const char *ps2, perrdetail *err_ret, int *flags) { - struct tok_state *tok; + PyTokenizer_State *tok; if (initerr(err_ret, filename) < 0) return NULL; @@ -181,7 +181,7 @@ warn(const char *msg, const char *filena Return error code. 
*/ static node * -parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, +parsetok(PyTokenizer_State *tok, grammar *g, int start, perrdetail *err_ret, int *flags) { parser_state *ps; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -40,9 +40,9 @@ extern char *PyOS_Readline(FILE *, FILE #define TABSIZE 8 /* Forward */ -static struct tok_state *tok_new(void); -static int tok_nextc(struct tok_state *tok); -static void tok_backup(struct tok_state *tok, int c); +static PyTokenizer_State *tok_new(void); +static int tok_nextc(PyTokenizer_State *tok); +static void tok_backup(PyTokenizer_State *tok, int c); /* Token names */ @@ -110,11 +110,11 @@ const char *_PyParser_TokenNames[] = { /* Create and initialize a new tok_state structure */ -static struct tok_state * +static PyTokenizer_State * tok_new(void) { - struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( - sizeof(struct tok_state)); + PyTokenizer_State *tok = (PyTokenizer_State *)PyMem_MALLOC( + sizeof(PyTokenizer_State)); if (tok == NULL) return NULL; tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; @@ -148,7 +148,7 @@ tok_new(void) } static char * -new_string(const char *s, Py_ssize_t len, struct tok_state *tok) +new_string(const char *s, Py_ssize_t len, PyTokenizer_State *tok) { char* result = (char *)PyMem_MALLOC(len + 1); if (!result) { @@ -163,19 +163,19 @@ new_string(const char *s, Py_ssize_t len #ifdef PGEN static char * -decoding_fgets(char *s, int size, struct tok_state *tok) +decoding_fgets(char *s, int size, PyTokenizer_State *tok) { return fgets(s, size, tok->fp); } static int -decoding_feof(struct tok_state *tok) +decoding_feof(PyTokenizer_State *tok) { return feof(tok->fp); } static char * -decode_str(const char *str, int exec_input, struct tok_state *tok) +decode_str(const char *str, int exec_input, PyTokenizer_State *tok) { return new_string(str, strlen(str), tok); } @@ -183,7 +183,7 @@ decode_str(const 
char *str, int exec_inp #else /* PGEN */ static char * -error_ret(struct tok_state *tok) /* XXX */ +error_ret(PyTokenizer_State *tok) /* XXX */ { tok->decoding_erred = 1; if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ @@ -225,7 +225,7 @@ get_normal_name(char *s) /* for u /* Return the coding spec in S, or NULL if none is found. */ static int -get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) +get_coding_spec(const char *s, char **spec, Py_ssize_t size, PyTokenizer_State *tok) { Py_ssize_t i; *spec = NULL; @@ -278,8 +278,8 @@ get_coding_spec(const char *s, char **sp Return 1 on success, 0 on failure. */ static int -check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, - int set_readline(struct tok_state *, const char *)) +check_coding_spec(const char* line, Py_ssize_t size, PyTokenizer_State *tok, + int set_readline(PyTokenizer_State *, const char *)) { char *cs; int r = 1; @@ -337,10 +337,10 @@ check_coding_spec(const char* line, Py_s Return 1 on success, 0 on failure. */ static int -check_bom(int get_char(struct tok_state *), - void unget_char(int, struct tok_state *), - int set_readline(struct tok_state *, const char *), - struct tok_state *tok) +check_bom(int get_char(PyTokenizer_State *), + void unget_char(int, PyTokenizer_State *), + int set_readline(PyTokenizer_State *, const char *), + PyTokenizer_State *tok) { int ch1, ch2, ch3; ch1 = get_char(tok); @@ -414,7 +414,7 @@ check_bom(int get_char(struct tok_state */ static char * -fp_readl(char *s, int size, struct tok_state *tok) +fp_readl(char *s, int size, PyTokenizer_State *tok) { PyObject* bufobj; const char *buf; @@ -485,7 +485,7 @@ error: Return 1 on success, 0 on failure. 
*/ static int -fp_setreadl(struct tok_state *tok, const char* enc) +fp_setreadl(PyTokenizer_State *tok, const char* enc) { PyObject *readline = NULL, *stream = NULL, *io = NULL; _Py_IDENTIFIER(open); @@ -533,13 +533,13 @@ fp_setreadl(struct tok_state *tok, const /* Fetch the next byte from TOK. */ -static int fp_getc(struct tok_state *tok) { +static int fp_getc(PyTokenizer_State *tok) { return getc(tok->fp); } /* Unfetch the last byte back into TOK. */ -static void fp_ungetc(int c, struct tok_state *tok) { +static void fp_ungetc(int c, PyTokenizer_State *tok) { ungetc(c, tok->fp); } @@ -575,7 +575,7 @@ static int valid_utf8(const unsigned cha if necessary. */ static char * -decoding_fgets(char *s, int size, struct tok_state *tok) +decoding_fgets(char *s, int size, PyTokenizer_State *tok) { char *line = NULL; int badchar = 0; @@ -632,7 +632,7 @@ decoding_fgets(char *s, int size, struct } static int -decoding_feof(struct tok_state *tok) +decoding_feof(PyTokenizer_State *tok) { if (tok->decoding_state != STATE_NORMAL) { return feof(tok->fp); @@ -654,14 +654,14 @@ decoding_feof(struct tok_state *tok) /* Fetch a byte from TOK, using the string buffer. */ static int -buf_getc(struct tok_state *tok) { +buf_getc(PyTokenizer_State *tok) { return Py_CHARMASK(*tok->str++); } /* Unfetch a byte from TOK, using the string buffer. */ static void -buf_ungetc(int c, struct tok_state *tok) { +buf_ungetc(int c, PyTokenizer_State *tok) { tok->str--; assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ } @@ -670,7 +670,7 @@ buf_ungetc(int c, struct tok_state *tok) tokenizer, this means to just record the encoding. 
*/ static int -buf_setreadl(struct tok_state *tok, const char* enc) { +buf_setreadl(PyTokenizer_State *tok, const char* enc) { tok->enc = enc; return 1; } @@ -691,7 +691,7 @@ translate_into_utf8(const char* str, con static char * -translate_newlines(const char *s, int exec_input, struct tok_state *tok) { +translate_newlines(const char *s, int exec_input, PyTokenizer_State *tok) { int skip_next_lf = 0; size_t needed_length = strlen(s) + 2, final_length; char *buf, *current; @@ -736,7 +736,7 @@ translate_newlines(const char *s, int ex inside TOK. */ static const char * -decode_str(const char *input, int single, struct tok_state *tok) +decode_str(const char *input, int single, PyTokenizer_State *tok) { PyObject* utf8 = NULL; const char *str; @@ -795,10 +795,10 @@ decode_str(const char *input, int single /* Set up tokenizer for string */ -struct tok_state * +PyTokenizer_State * PyTokenizer_FromString(const char *str, int exec_input) { - struct tok_state *tok = tok_new(); + PyTokenizer_State *tok = tok_new(); if (tok == NULL) return NULL; str = decode_str(str, exec_input, tok); @@ -812,10 +812,10 @@ PyTokenizer_FromString(const char *str, return tok; } -struct tok_state * +PyTokenizer_State * PyTokenizer_FromUTF8(const char *str, int exec_input) { - struct tok_state *tok = tok_new(); + PyTokenizer_State *tok = tok_new(); if (tok == NULL) return NULL; #ifndef PGEN @@ -843,11 +843,11 @@ PyTokenizer_FromUTF8(const char *str, in /* Set up tokenizer for file */ -struct tok_state * +PyTokenizer_State * PyTokenizer_FromFile(FILE *fp, const char* enc, const char *ps1, const char *ps2) { - struct tok_state *tok = tok_new(); + PyTokenizer_State *tok = tok_new(); if (tok == NULL) return NULL; if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { @@ -877,7 +877,7 @@ PyTokenizer_FromFile(FILE *fp, const cha /* Free a tok_state structure */ void -PyTokenizer_Free(struct tok_state *tok) +PyTokenizer_Free(PyTokenizer_State *tok) { if (tok->encoding != NULL) 
PyMem_FREE(tok->encoding); @@ -896,7 +896,7 @@ PyTokenizer_Free(struct tok_state *tok) /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok) +tok_nextc(PyTokenizer_State *tok) { for (;;) { if (tok->cur != tok->inp) { @@ -1093,7 +1093,7 @@ tok_nextc(struct tok_state *tok) /* Back-up one character */ static void -tok_backup(struct tok_state *tok, int c) +tok_backup(PyTokenizer_State *tok, int c) { if (c != EOF) { if (--tok->cur < tok->buf) @@ -1276,7 +1276,7 @@ PyToken_ThreeChars(int c1, int c2, int c } static int -indenterror(struct tok_state *tok) +indenterror(PyTokenizer_State *tok) { if (tok->alterror) { tok->done = E_TABSPACE; @@ -1303,7 +1303,7 @@ indenterror(struct tok_state *tok) All identifier strings are guaranteed to be "ready" unicode objects. */ static int -verify_identifier(struct tok_state *tok) +verify_identifier(PyTokenizer_State *tok) { PyObject *s; int result; @@ -1328,7 +1328,7 @@ verify_identifier(struct tok_state *tok) /* Get next token, after space stripping etc. 
*/ static int -tok_get(struct tok_state *tok, char **p_start, char **p_end) +tok_get(PyTokenizer_State *tok, char **p_start, char **p_end) { int c; int blankline, nonascii; @@ -1739,7 +1739,7 @@ tok_get(struct tok_state *tok, char **p_ } int -PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) +PyTokenizer_Get(PyTokenizer_State *tok, char **p_start, char **p_end) { int result = tok_get(tok, p_start, p_end); if (tok->decoding_erred) { @@ -1762,7 +1762,7 @@ PyTokenizer_Get(struct tok_state *tok, c char * PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) { - struct tok_state *tok; + PyTokenizer_State *tok; FILE *fp; char *p_start =NULL , *p_end =NULL , *encoding = NULL; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h deleted file mode 100644 --- a/Parser/tokenizer.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef Py_TOKENIZER_H -#define Py_TOKENIZER_H -#ifdef __cplusplus -extern "C" { -#endif - -#include "object.h" - -/* Tokenizer interface */ - -#include "token.h" /* For token types */ - -#define MAXINDENT 100 /* Max indentation level */ - -enum decoding_state { - STATE_INIT, - STATE_RAW, - STATE_NORMAL /* have a codec associated with input */ -}; - -/* Tokenizer state */ -struct tok_state { - /* Input state; buf <= cur <= inp <= end */ - /* NB an entire line is held in the buffer */ - char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ - char *cur; /* Next character in buffer */ - char *inp; /* End of data in buffer */ - char *end; /* End of input buffer if buf != NULL */ - char *start; /* Start of current token if not NULL */ - int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ - /* NB If done != E_OK, cur must be == inp!!! 
*/ - FILE *fp; /* Rest of input; NULL if tokenizing a string */ - int tabsize; /* Tab spacing */ - int indent; /* Current indentation index */ - int indstack[MAXINDENT]; /* Stack of indents */ - int atbol; /* Nonzero if at begin of new line */ - int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ - const char *prompt, *nextprompt; /* For interactive prompting */ - int lineno; /* Current line number */ - int level; /* () [] {} Parentheses nesting level */ - /* Used to allow free continuations inside them */ - /* Stuff for checking on different tab sizes */ -#ifndef PGEN - /* pgen doesn't have access to Python codecs, it cannot decode the input - filename. The bytes filename might be kept, but it is only used by - indenterror() and it is not really needed: pgen only compiles one file - (Grammar/Grammar). */ - PyObject *filename; -#endif - int altwarning; /* Issue warning if alternate tabs don't match */ - int alterror; /* Issue error if alternate tabs don't match */ - int alttabsize; /* Alternate tab spacing */ - int altindstack[MAXINDENT]; /* Stack of alternate indents */ - /* Stuff for PEP 0263 */ - enum decoding_state decoding_state; - int decoding_erred; /* whether erred in decoding */ - int read_coding_spec; /* whether 'coding:...' has been read */ - char *encoding; /* Source encoding. */ - int cont_line; /* whether we are in a continuation line. */ - const char* line_start; /* pointer to start of current line */ -#ifndef PGEN - PyObject *decoding_readline; /* open(...).readline */ - PyObject *decoding_buffer; -#endif - const char* enc; /* Encoding for the current str. */ - const char* str; - const char* input; /* Tokenizer's newline translated copy of the string. 
*/ -}; - -extern struct tok_state *PyTokenizer_FromString(const char *, int); -extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); -extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, - const char *, const char *); -extern void PyTokenizer_Free(struct tok_state *); -extern int PyTokenizer_Get(struct tok_state *, char **, char **); -extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, - int len, int *offset); - -#ifdef __cplusplus -} -#endif -#endif /* !Py_TOKENIZER_H */