File Explorer

/proc/self/root/proc/thread-self/root/proc/self/task/13/root/usr/lib64/python3.9
This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.
30 dirs
174 files
tokenize.py — 25.3 KB · 683 lines
"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

# Matches a PEP 263 encoding declaration in a (decoded) source line.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a (bytes) line that is empty, whitespace only, or a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token


class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A single token: (type, string, start, end, line) as a named tuple."""

    def __repr__(self):
        # Show both the numeric type and its symbolic name, e.g. "1 (NAME)".
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        """Return the specific operator type for OP tokens, else ``type``."""
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type


# Small helpers for assembling the regular expressions below.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    """Return a set of '' plus every case/order variant of the prefixes."""
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result


def _compile(expr):
    """Compile *expr* with the re.UNICODE flag."""
    return re.compile(expr, re.UNICODE)


# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8


class TokenError(Exception): pass


class StopTokenizing(Exception): pass


class Untokenizer:
    """Rebuild source text from a stream of tokens.

    ``tokens`` accumulates the output fragments, ``prev_row``/``prev_col``
    track where the previously emitted token ended, and ``encoding`` records
    the value of an ENCODING token if one was seen.
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        """Emit the filler needed to move from the previous end to *start*.

        Raises ValueError if *start* lies before the previous token's end.
        """
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Skipped rows are bridged with backslash continuations.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source string for *iterable* of tokens.

        Full 5-tuples are placed using their positions; as soon as a
        2-tuple is met, the rest of the stream is handed to compat().
        """
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token of a line inside an indented block: emit the
                # recorded indent text rather than plain spaces.
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for (type, string) tokens that carry no positions.

        *token* is the first such token; *iterable* yields the remainder.
        """
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    # Without an ENCODING token the result is returned as str, not bytes.
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        # Used only to enrich error messages when readline is a bound
        # method of an object that has a ``name`` attribute.
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            # Called only to validate the name; the codec itself is unused.
            lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    # A cookie on the second line only counts if the first line is blank
    # or a comment.
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, _ = detect_encoding(buffer.readline)
        # Rewind so the wrapper also sees the lines read during detection.
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except BaseException:
        # Do not leak the file handle on any failure; always re-raise.
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    # Replay the lines consumed during detection, then the rest of the
    # input, then an endless supply of b"" to signal EOF.
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    """Generate TokenInfo tuples from the lines returned by *readline*.

    If *encoding* is not None, an ENCODING token is emitted first and each
    line is decoded with that encoding; otherwise lines are used as
    returned by *readline*.

    Raises TokenError on EOF inside a multi-line string or statement, and
    IndentationError on a dedent that matches no outer indentation level.
    """
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    # Loop-invariant: compile the token pattern once per call instead of
    # once per scanned token.
    pseudoprog = _compile(PseudoToken)

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max_pos = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string continued onto this line without
                # a trailing backslash: report the whole thing as an error.
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max_pos:               # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max_pos:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max_pos:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)


def main():
    """Command-line entry point: print the tokens of a file or of stdin."""
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                # Materialize the tokens before the file is closed.
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise


if __name__ == "__main__":
    main()