File Explorer

/proc/self/root/proc/self/root/proc/thread-self/root/usr/lib64/python3.9/email
This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.
2 dirs
21 files
feedparser.py · 22.2 KB · 537 lines
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO

NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when no complete line is buffered and the input is not closed.
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text stream of the last partial line pushed into this object.
        # See issue 22233 for why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push the false-EOF predicate `pred` onto the predicate stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the buffer as closed."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next buffered line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        buffer has not been closed.  The empty string is returned when the
        buffer is closed and empty, or when the next line matches one of the
        pushed false-EOF predicates; in the latter case the line stays in the
        buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so it is the next line readline() considers."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append the complete lines in `lines` to the line buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops on '' (real or false EOF).  NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line


class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                # Probe whether the factory accepts a `policy` keyword; the
                # object created by this call is discarded.
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        self._msgstack = []
        # Each call to self._parse() resumes the parsing generator until it
        # next yields NeedMoreData or finishes.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (and, recursively, its subparts).

        It yields NeedMoreData whenever the input buffer has no complete line
        available, and finishes once the message started here has been fully
        read up to a real or false EOF.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are drained here but never
                # appended to `epilogue`, so the epilogue assigned below is
                # always the empty string.  Kept as-is to preserve existing
                # behaviour.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Parse the header `lines` and store them on the current message.

        Continuation lines are folded into the preceding header, a leading
        unix-from line is stored via set_unixfrom(), and malformed lines are
        reported through self.policy.handle_defect() so that the policy (for
        example its raise_on_defect setting) decides how they are handled.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Route it through the policy, like every
                    # other defect in this module, rather than appending to
                    # the defects list directly.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error (via the policy) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode `data` as ASCII with surrogateescape and feed the text."""
        super().feed(data.decode('ascii', 'surrogateescape'))