File Explorer

/proc/thread-self/root/proc/self/root/proc/self/root/usr/lib64/python3.9/urllib
This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.
1 dir
6 files
robotparser.py — 9.2 KB · 274 lines
""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import time
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


def _parse_int(text):
    """Return *text* as an int, or None if it is not a plain decimal number.

    str.isdigit() is deliberately not used here: it also accepts characters
    such as superscript digits that int() rejects, which would turn a
    malformed robots.txt into a ValueError.  str.isdecimal() only accepts
    characters that int() can convert.
    """
    text = text.strip()
    if text.isdecimal():
        return int(text)
    return None


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        # Entries for specific user agents, in file order.
        self.entries = []
        # Values of all "Sitemap:" lines, in file order.
        self.sitemaps = []
        # The first entry that names the "*" user agent, if any.
        self.default_entry = None
        # Set by read() depending on the HTTP status of the robots.txt fetch.
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        # 0 means "robots.txt has not been parsed yet".
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        A 401 or 403 response marks everything as disallowed, any other
        4xx response marks everything as allowed.  Other HTTP errors leave
        the parser state untouched.
        """
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif 400 <= err.code < 500:
                self.allow_all = True
            # The HTTPError wraps the error response; close it so the
            # underlying connection is not leaked.
            if err.fp is not None:
                err.close()
        else:
            # Close the response as soon as the body has been read.
            with f:
                raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # a blank line terminates the current record
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            key, sep, value = line.partition(':')
            if not sep:
                # not a "field: value" line, ignore it
                continue
            key = key.strip().lower()
            value = urllib.parse.unquote(value.strip())
            if key == "user-agent":
                if state == 2:
                    # a new record starts without a separating blank line
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
            elif key == "crawl-delay":
                if state != 0:
                    # a value that is not a valid number is ignored
                    # instead of crashing the parser
                    delay = _parse_int(value)
                    if delay is not None:
                        entry.delay = delay
                    state = 2
            elif key == "request-rate":
                if state != 0:
                    numbers = value.split('/')
                    # check if all values are sane
                    if len(numbers) == 2:
                        requests = _parse_int(numbers[0])
                        seconds = _parse_int(numbers[1])
                        if requests is not None and seconds is not None:
                            entry.req_rate = RequestRate(requests, seconds)
                    state = 2
            elif key == "sitemap":
                # According to http://www.sitemaps.org/protocol.html
                # "This directive is independent of the user-agent line,
                #  so it doesn't matter where you place it in your file."
                # Therefore we do not change the state of the parser.
                self.sitemaps.append(value)
        if state == 2:
            self._add_entry(entry)

    def _find_entry(self, useragent):
        """Return the entry that governs useragent, or None.

        The first specific entry that matches wins; the default ("*")
        entry is only consulted when no specific entry applies.
        """
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry
        if self.default_entry:
            return self.default_entry
        return None

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # Reduce the URL to its path, params, query and fragment and
        # normalize the quoting before matching it against the rules.
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        entry = self._find_entry(useragent)
        if entry is None:
            # agent not found ==> access granted
            return True
        return entry.allowance(url)

    def crawl_delay(self, useragent):
        """Return the Crawl-delay value for useragent, or None.

        None is returned when nothing has been parsed yet, when no entry
        applies, or when the entry has no valid Crawl-delay line.
        """
        if not self.mtime():
            return None
        entry = self._find_entry(useragent)
        if entry is None:
            return None
        return entry.delay

    def request_rate(self, useragent):
        """Return the Request-rate for useragent as a RequestRate, or None.

        None is returned when nothing has been parsed yet, when no entry
        applies, or when the entry has no valid Request-rate line.
        """
        if not self.mtime():
            return None
        entry = self._find_entry(useragent)
        if entry is None:
            return None
        return entry.req_rate

    def site_maps(self):
        """Return the list of Sitemap values, or None if there are none."""
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            # the default entry is printed last, matching lookup order
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule matches filename (prefix match)."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        # Crawl-delay as an int, or None if not given.
        self.delay = None
        # Request-rate as a RequestRate, or None if not given.
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # the first matching rule decides; no match means allowed
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True