File Explorer

/proc/self/root/proc/thread-self/root/proc/1/task/1/root/proc/1/root/lib64/python3.9
This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.
30 dirs
174 files
csv.py · 15.8 KB · 449 lines
"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

from io import StringIO

__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "__doc__", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter",
           "unix_dialect"]


class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        """Mark subclasses as valid and validate the attribute set.

        Raises:
            Error: if the C-level dialect constructor rejects the attributes.
        """
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Validate this dialect by building a ``_csv.Dialect`` from it.

        Raises:
            Error: wrapping the message of any TypeError raised by ``_csv``.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))


class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)


class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL
register_dialect("unix", unix_dialect)


class DictReader:
    """Iterate over CSV rows as dicts keyed by field name.

    Args:
        f: passed straight to ``reader()`` as the row source.
        fieldnames: keys for each row dict.  When None, the first row read
            from *f* supplies them.
        restkey: key under which surplus values of an over-long row are
            stored (as a list).
        restval: value used for the missing fields of a short row.
        dialect, *args, **kwds: forwarded to ``reader()``.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        # A one-shot iterator would be exhausted by the first row's zip()
        # and has no len(); snapshot it so every row sees the same names.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names, read lazily from the first row when not supplied.

        Accessing the property also syncs ``line_num`` with the underlying
        reader.  Stays None if the source is empty.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        """Return the next non-blank row as a dict.

        Raises:
            StopIteration: when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
            # keep line_num in step with the lines consumed while skipping,
            # so it refers to the row that is actually returned
            self.line_num = self.reader.line_num
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    """Write dicts as CSV rows, ordered by *fieldnames*.

    Args:
        f: passed straight to ``writer()`` as the output target.
        fieldnames: keys, in output order.
        restval: value written for keys missing from a row dict.
        extrasaction: "raise" (ValueError on keys not in *fieldnames*) or
            "ignore" (drop them).  Matched case-insensitively.
        dialect, *args, **kwds: forwarded to ``writer()``.

    Raises:
        ValueError: if *extrasaction* is neither "raise" nor "ignore".
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        # Snapshot one-shot iterators: fieldnames is iterated once per row.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalized form: _dict_to_list compares against the
        # lowercase literal, so e.g. "RAISE" must not silently act as ignore.
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names; return writerow's result."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return an iterable of *rowdict*'s values in fieldnames order.

        Raises:
            ValueError: in "raise" mode, if *rowdict* has unknown keys.
        """
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        """Write one dict as a row; return the underlying writer's result."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in *rowdicts* as a row."""
        return self.writer.writerows(map(self._dict_to_list, rowdicts))


# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float


class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Return a Dialect subclass describing the format of *sample*.

        *delimiters*, when given, restricts the candidate delimiter
        characters.  Raises Error if no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        matches = []
        # Patterns are tried from most to least specific; the first one
        # that matches anything is used.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            # With a single-group pattern findall() yields plain strings;
            # indexing position 0 then still gives the quote character.
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); delimiter is '' when no
        candidate was found.
        """

        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

    def has_header(self, sample):
        """Guess whether the first row of *sample* is a header row.

        Raises Error (via sniff) if the sample's delimiter cannot be
        determined.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(columnTypes.keys()):
                # Probe with complex only: probing int, then float, then
                # complex would label "1" and "2.5" as different types and
                # wrongly drop a mixed numeric column from the vote.
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0