From bee8d9dd68ecaddefc7675e197606b19774e193c Mon Sep 17 00:00:00 2001
From: Ka Wo Chen
Date: Wed, 14 Oct 2015 20:00:00 -0400
Subject: [PATCH] CLN: GH11271 move _get_handle, UTF encoders to io.common

---
 pandas/core/common.py | 152 +-----------------------------------------
 pandas/core/format.py |   5 +-
 pandas/io/common.py   | 148 ++++++++++++++++++++++++++++++++++++++++
 pandas/io/parsers.py  |  19 +++---
 4 files changed, 162 insertions(+), 162 deletions(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 724843d379f64..c6e774b5077db 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -5,8 +5,6 @@
 import re
 import collections
 import numbers
-import codecs
-import csv
 import types
 from datetime import datetime, timedelta
 from functools import partial
@@ -19,7 +17,7 @@
 import pandas.lib as lib
 import pandas.tslib as tslib
 from pandas import compat
-from pandas.compat import StringIO, BytesIO, range, long, u, zip, map, string_types, iteritems
+from pandas.compat import BytesIO, range, long, u, zip, map, string_types, iteritems
 from pandas.core.dtypes import CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType
 from pandas.core.config import get_option
 
@@ -2808,154 +2806,6 @@ def _all_none(*args):
     return True
 
 
-class UTF8Recoder:
-
-    """
-    Iterator that reads an encoded stream and reencodes the input to UTF-8
-    """
-
-    def __init__(self, f, encoding):
-        self.reader = codecs.getreader(encoding)(f)
-
-    def __iter__(self):
-        return self
-
-    def read(self, bytes=-1):
-        return self.reader.read(bytes).encode('utf-8')
-
-    def readline(self):
-        return self.reader.readline().encode('utf-8')
-
-    def next(self):
-        return next(self.reader).encode("utf-8")
-
-    # Python 3 iterator
-    __next__ = next
-
-
-def _get_handle(path, mode, encoding=None, compression=None):
-    """Gets file handle for given path and mode.
-    NOTE: Under Python 3.2, getting a compressed file handle means reading in
-    the entire file, decompressing it and decoding it to ``str`` all at once
-    and then wrapping it in a StringIO.
-    """
-    if compression is not None:
-        if encoding is not None and not compat.PY3:
-            msg = 'encoding + compression not yet supported in Python 2'
-            raise ValueError(msg)
-
-        if compression == 'gzip':
-            import gzip
-            f = gzip.GzipFile(path, mode)
-        elif compression == 'bz2':
-            import bz2
-            f = bz2.BZ2File(path, mode)
-        else:
-            raise ValueError('Unrecognized compression type: %s' %
-                             compression)
-        if compat.PY3:
-            from io import TextIOWrapper
-            f = TextIOWrapper(f, encoding=encoding)
-        return f
-    else:
-        if compat.PY3:
-            if encoding:
-                f = open(path, mode, encoding=encoding)
-            else:
-                f = open(path, mode, errors='replace')
-        else:
-            f = open(path, mode)
-
-    return f
-
-
-if compat.PY3:  # pragma: no cover
-    def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
-        # ignore encoding
-        return csv.reader(f, dialect=dialect, **kwds)
-
-    def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
-        return csv.writer(f, dialect=dialect, **kwds)
-else:
-    class UnicodeReader:
-
-        """
-        A CSV reader which will iterate over lines in the CSV file "f",
-        which is encoded in the given encoding.
-
-        On Python 3, this is replaced (below) by csv.reader, which handles
-        unicode.
-        """
-
-        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
-            f = UTF8Recoder(f, encoding)
-            self.reader = csv.reader(f, dialect=dialect, **kwds)
-
-        def next(self):
-            row = next(self.reader)
-            return [compat.text_type(s, "utf-8") for s in row]
-
-        # python 3 iterator
-        __next__ = next
-
-        def __iter__(self):  # pragma: no cover
-            return self
-
-    class UnicodeWriter:
-
-        """
-        A CSV writer which will write rows to CSV file "f",
-        which is encoded in the given encoding.
-        """
-
-        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
-            # Redirect output to a queue
-            self.queue = StringIO()
-            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
-            self.stream = f
-            self.encoder = codecs.getincrementalencoder(encoding)()
-            self.quoting = kwds.get("quoting", None)
-
-        def writerow(self, row):
-            def _check_as_is(x):
-                return (self.quoting == csv.QUOTE_NONNUMERIC and
-                        is_number(x)) or isinstance(x, str)
-
-            row = [x if _check_as_is(x)
-                   else pprint_thing(x).encode('utf-8') for x in row]
-
-            self.writer.writerow([s for s in row])
-            # Fetch UTF-8 output from the queue ...
-            data = self.queue.getvalue()
-            data = data.decode("utf-8")
-            # ... and reencode it into the target encoding
-            data = self.encoder.encode(data)
-            # write to the target stream
-            self.stream.write(data)
-            # empty queue
-            self.queue.truncate(0)
-
-        def writerows(self, rows):
-            def _check_as_is(x):
-                return (self.quoting == csv.QUOTE_NONNUMERIC and
-                        is_number(x)) or isinstance(x, str)
-
-            for i, row in enumerate(rows):
-                rows[i] = [x if _check_as_is(x)
-                           else pprint_thing(x).encode('utf-8') for x in row]
-
-            self.writer.writerows([[s for s in row] for row in rows])
-            # Fetch UTF-8 output from the queue ...
-            data = self.queue.getvalue()
-            data = data.decode("utf-8")
-            # ... and reencode it into the target encoding
-            data = self.encoder.encode(data)
-            # write to the target stream
-            self.stream.write(data)
-            # empty queue
-            self.queue.truncate(0)
-
-
 def get_dtype_kinds(l):
     """
     Parameters
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 322d97ab6b58f..e4aa1eac248d5 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -13,6 +13,7 @@
     OrderedDict)
 from pandas.util.terminal import get_terminal_size
 from pandas.core.config import get_option, set_option
+from pandas.io.common import _get_handle, UnicodeWriter
 import pandas.core.common as com
 import pandas.lib as lib
 from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
@@ -1475,7 +1476,7 @@ def save(self):
             f = self.path_or_buf
             close = False
         else:
-            f = com._get_handle(self.path_or_buf, self.mode,
+            f = _get_handle(self.path_or_buf, self.mode,
                                 encoding=self.encoding,
                                 compression=self.compression)
             close = True
@@ -1488,7 +1489,7 @@ def save(self):
                                  quotechar=self.quotechar)
             if self.encoding is not None:
                 writer_kwargs['encoding'] = self.encoding
-                self.writer = com.UnicodeWriter(f, **writer_kwargs)
+                self.writer = UnicodeWriter(f, **writer_kwargs)
             else:
                 self.writer = csv.writer(f, **writer_kwargs)
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index b9cdd44e52555..ad0145492f9b6 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -2,11 +2,14 @@
 
 import sys
 import os
+import csv
+import codecs
 import zipfile
 from contextlib import contextmanager, closing
 
 from pandas.compat import StringIO, string_types, BytesIO
 from pandas import compat
+from pandas.core.common import pprint_thing, is_number
 
 
 if compat.PY3:
@@ -284,3 +287,148 @@ def ZipFile(*args, **kwargs):
             yield zf
 else:
     ZipFile = zipfile.ZipFile
+
+
+def _get_handle(path, mode, encoding=None, compression=None):
+    """Gets file handle for given path and mode.
+    """
+    if compression is not None:
+        if encoding is not None and not compat.PY3:
+            msg = 'encoding + compression not yet supported in Python 2'
+            raise ValueError(msg)
+
+        if compression == 'gzip':
+            import gzip
+            f = gzip.GzipFile(path, mode)
+        elif compression == 'bz2':
+            import bz2
+            f = bz2.BZ2File(path, mode)
+        else:
+            raise ValueError('Unrecognized compression type: %s' %
+                             compression)
+        if compat.PY3:
+            from io import TextIOWrapper
+            f = TextIOWrapper(f, encoding=encoding)
+        return f
+    else:
+        if compat.PY3:
+            if encoding:
+                f = open(path, mode, encoding=encoding)
+            else:
+                f = open(path, mode, errors='replace')
+        else:
+            f = open(path, mode)
+
+    return f
+
+
+class UTF8Recoder:
+
+    """
+    Iterator that reads an encoded stream and reencodes the input to UTF-8
+    """
+
+    def __init__(self, f, encoding):
+        self.reader = codecs.getreader(encoding)(f)
+
+    def __iter__(self):
+        return self
+
+    def read(self, bytes=-1):
+        return self.reader.read(bytes).encode("utf-8")
+
+    def readline(self):
+        return self.reader.readline().encode("utf-8")
+
+    def next(self):
+        return next(self.reader).encode("utf-8")
+
+    # Python 3 iterator
+    __next__ = next
+
+
+if compat.PY3:  # pragma: no cover
+    def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
+        # ignore encoding
+        return csv.reader(f, dialect=dialect, **kwds)
+
+    def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
+        return csv.writer(f, dialect=dialect, **kwds)
+else:
+    class UnicodeReader:
+
+        """
+        A CSV reader which will iterate over lines in the CSV file "f",
+        which is encoded in the given encoding.
+
+        On Python 3, this is replaced (below) by csv.reader, which handles
+        unicode.
+        """
+
+        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+            f = UTF8Recoder(f, encoding)
+            self.reader = csv.reader(f, dialect=dialect, **kwds)
+
+        def next(self):
+            row = next(self.reader)
+            return [compat.text_type(s, "utf-8") for s in row]
+
+        # python 3 iterator
+        __next__ = next
+
+        def __iter__(self):  # pragma: no cover
+            return self
+
+    class UnicodeWriter:
+
+        """
+        A CSV writer which will write rows to CSV file "f",
+        which is encoded in the given encoding.
+        """
+
+        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+            # Redirect output to a queue
+            self.queue = StringIO()
+            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+            self.stream = f
+            self.encoder = codecs.getincrementalencoder(encoding)()
+            self.quoting = kwds.get("quoting", None)
+
+        def writerow(self, row):
+            def _check_as_is(x):
+                return (self.quoting == csv.QUOTE_NONNUMERIC and
+                        is_number(x)) or isinstance(x, str)
+
+            row = [x if _check_as_is(x)
+                   else pprint_thing(x).encode("utf-8") for x in row]
+
+            self.writer.writerow([s for s in row])
+            # Fetch UTF-8 output from the queue ...
+            data = self.queue.getvalue()
+            data = data.decode("utf-8")
+            # ... and reencode it into the target encoding
+            data = self.encoder.encode(data)
+            # write to the target stream
+            self.stream.write(data)
+            # empty queue
+            self.queue.truncate(0)
+
+        def writerows(self, rows):
+            def _check_as_is(x):
+                return (self.quoting == csv.QUOTE_NONNUMERIC and
+                        is_number(x)) or isinstance(x, str)
+
+            for i, row in enumerate(rows):
+                rows[i] = [x if _check_as_is(x)
+                           else pprint_thing(x).encode("utf-8") for x in row]
+
+            self.writer.writerows([[s for s in row] for row in rows])
+            # Fetch UTF-8 output from the queue ...
+            data = self.queue.getvalue()
+            data = data.decode("utf-8")
+            # ... and reencode it into the target encoding
+            data = self.encoder.encode(data)
+            # write to the target stream
+            self.stream.write(data)
+            # empty queue
+            self.queue.truncate(0)
\ No newline at end of file
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8ac1aed9d9af7..fb58c45170c52 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -17,7 +17,8 @@
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
-from pandas.io.common import get_filepath_or_buffer, _validate_header_arg
+from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
+                              _get_handle, UnicodeReader, UTF8Recoder)
 from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
@@ -1084,7 +1085,7 @@ def __init__(self, src, **kwds):
         if 'utf-16' in (kwds.get('encoding') or ''):
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
-            src = com.UTF8Recoder(src, kwds['encoding'])
+            src = UTF8Recoder(src, kwds['encoding'])
             kwds['encoding'] = 'utf-8'
 
         # #2442
@@ -1420,7 +1421,7 @@ def __init__(self, f, **kwds):
         self._comment_lines = []
 
         if isinstance(f, compat.string_types):
-            f = com._get_handle(f, 'r', encoding=self.encoding,
+            f = _get_handle(f, 'r', encoding=self.encoding,
                                 compression=self.compression)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
@@ -1540,17 +1541,17 @@ class MyDialect(csv.Dialect):
             dia.delimiter = sniffed.delimiter
             if self.encoding is not None:
                 self.buf.extend(list(
-                    com.UnicodeReader(StringIO(line),
-                                      dialect=dia,
-                                      encoding=self.encoding)))
+                    UnicodeReader(StringIO(line),
+                                  dialect=dia,
+                                  encoding=self.encoding)))
             else:
                 self.buf.extend(list(csv.reader(StringIO(line),
                                                 dialect=dia)))
 
         if self.encoding is not None:
-            reader = com.UnicodeReader(f, dialect=dia,
-                                       encoding=self.encoding,
-                                       strict=True)
+            reader = UnicodeReader(f, dialect=dia,
+                                   encoding=self.encoding,
+                                   strict=True)
         else:
             reader = csv.reader(f, dialect=dia,
                                 strict=True)
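
Reviewer note (not part of the patch): below is a minimal smoke test for the relocated helpers, sketched against a checkout with this patch applied. The file name 'example.csv' and the sample rows are assumptions for illustration only; nothing in the change itself refers to them.

# -*- coding: utf-8 -*-
# Sketch only: exercises the helpers this patch moves into pandas.io.common.
# 'example.csv' and the sample rows are illustrative assumptions.
from pandas.io.common import _get_handle, UnicodeReader, UnicodeWriter

# Write a small UTF-8 CSV the way CSVFormatter.save() now does: open the
# handle with _get_handle and feed rows through UnicodeWriter (on Python 3
# UnicodeWriter simply returns csv.writer, since the handle already encodes).
f = _get_handle('example.csv', 'w', encoding='utf-8')
try:
    writer = UnicodeWriter(f, encoding='utf-8')
    writer.writerow(['name', 'value'])
    writer.writerow([u'caf\xe9', 1.5])
finally:
    f.close()

# Read it back through UnicodeReader, mirroring what PythonParser does when
# it sniffs a dialect and an encoding was supplied.
f = _get_handle('example.csv', 'r', encoding='utf-8')
try:
    for row in UnicodeReader(f, encoding='utf-8'):
        print(row)
finally:
    f.close()

On Python 2 the round trip goes through UTF8Recoder, so non-ASCII cells come back as unicode; on Python 3 the encoding handling lives in the handle returned by _get_handle, which is why UnicodeReader/UnicodeWriter reduce to plain csv.reader/csv.writer there.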