CLN: GH11271 move _get_handle, UTF encoders to io.common #11330

Merged: 1 commit, Oct 15, 2015
152 changes: 1 addition & 151 deletions pandas/core/common.py
@@ -5,8 +5,6 @@
import re
import collections
import numbers
import codecs
import csv
import types
from datetime import datetime, timedelta
from functools import partial
@@ -19,7 +17,7 @@
import pandas.lib as lib
import pandas.tslib as tslib
from pandas import compat
from pandas.compat import StringIO, BytesIO, range, long, u, zip, map, string_types, iteritems
from pandas.compat import BytesIO, range, long, u, zip, map, string_types, iteritems
from pandas.core.dtypes import CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType
from pandas.core.config import get_option

@@ -2808,154 +2806,6 @@ def _all_none(*args):
return True


class UTF8Recoder:

"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""

def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)

def __iter__(self):
return self

def read(self, bytes=-1):
return self.reader.read(bytes).encode('utf-8')

def readline(self):
return self.reader.readline().encode('utf-8')

def next(self):
return next(self.reader).encode("utf-8")

# Python 3 iterator
__next__ = next


def _get_handle(path, mode, encoding=None, compression=None):
"""Gets file handle for given path and mode.
NOTE: Under Python 3.2, getting a compressed file handle means reading in
the entire file, decompressing it and decoding it to ``str`` all at once
and then wrapping it in a StringIO.
"""
if compression is not None:
if encoding is not None and not compat.PY3:
msg = 'encoding + compression not yet supported in Python 2'
raise ValueError(msg)

if compression == 'gzip':
import gzip
f = gzip.GzipFile(path, mode)
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, mode)
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
else:
if compat.PY3:
if encoding:
f = open(path, mode, encoding=encoding)
else:
f = open(path, mode, errors='replace')
else:
f = open(path, mode)

return f


if compat.PY3: # pragma: no cover
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)

def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
class UnicodeReader:

"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.

On Python 3, this is replaced (below) by csv.reader, which handles
unicode.
"""

def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)

def next(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]

# python 3 iterator
__next__ = next

def __iter__(self): # pragma: no cover
return self

class UnicodeWriter:

"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""

def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
self.quoting = kwds.get("quoting", None)

def writerow(self, row):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)

row = [x if _check_as_is(x)
else pprint_thing(x).encode('utf-8') for x in row]

self.writer.writerow([s for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)

def writerows(self, rows):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)

for i, row in enumerate(rows):
rows[i] = [x if _check_as_is(x)
else pprint_thing(x).encode('utf-8') for x in row]

self.writer.writerows([[s for s in row] for row in rows])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)


def get_dtype_kinds(l):
"""
Parameters
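
The helpers deleted above are not removed from pandas; they move to pandas/io/common.py (third diff below), so downstream call sites switch from the pandas.core.common namespace to pandas.io.common. A minimal before/after import sketch, with placeholder path and mode values; the real call-site updates made by this PR are in the pandas/core/format.py diff that follows:

from pandas.io.common import _get_handle, UnicodeWriter

# before this change the helpers were reached through pandas.core.common, e.g.
#   import pandas.core.common as com
#   f = com._get_handle(path, mode)
path, mode = 'out.csv', 'w'   # placeholder path and mode for illustration
f = _get_handle(path, mode)
f.close()
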
5 changes: 3 additions & 2 deletions pandas/core/format.py
@@ -13,6 +13,7 @@
OrderedDict)
from pandas.util.terminal import get_terminal_size
from pandas.core.config import get_option, set_option
from pandas.io.common import _get_handle, UnicodeWriter
import pandas.core.common as com
import pandas.lib as lib
from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
@@ -1475,7 +1476,7 @@ def save(self):
f = self.path_or_buf
close = False
else:
f = com._get_handle(self.path_or_buf, self.mode,
f = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
close = True
@@ -1488,7 +1489,7 @@
quotechar=self.quotechar)
if self.encoding is not None:
writer_kwargs['encoding'] = self.encoding
self.writer = com.UnicodeWriter(f, **writer_kwargs)
self.writer = UnicodeWriter(f, **writer_kwargs)
else:
self.writer = csv.writer(f, **writer_kwargs)

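
For context, a standalone sketch of roughly what save() wires together after this change, with hypothetical stand-ins for the formatter's attributes (path, mode, encoding, csv options); this illustrates the new import surface, not the exact pandas implementation:

import csv
from pandas.io.common import _get_handle, UnicodeWriter

# hypothetical stand-ins for self.path_or_buf, self.mode, self.encoding
path, mode, encoding = 'out.csv', 'w', 'utf-8'

f = _get_handle(path, mode, encoding=encoding, compression=None)
try:
    # On Python 3, UnicodeWriter simply returns csv.writer (csv handles unicode);
    # on Python 2 it is the re-encoding wrapper defined in pandas/io/common.py.
    writer = UnicodeWriter(f, lineterminator='\n',
                           quoting=csv.QUOTE_MINIMAL, encoding=encoding)
    writer.writerow(['col1', 'col2'])
    writer.writerow(['a', 'b'])
finally:
    f.close()
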
148 changes: 148 additions & 0 deletions pandas/io/common.py
@@ -2,11 +2,14 @@

import sys
import os
import csv
import codecs
import zipfile
from contextlib import contextmanager, closing

from pandas.compat import StringIO, string_types, BytesIO
from pandas import compat
from pandas.core.common import pprint_thing, is_number


if compat.PY3:
@@ -284,3 +287,148 @@ def ZipFile(*args, **kwargs):
yield zf
else:
ZipFile = zipfile.ZipFile


def _get_handle(path, mode, encoding=None, compression=None):
"""Gets file handle for given path and mode.
"""
if compression is not None:
if encoding is not None and not compat.PY3:
msg = 'encoding + compression not yet supported in Python 2'
raise ValueError(msg)

if compression == 'gzip':
import gzip
f = gzip.GzipFile(path, mode)
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, mode)
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
else:
if compat.PY3:
if encoding:
f = open(path, mode, encoding=encoding)
else:
f = open(path, mode, errors='replace')
else:
f = open(path, mode)

return f


class UTF8Recoder:

"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""

def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)

def __iter__(self):
return self

def read(self, bytes=-1):
return self.reader.read(bytes).encode("utf-8")

def readline(self):
return self.reader.readline().encode("utf-8")

def next(self):
return next(self.reader).encode("utf-8")

# Python 3 iterator
__next__ = next


if compat.PY3: # pragma: no cover
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)

def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
class UnicodeReader:

"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.

On Python 3, this is replaced (below) by csv.reader, which handles
unicode.
"""

def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)

def next(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]

# python 3 iterator
__next__ = next

def __iter__(self): # pragma: no cover
return self

class UnicodeWriter:

"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""

def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
self.quoting = kwds.get("quoting", None)

def writerow(self, row):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)

row = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]

self.writer.writerow([s for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)

def writerows(self, rows):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)

for i, row in enumerate(rows):
rows[i] = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]

self.writer.writerows([[s for s in row] for row in rows])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
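
A usage sketch for the relocated _get_handle, assuming a gzip-compressed text file exists at the hypothetical path below. On Python 3 the gzip handle is wrapped in a TextIOWrapper so the caller reads str rather than bytes; on Python 2 the combination of encoding and compression raises ValueError, per the guard above:

from pandas.io.common import _get_handle

# 'data.csv.gz' is a hypothetical path used only for illustration
f = _get_handle('data.csv.gz', 'rb', encoding='utf-8', compression='gzip')
try:
    first_line = f.readline()   # decoded text on Python 3
finally:
    f.close()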