Skip to content

Commit 833afea

Browse files
author
Pyry Kovanen
committed
Merge remote-tracking branch 'upstream/master' into empty-json-empty-df-fix
2 parents 743c08f + 0c65c57 commit 833afea

File tree

18 files changed

+192
-94
lines changed

18 files changed

+192
-94
lines changed

asv_bench/benchmarks/frame_methods.py

+18
Original file line numberDiff line numberDiff line change
@@ -512,3 +512,21 @@ def time_nlargest(self, keep):
512512

513513
def time_nsmallest(self, keep):
514514
self.df.nsmallest(100, 'A', keep=keep)
515+
516+
517+
class Describe(object):
518+
519+
goal_time = 0.2
520+
521+
def setup(self):
522+
self.df = DataFrame({
523+
'a': np.random.randint(0, 100, int(1e6)),
524+
'b': np.random.randint(0, 100, int(1e6)),
525+
'c': np.random.randint(0, 100, int(1e6))
526+
})
527+
528+
def time_series_describe(self):
529+
self.df['a'].describe()
530+
531+
def time_dataframe_describe(self):
532+
self.df.describe()

doc/source/ecosystem.rst

+3
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput
3838
Use pandas DataFrames in your `scikit-learn <http://scikit-learn.org/>`__
3939
ML pipeline.
4040

41+
`Featuretools <https://github.com/featuretools/featuretools/>`__
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4143

44+
Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.
4245

4346
.. _ecosystem.visualization:
4447

doc/source/whatsnew/v0.23.1.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Performance Improvements
3232
~~~~~~~~~~~~~~~~~~~~~~~~
3333

3434
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
35+
- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`)
3536
-
3637
-
3738

@@ -85,14 +86,18 @@ Indexing
8586
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
8687
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
8788
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
89+
- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
90+
- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
8891
-
8992

9093
I/O
9194
^^^
9295

9396
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9497
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
95-
- Bug in IO JSON :func:`read_json`reading empty JSON schema with ``orient='table'`` back to :class:DataFrame caused an error (:issue:`21287`)
98+
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` caused an encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
99+
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
100+
- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
96101

97102
Plotting
98103
^^^^^^^^

doc/source/whatsnew/v0.24.0.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ Removal of prior version deprecations/changes
6363
Performance Improvements
6464
~~~~~~~~~~~~~~~~~~~~~~~~
6565

66-
-
67-
-
66+
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
6867
-
6968

7069
.. _whatsnew_0240.docs:

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8519,7 +8519,7 @@ def describe_numeric_1d(series):
85198519
stat_index = (['count', 'mean', 'std', 'min'] +
85208520
formatted_percentiles + ['max'])
85218521
d = ([series.count(), series.mean(), series.std(), series.min()] +
8522-
[series.quantile(x) for x in percentiles] + [series.max()])
8522+
series.quantile(percentiles).tolist() + [series.max()])
85238523
return pd.Series(d, index=stat_index, name=series.name)
85248524

85258525
def describe_categorical_1d(data):

pandas/core/indexes/category.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def _engine(self):
378378
# introspection
379379
@cache_readonly
380380
def is_unique(self):
381-
return not self.duplicated().any()
381+
return self._engine.is_unique
382382

383383
@property
384384
def is_monotonic_increasing(self):

pandas/core/indexes/interval.py

+4
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values):
112112
-------
113113
array
114114
"""
115+
if is_categorical_dtype(values):
116+
# GH 21243/21253
117+
values = np.array(values)
118+
115119
if isinstance(values, (list, tuple)) and len(values) == 0:
116120
# GH 19016
117121
# empty lists/tuples get object dtype by default, but this is not

pandas/core/series.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1837,7 +1837,7 @@ def round(self, decimals=0, *args, **kwargs):
18371837

18381838
def quantile(self, q=0.5, interpolation='linear'):
18391839
"""
1840-
Return value at the given quantile, a la numpy.percentile.
1840+
Return value at the given quantile.
18411841
18421842
Parameters
18431843
----------
@@ -1876,6 +1876,7 @@ def quantile(self, q=0.5, interpolation='linear'):
18761876
See Also
18771877
--------
18781878
pandas.core.window.Rolling.quantile
1879+
numpy.percentile
18791880
"""
18801881

18811882
self._check_percentile(q)

pandas/core/strings.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21722172
21732173
Returns
21742174
-------
2175-
concat : str if `other is None`, Series/Index of objects if `others is
2176-
not None`. In the latter case, the result will remain categorical
2177-
if the calling Series/Index is categorical.
2175+
concat : str or Series/Index of objects
2176+
If `others` is None, `str` is returned, otherwise a `Series/Index`
2177+
(same type as caller) of objects is returned.
21782178
21792179
See Also
21802180
--------

pandas/io/formats/csvs.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300 uses string buffer to receive csv writing and dump into
132+
# file-like output with compression as option. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is file-like IO objects.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(f.name, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(f.name, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/io/stata.py

+26-36
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182182
preserve_dtypes=preserve_dtypes,
183183
columns=columns,
184184
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
185+
chunksize=chunksize)
186186

187187
if iterator or chunksize:
188188
data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
838838

839839

840840
class StataParser(object):
841-
_default_encoding = 'latin-1'
842841

843-
def __init__(self, encoding):
844-
if encoding is not None:
845-
if encoding not in VALID_ENCODINGS:
846-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
847-
'supported.')
848-
849-
self._encoding = encoding
842+
def __init__(self):
850843

851844
# type code.
852845
# --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
964957
convert_categoricals=True, index_col=None,
965958
convert_missing=False, preserve_dtypes=True,
966959
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
968-
super(StataReader, self).__init__(encoding)
960+
encoding=None, chunksize=None):
961+
super(StataReader, self).__init__()
969962
self.col_sizes = ()
970963

971964
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977970
self._preserve_dtypes = preserve_dtypes
978971
self._columns = columns
979972
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984973
self._encoding = encoding
985974
self._chunksize = chunksize
986975

@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998987
path_or_buf = _stringify_path(path_or_buf)
999988
if isinstance(path_or_buf, str):
1000989
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
990+
path_or_buf)
1003991

1004992
if isinstance(path_or_buf, (str, text_type, bytes)):
1005993
self.path_or_buf = open(path_or_buf, 'rb')
1006994
else:
1007995
# Copy to BytesIO, and ensure no encoding
1008996
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
1013997
self.path_or_buf = BytesIO(contents)
1014998

1015999
self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
10301014
except IOError:
10311015
pass
10321016

1017+
def _set_encoding(self):
1018+
"""
1019+
Set string encoding which depends on file version
1020+
"""
1021+
if self.format_version < 118:
1022+
self._encoding = 'latin-1'
1023+
else:
1024+
self._encoding = 'utf-8'
1025+
10331026
def _read_header(self):
10341027
first_char = self.path_or_buf.read(1)
10351028
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
10491042
self.format_version = int(self.path_or_buf.read(3))
10501043
if self.format_version not in [117, 118]:
10511044
raise ValueError(_version_error)
1045+
self._set_encoding()
10521046
self.path_or_buf.read(21) # </release><byteorder>
10531047
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541048
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
12351229
self.format_version = struct.unpack('b', first_char)[0]
12361230
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371231
raise ValueError(_version_error)
1232+
self._set_encoding()
12381233
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391234
0] == 0x1 and '>' or '<'
12401235
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
13381333
return s.decode('utf-8')
13391334

13401335
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1336+
# have bytes not strings, so must decode
1337+
s = s.partition(b"\0")[0]
1338+
return s.decode(self._encoding)
13511339

13521340
def _read_value_labels(self):
13531341
if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
14331421
self.path_or_buf.read(4))[0]
14341422
va = self.path_or_buf.read(length)
14351423
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1424+
va = va[0:-1].decode(self._encoding)
14401425
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411426
self.GSO[str(v_o)] = va
14421427

@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
19801965
def __init__(self, fname, data, convert_dates=None, write_index=True,
19811966
encoding="latin-1", byteorder=None, time_stamp=None,
19821967
data_label=None, variable_labels=None):
1983-
super(StataWriter, self).__init__(encoding)
1968+
super(StataWriter, self).__init__()
19841969
self._convert_dates = {} if convert_dates is None else convert_dates
19851970
self._write_index = write_index
1971+
if encoding is not None:
1972+
if encoding not in VALID_ENCODINGS:
1973+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1974+
'supported.')
1975+
self._encoding = encoding
19861976
self._time_stamp = time_stamp
19871977
self._data_label = data_label
19881978
self._variable_labels = variable_labels

pandas/tests/frame/test_to_csv.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

0 commit comments

Comments
 (0)