Commit baee6b2

Merge remote-tracking branch 'upstream/master' into ea-repr
2 parents: 5d8d2fc + 383d052

File tree: 235 files changed, +2291 -2145 lines


ci/code_checks.sh (+1 -1)

@@ -151,7 +151,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

    MSG='Doctests generic.py' ; echo $MSG
    pytest -q --doctest-modules pandas/core/generic.py \
-        -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs"
+        -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs"
    RET=$(($RET + $?)) ; echo $MSG "DONE"

    MSG='Doctests top-level reshaping functions' ; echo $MSG

doc/source/io.rst (+37)

@@ -4673,6 +4673,43 @@ Passing ``index=True`` will *always* write the index, even if that's not the
underlying engine's default behavior.


+Partitioning Parquet files
+''''''''''''''''''''''''''
+
+.. versionadded:: 0.24.0
+
+Parquet supports partitioning of data based on the values of one or more columns.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
+   df.to_parquet(fname='test', engine='pyarrow', partition_cols=['a'], compression=None)
+
+The `fname` specifies the parent directory to which data will be saved.
+The `partition_cols` are the column names by which the dataset will be partitioned.
+Columns are partitioned in the order they are given. The partition splits are
+determined by the unique values in the partition columns.
+The above example creates a partitioned dataset that may look like:
+
+.. code-block:: text
+
+   test
+   ├── a=0
+   │   ├── 0bac803e32dc42ae83fddfd029cbdebc.parquet
+   │   └── ...
+   └── a=1
+       ├── e6ab24a4f45147b49b54a662f0c412a3.parquet
+       └── ...
+
+.. ipython:: python
+   :suppress:
+
+   from shutil import rmtree
+   try:
+       rmtree('test')
+   except Exception:
+       pass
+
.. _io.sql:

SQL Queries
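As a quick aid to readers trying the new option, here is a minimal round-trip sketch. It assumes pyarrow is installed and that no `test` directory exists yet; note that a partition column typically round-trips as a categorical of strings when the directory is read back.

    import pandas as pd

    # Write a frame partitioned by column 'a'; one subdirectory per unique value.
    df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
    df.to_parquet(fname='test', engine='pyarrow', partition_cols=['a'],
                  compression=None)

    # Reading the directory reassembles the partitions into a single frame.
    result = pd.read_parquet('test', engine='pyarrow')
    print(result)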

doc/source/whatsnew/v0.24.0.txt (+1)

@@ -236,6 +236,7 @@ Other Enhancements
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
+- :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`)
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)

.. _whatsnew_0240.api_breaking:

pandas/core/arrays/datetimelike.py (+2 -7)

@@ -66,7 +66,7 @@ def cmp_method(self, other):
        with warnings.catch_warnings(record=True):
            warnings.filterwarnings("ignore", "elementwise", FutureWarning)
            with np.errstate(all='ignore'):
-                result = op(self.values, np.asarray(other))
+                result = op(self._data, np.asarray(other))

        return result

@@ -119,15 +119,10 @@ def _box_values(self, values):
    def __iter__(self):
        return (self._box_func(v) for v in self.asi8)

-    @property
-    def values(self):
-        """ return the underlying data as an ndarray """
-        return self._data.view(np.ndarray)
-
    @property
    def asi8(self):
        # do not cache or you'll create a memory leak
-        return self.values.view('i8')
+        return self._data.view('i8')

    # ------------------------------------------------------------------
    # Array-like Methods
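The net effect of this refactor is that internal callers go through `_data` directly instead of the removed `values` property, while `asi8` keeps reinterpreting the same buffer as int64. A standalone sketch of that zero-copy view trick, in plain NumPy rather than pandas internals:

    import numpy as np

    # datetime64[ns] data viewed as int64 nanoseconds since the epoch;
    # .view() shares the underlying buffer, so no copy is made.
    data = np.array(['2018-01-01', '2018-01-02'], dtype='datetime64[ns]')
    i8 = data.view('i8')
    print(i8)  # [1514764800000000000 1514851200000000000]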

pandas/core/arrays/datetimes.py (+1 -1)

@@ -886,7 +886,7 @@ def to_period(self, freq=None):

        freq = get_period_alias(freq)

-        return PeriodArray._from_datetime64(self.values, freq, tz=self.tz)
+        return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)

    def to_perioddelta(self, freq):
        """

pandas/core/arrays/timedeltas.py (+1 -1)

@@ -81,7 +81,7 @@ def wrapper(self, other):
            raise TypeError(msg.format(cls=type(self).__name__,
                                       typ=type(other).__name__))
        else:
-            other = type(self)(other).values
+            other = type(self)(other)._data
        result = meth(self, other)
        result = com.values_from_object(result)

pandas/core/frame.py (+14 -3)

@@ -1970,7 +1970,7 @@ def to_feather(self, fname):
        to_feather(self, fname)

    def to_parquet(self, fname, engine='auto', compression='snappy',
-                   index=None, **kwargs):
+                   index=None, partition_cols=None, **kwargs):
        """
        Write a DataFrame to the binary parquet format.

@@ -1984,7 +1984,11 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
        Parameters
        ----------
        fname : str
-            String file path.
+            File path or Root Directory path. Will be used as Root Directory
+            path while writing a partitioned dataset.
+
+            .. versionchanged:: 0.24.0
+
        engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
            Parquet library to use. If 'auto', then the option
            ``io.parquet.engine`` is used. The default ``io.parquet.engine``

@@ -1999,6 +2003,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',

            .. versionadded:: 0.24.0

+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset
+            Columns are partitioned in the order they are given
+
+            .. versionadded:: 0.24.0
+
        **kwargs
            Additional arguments passed to the parquet library. See
            :ref:`pandas io <io.parquet>` for more details.

@@ -2027,7 +2037,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
        """
        from pandas.io.parquet import to_parquet
        to_parquet(self, fname, engine,
-                   compression=compression, index=index, **kwargs)
+                   compression=compression, index=index,
+                   partition_cols=partition_cols, **kwargs)

    @Substitution(header='Write out the column names. If a list of strings '
                         'is given, it is assumed to be aliases for the '
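Because `partition_cols` preserves the order given, the directory nesting can be controlled directly. A small sketch assuming the pyarrow engine is available; the frame and column names are invented for illustration:

    import pandas as pd

    # Two partition columns nest directories in the order listed, e.g.
    # dataset/year=2018/month=1/<...>.parquet, dataset/year=2019/month=1/...
    df = pd.DataFrame({'year': [2018, 2018, 2019],
                       'month': [1, 2, 1],
                       'value': [1.0, 2.0, 3.0]})
    df.to_parquet(fname='dataset', engine='pyarrow',
                  partition_cols=['year', 'month'])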
