Skip to content

Commit 4306803

Browse files
committed
Add "split blocks" and "self destruct" options to Table.to_pandas, with
zero-copy operations for improved memory use when converting from Arrow to pandas
1 parent 9df2272 commit 4306803

File tree

12 files changed

+1343
-1383
lines changed

12 files changed

+1343
-1383
lines changed

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 1063 additions & 1260 deletions
Large diffs are not rendered by default.

cpp/src/arrow/python/arrow_to_pandas.h

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818
// Functions for converting between pandas's NumPy-based data representation
1919
// and Arrow data structures
2020

21-
#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H
22-
#define ARROW_PYTHON_ADAPTERS_PANDAS_H
21+
#pragma once
2322

2423
#include "arrow/python/platform.h"
2524

@@ -53,26 +52,42 @@ struct PandasOptions {
5352
bool date_as_object = false;
5453
bool use_threads = false;
5554

55+
/// Coerce all date and timestamp to datetime64[ns]
56+
bool coerce_temporal_nanoseconds = false;
57+
5658
/// \brief If true, do not create duplicate PyObject versions of equal
5759
/// objects. This only applies to immutable objects like strings or datetime
5860
/// objects
5961
bool deduplicate_objects = false;
62+
63+
/// \brief If true, create one block per column rather than consolidated
64+
/// blocks (1 per data type). Do zero-copy wrapping when there are no
65+
/// nulls. When using this, you may want to set pandas's consolidation policy
66+
/// to leave the blocks split
67+
bool split_blocks = false;
68+
69+
/// \brief If true, attempt to deallocate buffers in passed Arrow object if
70+
/// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
71+
/// original context for this feature. Only currently implemented for Table
72+
/// conversions
73+
bool self_destruct = false;
74+
75+
// Columns that should be casted to categorical
76+
std::unordered_set<std::string> categorical_columns;
77+
78+
// Columns that should be passed through to be converted to
79+
// ExtensionArray/Block
80+
std::unordered_set<std::string> extension_columns;
6081
};
6182

6283
ARROW_PYTHON_EXPORT
63-
Status ConvertArrayToPandas(const PandasOptions& options,
64-
const std::shared_ptr<Array>& arr, PyObject* py_ref,
65-
PyObject** out);
84+
Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
85+
PyObject* py_ref, PyObject** out);
6686

6787
ARROW_PYTHON_EXPORT
6888
Status ConvertChunkedArrayToPandas(const PandasOptions& options,
69-
const std::shared_ptr<ChunkedArray>& col,
70-
PyObject* py_ref, PyObject** out);
71-
72-
ARROW_PYTHON_EXPORT
73-
Status ConvertColumnToPandas(const PandasOptions& options,
74-
const std::shared_ptr<Column>& col, PyObject* py_ref,
75-
PyObject** out);
89+
std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
90+
PyObject** out);
7691

7792
// Convert a whole table as efficiently as possible to a pandas.DataFrame.
7893
//
@@ -81,25 +96,8 @@ Status ConvertColumnToPandas(const PandasOptions& options,
8196
//
8297
// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
8398
ARROW_PYTHON_EXPORT
84-
Status ConvertTableToPandas(const PandasOptions& options,
85-
const std::shared_ptr<Table>& table, PyObject** out);
86-
87-
/// Convert a whole table as efficiently as possible to a pandas.DataFrame.
88-
///
89-
/// Explicitly name columns that should be a categorical
90-
/// This option is only used on conversions that are applied to a table.
91-
ARROW_PYTHON_EXPORT
92-
Status ConvertTableToPandas(const PandasOptions& options,
93-
const std::unordered_set<std::string>& categorical_columns,
94-
const std::shared_ptr<Table>& table, PyObject** out);
95-
96-
ARROW_PYTHON_EXPORT
97-
Status ConvertTableToPandas(const PandasOptions& options,
98-
const std::unordered_set<std::string>& categorical_columns,
99-
const std::unordered_set<std::string>& extension_columns,
100-
const std::shared_ptr<Table>& table, PyObject** out);
99+
Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
100+
PyObject** out);
101101

102102
} // namespace py
103103
} // namespace arrow
104-
105-
#endif // ARROW_PYTHON_ADAPTERS_PANDAS_H

cpp/src/arrow/python/type_traits.h

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232

3333
namespace arrow {
3434
namespace py {
35+
36+
static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
37+
constexpr int64_t kNanosecondsInDay = 86400000000000LL;
38+
3539
namespace internal {
3640

3741
//
@@ -86,6 +90,8 @@ struct npy_traits<NPY_FLOAT16> {
8690
using TypeClass = HalfFloatType;
8791
using BuilderClass = HalfFloatBuilder;
8892

93+
static constexpr npy_half na_sentinel = NPY_HALF_NAN;
94+
8995
static constexpr bool supports_nulls = true;
9096

9197
static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; }
@@ -97,6 +103,8 @@ struct npy_traits<NPY_FLOAT32> {
97103
using TypeClass = FloatType;
98104
using BuilderClass = FloatBuilder;
99105

106+
static constexpr float na_sentinel = NAN;
107+
100108
static constexpr bool supports_nulls = true;
101109

102110
static inline bool isnull(float v) { return v != v; }
@@ -108,6 +116,8 @@ struct npy_traits<NPY_FLOAT64> {
108116
using TypeClass = DoubleType;
109117
using BuilderClass = DoubleBuilder;
110118

119+
static constexpr double na_sentinel = NAN;
120+
111121
static constexpr bool supports_nulls = true;
112122

113123
static inline bool isnull(double v) { return v != v; }
@@ -208,10 +218,6 @@ struct arrow_traits<Type::DOUBLE> {
208218
typedef typename npy_traits<NPY_FLOAT64>::value_type T;
209219
};
210220

211-
static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
212-
213-
constexpr int64_t kNanosecondsInDay = 86400000000000LL;
214-
215221
template <>
216222
struct arrow_traits<Type::TIMESTAMP> {
217223
static constexpr int npy_type = NPY_DATETIME;
@@ -287,6 +293,21 @@ struct arrow_traits<Type::BINARY> {
287293
static constexpr bool supports_nulls = true;
288294
};
289295

296+
static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {
297+
switch (unit) {
298+
case TimestampType::Unit::SECOND:
299+
return NPY_FR_s;
300+
case TimestampType::Unit::MILLI:
301+
return NPY_FR_ms;
302+
break;
303+
case TimestampType::Unit::MICRO:
304+
return NPY_FR_us;
305+
default:
306+
// NANO
307+
return NPY_FR_ns;
308+
}
309+
}
310+
290311
static inline int NumPyTypeSize(int npy_type) {
291312
npy_type = fix_numpy_type_num(npy_type);
292313

cpp/src/arrow/table.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,22 @@ class SimpleTable : public Table {
425425

426426
Table::Table() : num_rows_(0) {}
427427

428+
std::vector<std::shared_ptr<ChunkedArray>> Table::columns() const {
429+
std::vector<std::shared_ptr<ChunkedArray>> result;
430+
for (int i = 0; i < this->num_columns(); ++i) {
431+
result.emplace_back(this->column(i));
432+
}
433+
return result;
434+
}
435+
436+
std::vector<std::shared_ptr<Field>> Table::fields() const {
437+
std::vector<std::shared_ptr<Field>> result;
438+
for (int i = 0; i < this->num_columns(); ++i) {
439+
result.emplace_back(this->field(i));
440+
}
441+
return result;
442+
}
443+
428444
std::shared_ptr<Table> Table::Make(
429445
const std::shared_ptr<Schema>& schema,
430446
const std::vector<std::shared_ptr<ChunkedArray>>& columns, int64_t num_rows) {

cpp/src/arrow/table.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,15 +185,21 @@ class ARROW_EXPORT Table {
185185
static Status FromChunkedStructArray(const std::shared_ptr<ChunkedArray>& array,
186186
std::shared_ptr<Table>* table);
187187

188-
/// Return the table schema
188+
/// \brief Return the table schema
189189
std::shared_ptr<Schema> schema() const { return schema_; }
190190

191-
/// Return a column by index
191+
/// \brief Return a column by index
192192
virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
193193

194+
/// \brief Return vector of all columns for table
195+
std::vector<std::shared_ptr<ChunkedArray>> columns() const;
196+
194197
/// Return a column's field by index
195198
std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
196199

200+
/// \brief Return vector of all fields for table
201+
std::vector<std::shared_ptr<Field>> fields() const;
202+
197203
/// \brief Construct a zero-copy slice of the table with the
198204
/// indicated offset and length
199205
///

cpp/src/arrow/table_test.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,29 @@ TEST_F(TestTable, InvalidColumns) {
273273
ASSERT_RAISES(Invalid, table_->ValidateFull());
274274
}
275275

276+
TEST_F(TestTable, AllColumnsAndFields) {
277+
const int length = 100;
278+
MakeExample1(length);
279+
table_ = Table::Make(schema_, columns_);
280+
281+
auto columns = table_->columns();
282+
auto fields = table_->fields();
283+
284+
for (int i = 0; i < table_->num_columns(); ++i) {
285+
AssertChunkedEqual(*table_->column(i), *columns[i]);
286+
AssertFieldEqual(*table_->field(i), *fields[i]);
287+
}
288+
289+
// Zero length
290+
std::vector<std::shared_ptr<Array>> t2_columns;
291+
auto t2 = Table::Make(::arrow::schema({}), t2_columns);
292+
columns = t2->columns();
293+
fields = t2->fields();
294+
295+
ASSERT_EQ(0, columns.size());
296+
ASSERT_EQ(0, fields.size());
297+
}
298+
276299
TEST_F(TestTable, Equals) {
277300
const int length = 100;
278301
MakeExample1(length);

cpp/src/arrow/type_traits.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,19 @@ static inline bool is_primitive(Type::type type_id) {
703703
return false;
704704
}
705705

706+
static inline bool is_base_binary_like(Type::type type_id) {
707+
switch (type_id) {
708+
case Type::BINARY:
709+
case Type::LARGE_BINARY:
710+
case Type::STRING:
711+
case Type::LARGE_STRING:
712+
return true;
713+
default:
714+
break;
715+
}
716+
return false;
717+
}
718+
706719
static inline bool is_binary_like(Type::type type_id) {
707720
switch (type_id) {
708721
case Type::BINARY:

0 commit comments

Comments
 (0)