Commit 3b000b7
ARROW-3789: [Python] Use common conversion path for Arrow to pandas.Series/DataFrame. Zero-copy optimizations for DataFrame; add split_blocks and self_destruct options

The primary goal of this patch is to give some users a way to avoid memory doubling when converting from Arrow to pandas. This took me entirely too much time to get right, but partly I was attempting to disentangle some of the technical debt and overdue refactoring in arrow_to_pandas.cc. Summary of what's here:

- Refactor the ChunkedArray->Series and Table->DataFrame conversion paths to use exactly the same code, rather than two implementations of the same thing with slightly different behavior. The `ArrowDeserializer` helper class is now gone.
- Do zero-copy construction of internal DataFrame blocks for the case of a contiguous non-nullable array and a block with only one column represented.
- Add a `split_blocks` option to `to_pandas` which constructs one block per DataFrame column, resulting in more zero-copy opportunities. Note that pandas's internal "consolidation" can still cause memory doubling (see the discussion in pandas-dev/pandas#10556).
- Add a `self_destruct` option to `to_pandas` which releases the Table's internal buffers as soon as they are converted to the required pandas structure. This allows memory to be reclaimed by the OS while conversion is taking place, rather than forcing a memory doubling followed by after-the-fact reclamation (which has been causing OOM for some users).

The most conservative invocation of `to_pandas` is now `table.to_pandas(use_threads=False, split_blocks=True, self_destruct=True)`.

Note that the self_destruct option makes the `Table` object unsafe for further use. This is a bit dissatisfying, but I wasn't sure how else to provide this capability.

Closes #6067 from wesm/ARROW-3789 and squashes the following commits:

3b42602 <Wes McKinney> Code review comments
8f39cce <Wes McKinney> Add some documentation. Try fixing MSVC warnings
c22d280 <Wes McKinney> Fix one MSVC cast warning
4306803 <Wes McKinney> Add "split blocks" and "self destruct" options to Table.to_pandas, with zero-copy operations for improved memory use when converting from Arrow to pandas

Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
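The self_destruct strategy described above can be sketched in plain Python without pyarrow: the converter drops its reference to each source column as soon as that column's converted form exists, so the two full copies never coexist. `convert_table` and its toy bytearray "columns" are illustrative stand-ins, not Arrow APIs.

```python
# Toy model of the self_destruct conversion strategy: release each source
# column's buffer as soon as its converted form exists. Names here are
# illustrative stand-ins, not actual Arrow APIs.

def convert_table(columns, self_destruct=False):
    """Convert a list of 'columns' (bytearrays) to converted copies,
    optionally destroying each source column right after conversion."""
    converted = []
    for i, col in enumerate(columns):
        converted.append(bytes(col))  # the "pandas" copy of this column
        if self_destruct:
            columns[i] = None  # drop the Arrow-side reference immediately
    return converted

cols = [bytearray(b"a" * 4), bytearray(b"b" * 4)]
result = convert_table(cols, self_destruct=True)
assert cols == [None, None]           # source buffers released incrementally
assert result == [b"aaaa", b"bbbb"]   # converted data survives
```

This is why self_destruct leaves the source object unusable afterward: the buffers are gone by design.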
1 parent b4c72fe commit 3b000b7

File tree: 13 files changed (+1419 −1384 lines)

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 1065 additions & 1261 deletions
Large diffs are not rendered by default.

cpp/src/arrow/python/arrow_to_pandas.h

Lines changed: 30 additions & 31 deletions
@@ -18,8 +18,7 @@
 // Functions for converting between pandas's NumPy-based data representation
 // and Arrow data structures

-#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H
-#define ARROW_PYTHON_ADAPTERS_PANDAS_H
+#pragma once

 #include "arrow/python/platform.h"

@@ -53,26 +52,43 @@ struct PandasOptions {
   bool date_as_object = false;
   bool use_threads = false;

+  /// Coerce all date and timestamp to datetime64[ns]
+  bool coerce_temporal_nanoseconds = false;
+
   /// \brief If true, do not create duplicate PyObject versions of equal
   /// objects. This only applies to immutable objects like strings or datetime
   /// objects
   bool deduplicate_objects = false;
+
+  /// \brief If true, create one block per column rather than consolidated
+  /// blocks (1 per data type). Do zero-copy wrapping when there are no
+  /// nulls. pandas currently will consolidate the blocks on its own, causing
+  /// increased memory use, so keep this in mind if you are working in a
+  /// memory-constrained situation.
+  bool split_blocks = false;
+
+  /// \brief If true, attempt to deallocate buffers in passed Arrow object if
+  /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
+  /// original context for this feature. Only currently implemented for Table
+  /// conversions
+  bool self_destruct = false;
+
+  // Columns that should be casted to categorical
+  std::unordered_set<std::string> categorical_columns;
+
+  // Columns that should be passed through to be converted to
+  // ExtensionArray/Block
+  std::unordered_set<std::string> extension_columns;
 };

 ARROW_PYTHON_EXPORT
-Status ConvertArrayToPandas(const PandasOptions& options,
-                            const std::shared_ptr<Array>& arr, PyObject* py_ref,
-                            PyObject** out);
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+                            PyObject* py_ref, PyObject** out);

 ARROW_PYTHON_EXPORT
 Status ConvertChunkedArrayToPandas(const PandasOptions& options,
-                                   const std::shared_ptr<ChunkedArray>& col,
-                                   PyObject* py_ref, PyObject** out);
-
-ARROW_PYTHON_EXPORT
-Status ConvertColumnToPandas(const PandasOptions& options,
-                             const std::shared_ptr<Column>& col, PyObject* py_ref,
-                             PyObject** out);
+                                   std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
+                                   PyObject** out);

 // Convert a whole table as efficiently as possible to a pandas.DataFrame.
 //
@@ -81,25 +97,8 @@ Status ConvertColumnToPandas(const PandasOptions& options,
 //
 // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
 ARROW_PYTHON_EXPORT
-Status ConvertTableToPandas(const PandasOptions& options,
-                            const std::shared_ptr<Table>& table, PyObject** out);
-
-/// Convert a whole table as efficiently as possible to a pandas.DataFrame.
-///
-/// Explicitly name columns that should be a categorical
-/// This option is only used on conversions that are applied to a table.
-ARROW_PYTHON_EXPORT
-Status ConvertTableToPandas(const PandasOptions& options,
-                            const std::unordered_set<std::string>& categorical_columns,
-                            const std::shared_ptr<Table>& table, PyObject** out);
-
-ARROW_PYTHON_EXPORT
-Status ConvertTableToPandas(const PandasOptions& options,
-                            const std::unordered_set<std::string>& categorical_columns,
-                            const std::unordered_set<std::string>& extension_columns,
-                            const std::shared_ptr<Table>& table, PyObject** out);
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out);

 }  // namespace py
 }  // namespace arrow
-
-#endif  // ARROW_PYTHON_ADAPTERS_PANDAS_H
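The new options in `PandasOptions` above surface in Python as keyword arguments to `to_pandas`. A stdlib-only Python mirror of the struct, illustrative rather than the real binding (the actual struct is C++ and lives in arrow_to_pandas.h):

```python
from dataclasses import dataclass, field
from typing import Set

# Illustrative Python mirror of the C++ PandasOptions struct above;
# field names and defaults follow the header, but this is not pyarrow code.
@dataclass
class PandasOptions:
    date_as_object: bool = False
    use_threads: bool = False
    coerce_temporal_nanoseconds: bool = False
    deduplicate_objects: bool = False
    split_blocks: bool = False    # one DataFrame block per column
    self_destruct: bool = False   # free Arrow buffers during conversion
    categorical_columns: Set[str] = field(default_factory=set)
    extension_columns: Set[str] = field(default_factory=set)

# The most conservative combination, per the commit message
opts = PandasOptions(use_threads=False, split_blocks=True, self_destruct=True)
assert opts.split_blocks and opts.self_destruct and not opts.use_threads
```

Note that both new options default to off, so existing callers see unchanged behavior.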

cpp/src/arrow/python/type_traits.h

Lines changed: 25 additions & 4 deletions
@@ -32,6 +32,10 @@

 namespace arrow {
 namespace py {
+
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+constexpr int64_t kNanosecondsInDay = 86400000000000LL;
+
 namespace internal {

 //
@@ -86,6 +90,8 @@ struct npy_traits<NPY_FLOAT16> {
   using TypeClass = HalfFloatType;
   using BuilderClass = HalfFloatBuilder;

+  static constexpr npy_half na_sentinel = NPY_HALF_NAN;
+
   static constexpr bool supports_nulls = true;

   static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; }
@@ -97,6 +103,8 @@ struct npy_traits<NPY_FLOAT32> {
   using TypeClass = FloatType;
   using BuilderClass = FloatBuilder;

+  static constexpr float na_sentinel = NAN;
+
   static constexpr bool supports_nulls = true;

   static inline bool isnull(float v) { return v != v; }
@@ -108,6 +116,8 @@ struct npy_traits<NPY_FLOAT64> {
   using TypeClass = DoubleType;
   using BuilderClass = DoubleBuilder;

+  static constexpr double na_sentinel = NAN;
+
   static constexpr bool supports_nulls = true;

   static inline bool isnull(double v) { return v != v; }
@@ -208,10 +218,6 @@ struct arrow_traits<Type::DOUBLE> {
   typedef typename npy_traits<NPY_FLOAT64>::value_type T;
 };

-static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
-
-constexpr int64_t kNanosecondsInDay = 86400000000000LL;
-
 template <>
 struct arrow_traits<Type::TIMESTAMP> {
   static constexpr int npy_type = NPY_DATETIME;
@@ -287,6 +293,21 @@ struct arrow_traits<Type::BINARY> {
   static constexpr bool supports_nulls = true;
 };

+static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {
+  switch (unit) {
+    case TimestampType::Unit::SECOND:
+      return NPY_FR_s;
+    case TimestampType::Unit::MILLI:
+      return NPY_FR_ms;
+    case TimestampType::Unit::MICRO:
+      return NPY_FR_us;
+    default:
+      // NANO
+      return NPY_FR_ns;
+  }
+}
+
 static inline int NumPyTypeSize(int npy_type) {
   npy_type = fix_numpy_type_num(npy_type);
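The `NumPyFrequency` helper above is a straight mapping from Arrow time units to NumPy datetime64 frequency codes, with nanoseconds as the fallback. An illustrative Python analogue (unit names and the helper are stand-ins, not pyarrow APIs):

```python
# Illustrative Python analogue of the NumPyFrequency helper above: map an
# Arrow time unit to the corresponding NumPy datetime64 frequency code.
NUMPY_FREQUENCY = {
    "second": "s",
    "milli": "ms",
    "micro": "us",
    "nano": "ns",
}

def numpy_frequency(unit):
    # Unknown units fall back to nanoseconds, matching the C++ default case.
    return NUMPY_FREQUENCY.get(unit, "ns")

assert numpy_frequency("milli") == "ms"
assert numpy_frequency("unknown") == "ns"
```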

cpp/src/arrow/table.cc

Lines changed: 16 additions & 0 deletions
@@ -425,6 +425,22 @@ class SimpleTable : public Table {

 Table::Table() : num_rows_(0) {}

+std::vector<std::shared_ptr<ChunkedArray>> Table::columns() const {
+  std::vector<std::shared_ptr<ChunkedArray>> result;
+  for (int i = 0; i < this->num_columns(); ++i) {
+    result.emplace_back(this->column(i));
+  }
+  return result;
+}
+
+std::vector<std::shared_ptr<Field>> Table::fields() const {
+  std::vector<std::shared_ptr<Field>> result;
+  for (int i = 0; i < this->num_columns(); ++i) {
+    result.emplace_back(this->field(i));
+  }
+  return result;
+}
+
 std::shared_ptr<Table> Table::Make(
     const std::shared_ptr<Schema>& schema,
     const std::vector<std::shared_ptr<ChunkedArray>>& columns, int64_t num_rows) {
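The new `Table::columns()` and `Table::fields()` accessors simply collect the existing per-index accessors into vectors. The same pattern in Python terms, with `MiniTable` as an illustrative stand-in rather than the pyarrow Table class:

```python
# Illustrative model of Table::columns() / Table::fields(): gather the
# per-index accessors into lists. MiniTable is a stand-in, not pyarrow.
class MiniTable:
    def __init__(self, names, data):
        self._names = list(names)  # plays the role of schema fields
        self._data = list(data)    # plays the role of chunked columns

    def num_columns(self):
        return len(self._data)

    def column(self, i):
        return self._data[i]

    def field(self, i):
        return self._names[i]

    def columns(self):
        return [self.column(i) for i in range(self.num_columns())]

    def fields(self):
        return [self.field(i) for i in range(self.num_columns())]

t = MiniTable(["a", "b"], [[1, 2], [3, 4]])
assert t.columns() == [[1, 2], [3, 4]]
assert t.fields() == ["a", "b"]
assert MiniTable([], []).columns() == []  # zero-length case, as in the test
```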

cpp/src/arrow/table.h

Lines changed: 8 additions & 2 deletions
@@ -185,15 +185,21 @@ class ARROW_EXPORT Table {
   static Status FromChunkedStructArray(const std::shared_ptr<ChunkedArray>& array,
                                        std::shared_ptr<Table>* table);

-  /// Return the table schema
+  /// \brief Return the table schema
   std::shared_ptr<Schema> schema() const { return schema_; }

-  /// Return a column by index
+  /// \brief Return a column by index
   virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;

+  /// \brief Return vector of all columns for table
+  std::vector<std::shared_ptr<ChunkedArray>> columns() const;
+
   /// Return a column's field by index
   std::shared_ptr<Field> field(int i) const { return schema_->field(i); }

+  /// \brief Return vector of all fields for table
+  std::vector<std::shared_ptr<Field>> fields() const;
+
   /// \brief Construct a zero-copy slice of the table with the
   /// indicated offset and length
   ///

cpp/src/arrow/table_test.cc

Lines changed: 23 additions & 0 deletions
@@ -273,6 +273,29 @@ TEST_F(TestTable, InvalidColumns) {
   ASSERT_RAISES(Invalid, table_->ValidateFull());
 }

+TEST_F(TestTable, AllColumnsAndFields) {
+  const int length = 100;
+  MakeExample1(length);
+  table_ = Table::Make(schema_, columns_);
+
+  auto columns = table_->columns();
+  auto fields = table_->fields();
+
+  for (int i = 0; i < table_->num_columns(); ++i) {
+    AssertChunkedEqual(*table_->column(i), *columns[i]);
+    AssertFieldEqual(*table_->field(i), *fields[i]);
+  }
+
+  // Zero length
+  std::vector<std::shared_ptr<Array>> t2_columns;
+  auto t2 = Table::Make(::arrow::schema({}), t2_columns);
+  columns = t2->columns();
+  fields = t2->fields();
+
+  ASSERT_EQ(0, columns.size());
+  ASSERT_EQ(0, fields.size());
+}
+
 TEST_F(TestTable, Equals) {
   const int length = 100;
   MakeExample1(length);

cpp/src/arrow/type_traits.h

Lines changed: 13 additions & 0 deletions
@@ -703,6 +703,19 @@ static inline bool is_primitive(Type::type type_id) {
   return false;
 }

+static inline bool is_base_binary_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+    case Type::STRING:
+    case Type::LARGE_STRING:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
 static inline bool is_binary_like(Type::type type_id) {
   switch (type_id) {
     case Type::BINARY:
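The new `is_base_binary_like` predicate above widens the binary-like check to include the 64-bit-offset "large" variants. As a quick Python sketch of the same predicate (type-id strings are illustrative stand-ins for the C++ `Type::type` enum):

```python
# Illustrative Python analogue of is_base_binary_like above: membership in
# the set of binary-like types including the "large" (64-bit offset) variants.
BASE_BINARY_LIKE = {"binary", "large_binary", "string", "large_string"}

def is_base_binary_like(type_id):
    return type_id in BASE_BINARY_LIKE

assert is_base_binary_like("large_string")
assert not is_base_binary_like("int64")
```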

docs/source/python/pandas.rst

Lines changed: 72 additions & 0 deletions
@@ -221,3 +221,75 @@ Time types
 ~~~~~~~~~~

 TODO
+
+Memory Usage and Zero Copy
+--------------------------
+
+When converting from Arrow data structures to pandas objects using various
+``to_pandas`` methods, one must occasionally be mindful of issues related to
+performance and memory usage.
+
+Since pandas's internal data representation is generally different from the
+Arrow columnar format, zero copy conversions (where no memory allocation or
+computation is required) are only possible in certain limited cases.
+
+In the worst case scenario, calling ``to_pandas`` will result in two versions
+of the data in memory, one for Arrow and one for pandas, yielding approximately
+twice the memory footprint. We have implemented some mitigations for this case,
+particularly when creating large ``DataFrame`` objects, that we describe below.
+
+Zero Copy Series Conversions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Zero copy conversions from ``Array`` or ``ChunkedArray`` to NumPy arrays or
+pandas Series are possible in certain narrow cases:
+
+* The Arrow data is stored in an integer (signed or unsigned ``int8`` through
+  ``int64``) or floating point type (``float16`` through ``float64``). This
+  includes many numeric types as well as timestamps.
+* The Arrow data has no null values (since these are represented using bitmaps
+  which are not supported by pandas).
+* For ``ChunkedArray``, the data consists of a single chunk,
+  i.e. ``arr.num_chunks == 1``. Multiple chunks will always require a copy
+  because of pandas's contiguousness requirement.
+
+In these scenarios, ``to_pandas`` or ``to_numpy`` will be zero copy. In all
+other scenarios, a copy will be required.
+
+Reducing Memory Use in ``Table.to_pandas``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As of this writing, pandas applies a data management strategy called
+"consolidation" to collect like-typed DataFrame columns in two-dimensional
+NumPy arrays, referred to internally as "blocks". We have gone to great effort
+to construct the precise "consolidated" blocks so that pandas will not perform
+any further allocation or copies after we hand off the data to
+``pandas.DataFrame``. The obvious downside of this consolidation strategy is
+that it forces a "memory doubling".
+
+To limit the potential effects of "memory doubling" during
+``Table.to_pandas``, we provide a couple of options:
+
+* ``split_blocks=True``: when enabled, ``Table.to_pandas`` produces one
+  internal DataFrame "block" for each column, skipping the "consolidation"
+  step. Note that many pandas operations will trigger consolidation anyway,
+  but the peak memory use may be less than the worst case scenario of a full
+  memory doubling. As a result of this option, we are able to do zero copy
+  conversions of columns in the same cases where we can do zero copy with
+  ``Array`` and ``ChunkedArray``.
+* ``self_destruct=True``: this destroys the internal Arrow memory buffers in
+  each column of the ``Table`` object as they are converted to the
+  pandas-compatible representation, potentially releasing memory to the
+  operating system as soon as a column is converted. Note that this renders
+  the calling ``Table`` object unsafe for further use, and any further methods
+  called will cause your Python process to crash.
+
+Used together, the call
+
+.. code-block:: python
+
+   df = table.to_pandas(split_blocks=True, self_destruct=True)
+   del table  # not necessary, but a good practice
+
+will yield significantly lower memory usage in some scenarios. Without these
+options, ``to_pandas`` will always double memory.
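The consolidation-versus-split-blocks tradeoff the documentation describes can be modeled in plain Python: consolidating like-typed columns into one combined block necessarily copies every column, while split blocks can wrap each source buffer zero copy. This is a toy model using bytearrays and memoryviews, not pyarrow or pandas internals.

```python
# Toy model of pandas block consolidation vs. split_blocks. Consolidation
# copies every column into one combined block; split blocks can wrap each
# source buffer zero copy (modeled here with memoryview).

def consolidated_block(columns):
    # One combined buffer holding all columns: always a copy.
    block = bytearray()
    for col in columns:
        block += col
    return bytes(block)

def split_blocks(columns):
    # One block per column: a zero-copy view over each source buffer.
    return [memoryview(col) for col in columns]

cols = [bytearray(b"\x01" * 8), bytearray(b"\x02" * 8)]

blocks = split_blocks(cols)
cols[0][0] = 0xFF
assert blocks[0][0] == 0xFF  # views share memory with the source

combined = consolidated_block(cols)
cols[1][0] = 0x00
assert combined[8] == 0x02  # the consolidated copy is independent
```

The view-based path is why later mutation of the source shows through in the split case but not in the consolidated one: only the latter owns its own memory, at the cost of the extra copy.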
