
Commit 9b03947
ARROW-3928: [Python] Deduplicate Python objects when converting binary, string, date, time types to object arrays
This adds a `deduplicate_objects` option to all of the `to_pandas` methods. It works with string types, date types (when `date_as_object=True`), and time types. I also made it so that `ScalarMemoTable` can be used with `string_view`, for more efficient memoization in this case.

The default for `deduplicate_objects` is True. When the ratio of unique strings to the length of the array is low, not only does this use drastically less memory, it is also faster. I will write some benchmarks to show where the "crossover point" is, at which the overhead of hashing makes things slower.

Let's consider a simple case where we have 10,000,000 strings of length 10, but only 1000 unique values:

```
In [50]: import pandas.util.testing as tm

In [51]: unique_values = [tm.rands(10) for i in range(1000)]

In [52]: values = unique_values * 10000

In [53]: arr = pa.array(values)

In [54]: timeit arr.to_pandas()
236 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [55]: timeit arr.to_pandas(deduplicate_objects=False)
730 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```

Almost 3 times faster in this case. The difference in memory use is even more drastic:

```
In [44]: unique_values = [tm.rands(10) for i in range(1000)]

In [45]: values = unique_values * 10000

In [46]: arr = pa.array(values)

In [49]: %memit result11 = arr.to_pandas()
peak memory: 1505.89 MiB, increment: 76.27 MiB

In [50]: %memit result12 = arr.to_pandas(deduplicate_objects=False)
peak memory: 2202.29 MiB, increment: 696.11 MiB
```

As you can see, this is a huge problem. If our bug reports about Parquet memory use are any indication, users have been suffering from this issue for a long time.
When the strings are mostly unique, things are slower as expected, and peak memory use is higher because of the hash table:

```
In [17]: unique_values = [tm.rands(10) for i in range(500000)]

In [18]: values = unique_values * 2

In [19]: arr = pa.array(values)

In [20]: timeit result = arr.to_pandas()
177 ms ± 574 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [21]: timeit result = arr.to_pandas(deduplicate_objects=False)
70.1 ms ± 783 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [42]: %memit result8 = arr.to_pandas()
peak memory: 644.39 MiB, increment: 92.23 MiB

In [43]: %memit result9 = arr.to_pandas(deduplicate_objects=False)
peak memory: 610.85 MiB, increment: 58.41 MiB
```

In real-world work, many duplicated strings is the most common use case. Given the massive memory savings and moderate performance improvements, it makes sense to have this enabled by default.

Author: Wes McKinney <[email protected]>

Closes apache#3257 from wesm/ARROW-3928 and squashes the following commits:

d9a8870 <Wes McKinney> Prettier output
a00b51c <Wes McKinney> Add benchmarks for object deduplication
ca88b96 <Wes McKinney> Add Python unit tests, deduplicate for date and time types also when converting to Python objects
7a7873b <Wes McKinney> First working iteration of string deduplication when calling to_pandas
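The core idea behind `deduplicate_objects` — a memo table that hands back the already-created object for a repeated value — can be sketched in pure Python. This is an illustrative sketch only, not the actual C++ implementation; `convert_with_dedup` is a hypothetical name:

```python
def convert_with_dedup(values):
    """Create at most one Python object per distinct value."""
    memo = {}  # value -> the single object we already handed out for it
    out = []
    for v in values:
        if v in memo:
            obj = memo[v]  # reuse: no new object is created
        else:
            obj = v        # first sighting: materialize (here: reuse the input)
            memo[v] = obj
        out.append(obj)
    return out

# Equal-but-distinct string objects, built at runtime so CPython does not
# fold them into one interned object up front
raw = ["".join(["sp", "am"]) for _ in range(4)] + ["".join(["eg", "gs"])]
assert raw[0] is not raw[1]  # equal values, distinct objects

result = convert_with_dedup(raw)
assert result[0] is result[1] is result[2] is result[3]  # one shared object
assert len({id(x) for x in result}) == 2                 # only 2 objects total
```

With a low unique-to-total ratio, the output holds far fewer live objects than the input, which is where the memory savings in the numbers above come from.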
Parent: 0696eb5 · Commit: 9b03947

File tree

14 files changed: +409 −352 lines

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 156 additions & 130 deletions
Large diffs are not rendered by default.

cpp/src/arrow/python/arrow_to_pandas.h

Lines changed: 21 additions & 20 deletions
```diff
@@ -43,32 +43,32 @@ namespace py {
 
 struct PandasOptions {
   /// If true, we will convert all string columns to categoricals
-  bool strings_to_categorical;
-  bool zero_copy_only;
-  bool integer_object_nulls;
-  bool date_as_object;
-  bool use_threads;
-
-  PandasOptions()
-      : strings_to_categorical(false),
-        zero_copy_only(false),
-        integer_object_nulls(false),
-        date_as_object(false),
-        use_threads(false) {}
+  bool strings_to_categorical = false;
+  bool zero_copy_only = false;
+  bool integer_object_nulls = false;
+  bool date_as_object = false;
+  bool use_threads = false;
+
+  /// \brief If true, do not create duplicate PyObject versions of equal
+  /// objects. This only applies to immutable objects like strings or datetime
+  /// objects
+  bool deduplicate_objects = false;
 };
 
 ARROW_PYTHON_EXPORT
-Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr,
-                            PyObject* py_ref, PyObject** out);
+Status ConvertArrayToPandas(const PandasOptions& options,
+                            const std::shared_ptr<Array>& arr, PyObject* py_ref,
+                            PyObject** out);
 
 ARROW_PYTHON_EXPORT
-Status ConvertChunkedArrayToPandas(PandasOptions options,
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
                                    const std::shared_ptr<ChunkedArray>& col,
                                    PyObject* py_ref, PyObject** out);
 
 ARROW_PYTHON_EXPORT
-Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
-                             PyObject* py_ref, PyObject** out);
+Status ConvertColumnToPandas(const PandasOptions& options,
+                             const std::shared_ptr<Column>& col, PyObject* py_ref,
+                             PyObject** out);
 
 // Convert a whole table as efficiently as possible to a pandas.DataFrame.
 //
@@ -77,15 +77,16 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column
 //
 // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
 ARROW_PYTHON_EXPORT
-Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
-                            MemoryPool* pool, PyObject** out);
+Status ConvertTableToPandas(const PandasOptions& options,
+                            const std::shared_ptr<Table>& table, MemoryPool* pool,
+                            PyObject** out);
 
 /// Convert a whole table as efficiently as possible to a pandas.DataFrame.
 ///
 /// Explicitly name columns that should be a categorical
 /// This option is only used on conversions that are applied to a table.
 ARROW_PYTHON_EXPORT
-Status ConvertTableToPandas(PandasOptions options,
+Status ConvertTableToPandas(const PandasOptions& options,
                             const std::unordered_set<std::string>& categorical_columns,
                             const std::shared_ptr<Table>& table, MemoryPool* pool,
                             PyObject** out);
```
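For readers more at home in Python, the shape of the refactored options struct — defaults supplied by in-class initializers plus the new flag — corresponds roughly to a dataclass. `PandasOptionsSketch` is a hypothetical mirror, not part of the pyarrow API:

```python
from dataclasses import dataclass

@dataclass
class PandasOptionsSketch:
    # Mirrors the defaults in the C++ PandasOptions struct above
    strings_to_categorical: bool = False
    zero_copy_only: bool = False
    integer_object_nulls: bool = False
    date_as_object: bool = False
    use_threads: bool = False
    # New in this commit: reuse one PyObject per distinct immutable value
    deduplicate_objects: bool = False

opts = PandasOptionsSketch(deduplicate_objects=True)
assert opts.deduplicate_objects is True
assert opts.use_threads is False  # other defaults are untouched
```

Replacing the hand-written constructor with member initializers means a new flag needs exactly one line, which is what made adding `deduplicate_objects` so small a change here.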

cpp/src/arrow/type.cc

Lines changed: 3 additions & 4 deletions
```diff
@@ -137,12 +137,11 @@ std::string FixedSizeBinaryType::ToString() const {
 // ----------------------------------------------------------------------
 // Date types
 
-DateType::DateType(Type::type type_id, DateUnit unit)
-    : FixedWidthType(type_id), unit_(unit) {}
+DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {}
 
-Date32Type::Date32Type() : DateType(Type::DATE32, DateUnit::DAY) {}
+Date32Type::Date32Type() : DateType(Type::DATE32) {}
 
-Date64Type::Date64Type() : DateType(Type::DATE64, DateUnit::MILLI) {}
+Date64Type::Date64Type() : DateType(Type::DATE64) {}
 
 std::string Date64Type::ToString() const { return std::string("date64[ms]"); }
```

cpp/src/arrow/type.h

Lines changed: 6 additions & 3 deletions
```diff
@@ -600,17 +600,17 @@ enum class DateUnit : char { DAY = 0, MILLI = 1 };
 /// \brief Base type class for date data
 class ARROW_EXPORT DateType : public FixedWidthType {
  public:
-  DateUnit unit() const { return unit_; }
+  virtual DateUnit unit() const = 0;
 
  protected:
-  DateType(Type::type type_id, DateUnit unit);
-  DateUnit unit_;
+  explicit DateType(Type::type type_id);
 };
 
 /// Concrete type class for 32-bit date data (as number of days since UNIX epoch)
 class ARROW_EXPORT Date32Type : public DateType {
  public:
   static constexpr Type::type type_id = Type::DATE32;
+  static constexpr DateUnit UNIT = DateUnit::DAY;
 
   using c_type = int32_t;
 
@@ -622,12 +622,14 @@ class ARROW_EXPORT Date32Type : public DateType {
   std::string ToString() const override;
 
   std::string name() const override { return "date32"; }
+  DateUnit unit() const override { return UNIT; }
 };
 
 /// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch)
 class ARROW_EXPORT Date64Type : public DateType {
  public:
   static constexpr Type::type type_id = Type::DATE64;
+  static constexpr DateUnit UNIT = DateUnit::MILLI;
 
   using c_type = int64_t;
 
@@ -639,6 +641,7 @@ class ARROW_EXPORT Date64Type : public DateType {
   std::string ToString() const override;
 
   std::string name() const override { return "date64"; }
+  DateUnit unit() const override { return UNIT; }
 };
 
 struct TimeUnit {
```
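The date-type change moves the unit from a stored instance member to a pure virtual accessor backed by a per-subclass constant, so each concrete type carries its unit statically. The same pattern in Python, as a sketch with hypothetical class names (not the pyarrow type classes):

```python
from abc import ABC, abstractmethod
from enum import Enum

class DateUnit(Enum):
    DAY = 0
    MILLI = 1

class DateTypeSketch(ABC):
    # The unit is now a per-subclass constant exposed through an abstract
    # accessor, rather than a member set in the base-class constructor
    @property
    @abstractmethod
    def unit(self) -> DateUnit: ...

class Date32Sketch(DateTypeSketch):
    UNIT = DateUnit.DAY
    @property
    def unit(self):
        return self.UNIT

class Date64Sketch(DateTypeSketch):
    UNIT = DateUnit.MILLI
    @property
    def unit(self):
        return self.UNIT

assert Date32Sketch().unit is DateUnit.DAY
assert Date64Sketch().unit is DateUnit.MILLI
```

The payoff in the C++ version is that `Date32Type::UNIT` is a compile-time constant usable in templated conversion code, instead of a runtime member read.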

cpp/src/arrow/type_traits.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -371,6 +371,11 @@ template <typename T>
 using enable_if_boolean =
     typename std::enable_if<std::is_same<BooleanType, T>::value>::type;
 
+template <typename T>
+using enable_if_binary_like =
+    typename std::enable_if<std::is_base_of<BinaryType, T>::value ||
+                            std::is_base_of<FixedSizeBinaryType, T>::value>::type;
+
 template <typename T>
 using enable_if_fixed_size_binary =
     typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type;
```

cpp/src/arrow/util/hashing.h

Lines changed: 18 additions & 3 deletions
```diff
@@ -102,6 +102,18 @@ struct ScalarHelper<Scalar, AlgNum,
   }
 };
 
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<
+    Scalar, AlgNum,
+    typename std::enable_if<std::is_same<util::string_view, Scalar>::value>::type>
+    : public ScalarHelperBase<Scalar, AlgNum> {
+  // ScalarHelper specialization for util::string_view
+
+  static hash_t ComputeHash(const util::string_view& value) {
+    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
+  }
+};
+
 template <typename Scalar, uint64_t AlgNum>
 struct ScalarHelper<Scalar, AlgNum,
                     typename std::enable_if<std::is_floating_point<Scalar>::value>::type>
@@ -332,7 +344,7 @@ class ScalarMemoTable {
   explicit ScalarMemoTable(int64_t entries = 0)
       : hash_table_(static_cast<uint64_t>(entries)) {}
 
-  int32_t Get(const Scalar value) const {
+  int32_t Get(const Scalar& value) const {
     auto cmp_func = [value](const Payload* payload) -> bool {
       return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
     };
@@ -346,7 +358,7 @@ class ScalarMemoTable {
   }
 
   template <typename Func1, typename Func2>
-  int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) {
+  int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) {
     auto cmp_func = [value](const Payload* payload) -> bool {
       return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
     };
@@ -364,7 +376,7 @@ class ScalarMemoTable {
     return memo_index;
   }
 
-  int32_t GetOrInsert(const Scalar value) {
+  int32_t GetOrInsert(const Scalar& value) {
     return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {});
   }
 
@@ -389,6 +401,7 @@ class ScalarMemoTable {
     Scalar value;
     int32_t memo_index;
   };
+
   using HashTableType = HashTableTemplateType<Payload>;
   using HashTableEntry = typename HashTableType::Entry;
   HashTableType hash_table_;
@@ -621,9 +634,11 @@ class BinaryMemoTable {
   struct Payload {
     int32_t memo_index;
   };
+
   using HashTableType = HashTable<Payload>;
   using HashTableEntry = typename HashTable<Payload>::Entry;
   HashTableType hash_table_;
+
   std::vector<int32_t> offsets_;
   std::string values_;
```
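`ScalarMemoTable::Get` and `GetOrInsert` map each distinct scalar to a dense memo index; the signature change to `const Scalar&` simply avoids copying non-trivial scalars such as `string_view`. Their observable behavior, sketched as a hypothetical Python class (a dict stands in for the open-addressing hash table):

```python
class ScalarMemoSketch:
    """Sketch of ScalarMemoTable semantics: values map to dense memo indices."""

    def __init__(self):
        self._table = {}

    def get(self, value):
        # Returns the memo index, or -1 if the value has not been seen
        return self._table.get(value, -1)

    def get_or_insert(self, value, on_found=None, on_not_found=None):
        idx = self._table.get(value)
        if idx is None:
            idx = len(self._table)  # memo indices are assigned densely
            self._table[value] = idx
            if on_not_found:
                on_not_found(idx)
        elif on_found:
            on_found(idx)
        return idx

memo = ScalarMemoSketch()
assert memo.get_or_insert("a") == 0
assert memo.get_or_insert("b") == 1
assert memo.get_or_insert("a") == 0  # repeat returns the original index
assert memo.get("c") == -1
```

During conversion, the memo index is what lets a repeated string resolve to the PyObject created on its first appearance.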

python/benchmarks/convert_pandas.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -17,6 +17,8 @@
 
 import numpy as np
 import pandas as pd
+import pandas.util.testing as tm
+
 import pyarrow as pa
 
 
@@ -50,6 +52,26 @@ def time_to_series(self, n, dtype):
         self.arrow_data.to_pandas()
 
 
+class ToPandasStrings(object):
+
+    param_names = ('uniqueness', 'total')
+    params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
+    string_length = 25
+
+    def setup(self, uniqueness, total):
+        nunique = int(total * uniqueness)
+        unique_values = [tm.rands(self.string_length) for i in range(nunique)]
+        values = unique_values * (total // nunique)
+        self.arr = pa.array(values, type=pa.string())
+        self.table = pa.Table.from_arrays([self.arr], ['f0'])
+
+    def time_to_pandas_dedup(self, *args):
+        self.arr.to_pandas()
+
+    def time_to_pandas_no_dedup(self, *args):
+        self.arr.to_pandas(deduplicate_objects=False)
+
+
 class ZeroCopyPandasRead(object):
 
     def setup(self):
```
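The benchmark's setup builds `total` strings of which a fraction `uniqueness` are distinct. Extracted to plain Python as a sketch — using the stdlib in place of `pandas.util.testing.rands`, and without the pyarrow conversion itself:

```python
import random
import string

def make_benchmark_values(uniqueness, total, string_length=25):
    """Return `total` strings containing int(total * uniqueness) distinct values."""
    nunique = int(total * uniqueness)
    unique_values = [''.join(random.choices(string.ascii_letters, k=string_length))
                     for _ in range(nunique)]
    # Tile the unique pool so repeats dominate at low uniqueness ratios
    return unique_values * (total // nunique)

values = make_benchmark_values(uniqueness=0.01, total=10000)
assert len(values) == 10000
assert len(set(values)) == 100
```

Sweeping `uniqueness` from 0.001 to 0.5 is what locates the crossover point where hashing overhead outweighs the savings from reusing objects.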

python/pyarrow/array.pxi

Lines changed: 58 additions & 33 deletions
```diff
@@ -339,7 +339,61 @@ def _restore_array(data):
     return pyarrow_wrap_array(MakeArray(ad))
 
 
-cdef class Array:
+cdef class _PandasConvertible:
+
+    def to_pandas(self, categories=None, bint strings_to_categorical=False,
+                  bint zero_copy_only=False, bint integer_object_nulls=False,
+                  bint date_as_object=False,
+                  bint use_threads=True,
+                  bint deduplicate_objects=True,
+                  bint ignore_metadata=False):
+        """
+        Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
+
+        Parameters
+        ----------
+        strings_to_categorical : boolean, default False
+            Encode string (UTF8) and binary types to pandas.Categorical
+        categories : list, default empty
+            List of fields that should be returned as pandas.Categorical. Only
+            applies to table-like data structures
+        zero_copy_only : boolean, default False
+            Raise an ArrowException if this function call would require copying
+            the underlying data
+        integer_object_nulls : boolean, default False
+            Cast integers with nulls to objects
+        date_as_object : boolean, default False
+            Cast dates to objects
+        use_threads : boolean, default True
+            Whether to parallelize the conversion using multiple threads
+        deduplicate_objects : boolean, default True
+            Do not create multiple copies of equal Python objects, to save
+            on memory use. Conversion will be slower when most values are
+            unique
+        ignore_metadata : boolean, default False
+            If True, do not use the 'pandas' metadata to reconstruct the
+            DataFrame index, if present
+
+        Returns
+        -------
+        NumPy array or DataFrame depending on type of object
+        """
+        cdef:
+            PyObject* out
+            PandasOptions options
+
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls,
+            date_as_object=date_as_object,
+            use_threads=use_threads,
+            deduplicate_objects=deduplicate_objects)
+
+        return self._to_pandas(options, categories=categories,
+                               ignore_metadata=ignore_metadata)
+
+
+cdef class Array(_PandasConvertible):
 
     def __init__(self):
         raise TypeError("Do not call {}'s constructor directly, use one of "
@@ -602,42 +656,13 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, bint strings_to_categorical=False,
-                  bint zero_copy_only=False, bint integer_object_nulls=False,
-                  bint date_as_object=False):
-        """
-        Convert to a NumPy array object suitable for use in pandas.
-
-        Parameters
-        ----------
-        strings_to_categorical : boolean, default False
-            Encode string (UTF8) and binary types to pandas.Categorical
-        zero_copy_only : boolean, default False
-            Raise an ArrowException if this function call would require copying
-            the underlying data
-        integer_object_nulls : boolean, default False
-            Cast integers with nulls to objects
-        date_as_object : boolean, default False
-            Cast dates to objects
-
-        See also
-        --------
-        Column.to_pandas
-        Table.to_pandas
-        RecordBatch.to_pandas
-        """
+    def _to_pandas(self, options, **kwargs):
         cdef:
             PyObject* out
-            PandasOptions options
+            PandasOptions c_options = options
 
-        options = PandasOptions(
-            strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only,
-            integer_object_nulls=integer_object_nulls,
-            date_as_object=date_as_object,
-            use_threads=False)
         with nogil:
-            check_status(ConvertArrayToPandas(options, self.sp_array,
+            check_status(ConvertArrayToPandas(c_options, self.sp_array,
                                               self, &out))
         return wrap_array_output(out)
```
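The new `_PandasConvertible` base centralizes option handling: the public `to_pandas` normalizes keyword arguments once, then defers to a per-class `_to_pandas` hook. The dispatch pattern, reduced to a plain-Python sketch with hypothetical names (options shown as a dict rather than the Cython `PandasOptions` struct):

```python
class PandasConvertibleSketch:
    # Public entry point: normalize keyword arguments into one options
    # bundle, then delegate to the subclass-specific conversion hook
    def to_pandas(self, use_threads=True, deduplicate_objects=True, **kwargs):
        options = {
            'use_threads': use_threads,
            'deduplicate_objects': deduplicate_objects,
        }
        return self._to_pandas(options, **kwargs)

class ArraySketch(PandasConvertibleSketch):
    def _to_pandas(self, options, **kwargs):
        return ('array', options)

class TableSketch(PandasConvertibleSketch):
    def _to_pandas(self, options, **kwargs):
        return ('table', options)

kind, opts = ArraySketch().to_pandas()
assert kind == 'array'
assert opts['deduplicate_objects'] is True
```

This is why the signature and docstring now live in one place: Array, Column, RecordBatch, and Table all inherit the same `to_pandas` and differ only in their private hook.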

python/pyarrow/compat.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -192,11 +192,15 @@ def _iterate_python_module_paths(package_name):
     for finder in sys.meta_path:
         try:
             spec = finder.find_spec(absolute_name, None)
-        except AttributeError:
+        except (AttributeError, TypeError):
             # On Travis (Python 3.5) the above produced:
             # AttributeError: 'VendorImporter' object has no
             # attribute 'find_spec'
+            #
+            # ARROW-4117: When running "asv dev", TypeError is raised
+            # due to the meta-importer
             spec = None
+
         if spec is not None:
             break
```
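The `compat.py` fix widens the exception guard because some meta-path finders either lack `find_spec` entirely or reject this calling convention. A standalone sketch of the defensive loop (`find_spec_defensively` is a hypothetical helper name):

```python
import sys

def find_spec_defensively(absolute_name):
    """Probe each meta-path finder, tolerating finders that lack find_spec
    (AttributeError) or reject the call signature (TypeError)."""
    for finder in sys.meta_path:
        try:
            spec = finder.find_spec(absolute_name, None)
        except (AttributeError, TypeError):
            spec = None
        if spec is not None:
            return spec
    return None

assert find_spec_defensively('json') is not None
assert find_spec_defensively('definitely_not_a_module_xyz') is None
```

Catching only `AttributeError` had been enough for the Travis case, but third-party meta-importers (as seen under `asv dev`) can raise `TypeError` instead, so both are swallowed and the loop moves on to the next finder.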
