Skip to content

Commit ce092a6

Browse files
committed
CLN: make submodules of pandas.util private
xref pandas-dev#13634
1 parent 1002cc3 commit ce092a6

28 files changed

+345
-304
lines changed

doc/source/whatsnew/v0.20.0.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -1238,10 +1238,10 @@ If indicated, a deprecation warning will be issued if you reference theses modul
12381238
"pandas.types", "pandas.core.dtypes", ""
12391239
"pandas.io.sas.saslib", "pandas.io.sas.libsas", ""
12401240
"pandas._join", "pandas._libs.join", ""
1241-
"pandas._hash", "pandas.util.libhashing", ""
1241+
"pandas._hash", "pandas.util._hashing", ""
12421242
"pandas._period", "pandas._libs.period", ""
12431243
"pandas._sparse", "pandas.core.sparse.libsparse", ""
1244-
"pandas._testing", "pandas.util.libtesting", ""
1244+
"pandas._testing", "pandas.util._testing", ""
12451245
"pandas._window", "pandas.core.libwindow", ""
12461246

12471247

File renamed without changes.

pandas/core/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from pandas.util.decorators import (Appender, cache_readonly,
3838
deprecate_kwarg, Substitution)
3939

40-
from pandas.util.terminal import get_terminal_size
40+
from pandas.io.formats.terminal import get_terminal_size
4141
from pandas.util.validators import validate_bool_kwarg
4242
from pandas.core.config import get_option
4343

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,7 @@ def to_clipboard(self, excel=None, sep=None, **kwargs):
13821382
- Windows: none
13831383
- OS X: none
13841384
"""
1385-
from pandas.io import clipboard
1385+
from pandas.io.clipboard import clipboard
13861386
clipboard.to_clipboard(self, excel=excel, sep=sep, **kwargs)
13871387

13881388
def to_xarray(self):

pandas/core/indexes/multi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ def _inferred_type_levels(self):
718718
@cache_readonly
719719
def _hashed_values(self):
720720
""" return a uint64 ndarray of my hashed values """
721-
from pandas.util.hashing import hash_tuples
721+
from pandas.core.util.hashing import hash_tuples
722722
return hash_tuples(self)
723723

724724
def _hashed_indexing_key(self, key):
@@ -740,7 +740,7 @@ def _hashed_indexing_key(self, key):
740740
we need to stringify if we have mixed levels
741741
742742
"""
743-
from pandas.util.hashing import hash_tuples
743+
from pandas.core.util.hashing import hash_tuples
744744

745745
if not isinstance(key, tuple):
746746
return hash_tuples(key)

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
from pandas.core.indexes.timedeltas import TimedeltaIndex
6161
from pandas.core.indexes.period import PeriodIndex
6262
from pandas import compat
63-
from pandas.util.terminal import get_terminal_size
63+
from pandas.io.formats.terminal import get_terminal_size
6464
from pandas.compat import zip, u, OrderedDict, StringIO
6565
from pandas.compat.numpy import function as nv
6666

File renamed without changes.

pandas/core/util/hashing.py

+282
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
"""
2+
data hash pandas / numpy objects
3+
"""
4+
import itertools
5+
6+
import numpy as np
7+
from pandas._libs import hashing
8+
from pandas._libs.lib import is_bool_array
9+
from pandas.core.dtypes.generic import (
10+
ABCMultiIndex,
11+
ABCIndexClass,
12+
ABCSeries,
13+
ABCDataFrame)
14+
from pandas.core.dtypes.common import (
15+
is_categorical_dtype, is_numeric_dtype,
16+
is_datetime64_dtype, is_timedelta64_dtype,
17+
is_list_like)
18+
19+
# 16 byte long hashing key
20+
_default_hash_key = '0123456789123456'
21+
22+
23+
def _combine_hash_arrays(arrays, num_items):
    """
    Fold an iterator of uint64 hash arrays into a single uint64 array.

    Parameters
    ----------
    arrays : generator
        Yields the per-item hash arrays to be combined, in order.
    num_items : int
        Number of arrays that ``arrays`` will yield.

    Returns
    -------
    ndarray of uint64; empty if ``arrays`` yields nothing.

    Should be the same as CPython's tupleobject.c
    """
    exhausted = object()
    head = next(arrays, exhausted)
    if head is exhausted:
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)
    # put the peeked element back in front of the remaining ones
    for position, chunk in enumerate(itertools.chain([head], arrays)):
        remaining = num_items - position
        combined ^= chunk
        combined *= multiplier
        multiplier += np.uint64(82520 + 2 * remaining)
    assert position + 1 == num_items, 'Fed in wrong num_items'
    combined += np.uint64(97531)
    return combined
49+
50+
51+
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    obj : Index, MultiIndex, Series, or DataFrame
        the object to hash
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        if ``obj`` is none of the supported types

    """
    from pandas import Series
    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        # a MultiIndex is hashed tuple-wise; the result is not indexed by obj
        return Series(hash_tuples(obj, encoding, hash_key),
                      dtype='uint64', copy=False)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64', copy=False)
        h = Series(h, index=obj, dtype='uint64', copy=False)
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64', copy=False)
        if index:
            # one-shot generator: the index hash is computed only when the
            # combiner consumes it
            index_iter = (hash_pandas_object(obj.index,
                                             index=False,
                                             encoding=encoding,
                                             hash_key=hash_key,
                                             categorize=categorize).values
                          for _ in [None])
            arrays = itertools.chain([h], index_iter)
            h = _combine_hash_arrays(arrays, 2)

        h = Series(h, index=obj.index, dtype='uint64', copy=False)

    elif isinstance(obj, ABCDataFrame):
        # forward the hashing options to every column, exactly as the
        # Index and Series branches do; previously they were dropped here
        hashes = (hash_array(series.values, encoding, hash_key, categorize)
                  for _, series in obj.iteritems())
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (hash_pandas_object(obj.index,
                                                       index=False,
                                                       encoding=encoding,
                                                       hash_key=hash_key,
                                                       categorize=categorize).values  # noqa
                                    for _ in [None])
            num_items += 1
            hashes = itertools.chain(hashes, index_hash_generator)
        h = _combine_hash_arrays(hashes, num_items)

        h = Series(h, index=obj.index, dtype='uint64', copy=False)
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
121+
122+
123+
def hash_tuples(vals, encoding='utf8', hash_key=None):
    """
    Hash a MultiIndex / list-of-tuples efficiently

    .. versionadded:: 0.20.0

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    ndarray of hashed values array; a single hashed value if ``vals``
    was a single tuple

    Raises
    ------
    TypeError
        if ``vals`` is neither a tuple nor list-like
    """
    single = isinstance(vals, tuple)
    if single:
        # treat a lone tuple as a one-row list-of-tuples
        vals = [vals]
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import Categorical, MultiIndex

    if not isinstance(vals, ABCMultiIndex):
        vals = MultiIndex.from_tuples(vals)

    # one Categorical per level, built from that level's labels and values
    level_cats = []
    for level in range(vals.nlevels):
        level_cats.append(Categorical(vals.labels[level],
                                      vals.levels[level],
                                      ordered=False,
                                      fastpath=True))

    # hash each level lazily, then fold the per-level hashes together
    per_level = (_hash_categorical(cat,
                                   encoding=encoding,
                                   hash_key=hash_key)
                 for cat in level_cats)
    combined = _combine_hash_arrays(per_level, len(level_cats))

    return combined[0] if single else combined
169+
170+
171+
def _hash_categorical(c, encoding, hash_key):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes

    Parameters
    ----------
    c : Categorical
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    ndarray of hashed values array, same size as len(c)
    """
    category_hashes = hash_array(c.categories.values, encoding, hash_key,
                                 categorize=False)

    # we have uint64, as we don't directly support missing values
    # we don't want to use take_nd which will coerce to float
    # instead, directly construct the result with a
    # max(np.uint64) as the missing value indicator
    #
    # TODO: GH 15362

    missing = c.isnull()
    if len(category_hashes) == 0:
        # no categories to look codes up in
        result = np.zeros(len(missing), dtype='uint64')
    else:
        result = category_hashes.take(c.codes)

    if missing.any():
        result[missing] = np.iinfo(np.uint64).max

    return result
206+
207+
208+
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    Raises
    ------
    TypeError
        if ``vals`` has no ``dtype`` attribute

    """
    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    key = _default_hash_key if hash_key is None else hash_key
    dtype = vals.dtype

    # Categoricals: hash the categories, then remap the codes to the hash
    # values. This must come before the complex check, because numpy chokes
    # when asked whether a categorical dtype is a subdtype of complex.
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, key)

    # everything below works on 64-bit values, so split the 128-bit
    # complex case into its two halves up front
    if np.issubdtype(dtype, np.complex128):
        real_hash = hash_array(vals.real)
        imag_hash = hash_array(vals.imag)
        return real_hash + 23 * imag_hash

    # Turn whatever array this is into unsigned 64-bit ints, if we can
    if is_bool_array(vals):
        bits = vals.astype('u8')
    elif is_datetime64_dtype(vals) or is_timedelta64_dtype(vals):
        bits = vals.view('i8').astype('u8', copy=False)
    elif is_numeric_dtype(vals) and dtype.itemsize <= 8:
        bits = vals.view('u{}'.format(dtype.itemsize)).astype('u8')
    elif categorize:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. Callers may skip this when the
        # values are known/likely to be unique.
        from pandas import factorize, Categorical, Index
        codes, categories = factorize(vals, sort=False)
        cat = Categorical(codes, Index(categories),
                          ordered=False, fastpath=True)
        return _hash_categorical(cat, encoding, key)
    else:
        try:
            bits = hashing.hash_object_array(vals, key, encoding)
        except TypeError:
            # we have mixed types
            bits = hashing.hash_object_array(
                vals.astype(str).astype(object), key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    bits ^= bits >> 30
    bits *= np.uint64(0xbf58476d1ce4e5b9)
    bits ^= bits >> 27
    bits *= np.uint64(0x94d049bb133111eb)
    bits ^= bits >> 31
    return bits

pandas/io/api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# flake8: noqa
66

77
from pandas.io.parsers import read_csv, read_table, read_fwf
8-
from pandas.io.clipboard import read_clipboard
8+
from pandas.io.clipboard.clipboard import read_clipboard
99
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
1010
from pandas.io.pytables import HDFStore, get_store, read_hdf
1111
from pandas.io.json import read_json
File renamed without changes.

pandas/io/clipboard.py renamed to pandas/io/clipboard/clipboard.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover
2626
raise NotImplementedError(
2727
'reading from clipboard only supports utf-8 encoding')
2828

29-
from pandas.util.clipboard import clipboard_get
29+
from pandas.io.clipboard import clipboard_get
3030
from pandas.io.parsers import read_table
3131
text = clipboard_get()
3232

@@ -92,7 +92,7 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover
9292
if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
9393
raise ValueError('clipboard only supports utf-8 encoding')
9494

95-
from pandas.util.clipboard import clipboard_set
95+
from pandas.io.clipboard import clipboard_set
9696
if excel is None:
9797
excel = True
9898

File renamed without changes.

pandas/io/formats/console.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import sys
66
import locale
7-
from pandas.util.terminal import get_terminal_size
7+
from pandas.io.formats.terminal import get_terminal_size
88

99
# -----------------------------------------------------------------------------
1010
# Global formatting options

pandas/io/formats/format.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from pandas import compat
3131
from pandas.compat import (StringIO, lzip, range, map, zip, u,
3232
OrderedDict, unichr)
33-
from pandas.util.terminal import get_terminal_size
33+
from pandas.io.formats.terminal import get_terminal_size
3434
from pandas.core.config import get_option, set_option
3535
from pandas.io.common import _get_handle, UnicodeWriter, _expand_user
3636
from pandas.io.formats.printing import adjoin, justify, pprint_thing
File renamed without changes.

pandas/tests/io/formats/test_format.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
import pandas.io.formats.printing as printing
3030

3131
import pandas.util.testing as tm
32-
from pandas.util.terminal import get_terminal_size
32+
from pandas.io.formats.terminal import get_terminal_size
3333
from pandas.core.config import (set_option, get_option, option_context,
3434
reset_option)
3535

0 commit comments

Comments
 (0)