Skip to content

Commit 58f682d

Browse files
committed
memory optimization
1 parent 0c13df7 commit 58f682d

File tree

2 files changed

+29
-7
lines changed

2 files changed

+29
-7
lines changed

pandas/tools/hashing.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,30 @@
99
from pandas.lib import is_bool_array
1010
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
1111
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
12-
is_datetime64_dtype, is_timedelta64_dtype)
12+
is_datetime64_dtype, is_timedelta64_dtype,
13+
is_list_like)
1314

1415
# 16 byte long hashing key
1516
_default_hash_key = '0123456789123456'
1617

1718

1819
def _combine_hash_arrays(arrays, num_items):
19-
"Should be the same as CPython's tupleobject.c"
20-
first = next(arrays)
20+
"""
21+
Parameters
22+
----------
23+
arrays : generator
24+
num_items : int
25+
26+
Should be the same as CPython's tupleobject.c
27+
"""
28+
try:
29+
first = next(arrays)
30+
except StopIteration:
31+
return np.array([], dtype=np.uint64)
32+
2133
arrays = itertools.chain([first], arrays)
2234

23-
mult = np.zeros_like(first) + np.uint64(1000003)
35+
mult = np.uint64(1000003)
2436
out = np.zeros_like(first) + np.uint64(0x345678)
2537
for i, a in enumerate(arrays):
2638
inverse_i = num_items - i
@@ -135,11 +147,11 @@ def _hash_lists(vals, encoding='utf8', hash_key=None):
135147

136148
def hash_tuples(vals, encoding='utf8', hash_key=None):
137149
"""
138-
Hash an MultiIndex / array_of_tuples efficiently
150+
Hash a MultiIndex / list-of-tuples efficiently
139151
140152
Parameters
141153
----------
142-
vals : MultiIndex, ndarray of tuples, or single tuple
154+
vals : MultiIndex, list-of-tuples, or single tuple
143155
encoding : string, default 'utf8'
144156
hash_key : string key to encode, defaults to _default_hash_key
145157
@@ -152,6 +164,8 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
152164
if isinstance(vals, tuple):
153165
vals = [vals]
154166
is_tuple = True
167+
elif not is_list_like(vals):
168+
raise TypeError("must be convertible to a list-of-tuples")
155169

156170
if not isinstance(vals, MultiIndex):
157171
vals = MultiIndex.from_tuples(vals)

pandas/tools/tests/test_hashing.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def check_not_equal_with_index(self, obj):
5353
if not isinstance(obj, Index):
5454
a = hash_pandas_object(obj, index=True)
5555
b = hash_pandas_object(obj, index=False)
56-
self.assertFalse((a == b).all())
56+
if len(obj):
57+
self.assertFalse((a == b).all())
5758

5859
def test_hash_tuples(self):
5960
tups = [(1, 'one'), (1, 'two'), (2, 'one')]
@@ -64,6 +65,11 @@ def test_hash_tuples(self):
6465
result = hash_tuples(tups[0])
6566
self.assertEqual(result, expected[0])
6667

68+
def test_hash_tuples_err(self):
69+
70+
for val in [5, 'foo', pd.Timestamp('20130101')]:
71+
self.assertRaises(TypeError, hash_tuples, val)
72+
6773
def test_multiindex_unique(self):
6874
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
6975
(51, 204), (102, 51)])
@@ -81,9 +87,11 @@ def test_hash_pandas_object(self):
8187
Series(['a', np.nan, 'c']),
8288
Series(['a', None, 'c']),
8389
Series([True, False, True]),
90+
Series(),
8491
Index([1, 2, 3]),
8592
Index([True, False, True]),
8693
DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
94+
DataFrame(),
8795
tm.makeMissingDataframe(),
8896
tm.makeMixedDataFrame(),
8997
tm.makeTimeDataFrame(),

0 commit comments

Comments
 (0)