Skip to content

Commit 5ffb6c0

Browse files
committed
Merge commit 'v0.12.0rc1-112-gb79996c' into debian
* commit 'v0.12.0rc1-112-gb79996c':
  TST: ujson dont force endianness pandas-dev#4274
  BUG: Fixed non-unique indexing memory allocation issue with .ix/.loc (GH4280)
  BUG: fix data.py regression
  DOC: cookbook example
  TST/CI: remove html5lib from 3.2 build
  TST: properly skip html5lib
2 parents 9121871 + b79996c commit 5ffb6c0

File tree

10 files changed

+75
-17
lines changed

10 files changed

+75
-17
lines changed

ci/requirements-3.2.txt

-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ python-dateutil==2.1
22
pytz==2013b
33
openpyxl==1.6.2
44
xlrd==0.9.2
5-
html5lib==1.0b2
65
numpy==1.6.2
76
cython==0.19.1
87
numexpr==2.1

doc/source/cookbook.rst

+3
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,9 @@ The :ref:`grouping <groupby>` docs.
149149
`Create multiple aggregated columns
150150
<http://stackoverflow.com/questions/14897100/create-multiple-columns-in-pandas-aggregation-function>`__
151151

152+
`Create a value counts column and reassign back to the DataFrame
153+
<http://stackoverflow.com/questions/17709270/i-want-to-create-a-column-of-value-counts-in-my-pandas-dataframe>`__
154+
152155
Expanding Data
153156
~~~~~~~~~~~~~~
154157

doc/source/release.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,8 @@ pandas 0.12
235235
names (:issue:`3873`)
236236
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
237237
``reindex`` for location-based taking
238-
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
238+
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
239+
- Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
239240

240241
- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
241242
- Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)
@@ -342,6 +343,8 @@ pandas 0.12
342343
- Fixed bug in initializing ``DatetimeIndex`` with an array of strings
343344
in a certain time zone (:issue:`4229`)
344345
- Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
346+
- Fixed bug where get_data_famafrench wasn't using the correct file edges
347+
(:issue:`4281`)
345348

346349
pandas 0.11.0
347350
=============

doc/source/v0.12.0.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,8 @@ Bug Fixes
437437
names (:issue:`3873`)
438438
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
439439
``reindex`` for location-based taking
440-
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
440+
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
441+
- Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
441442

442443
- ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
443444
- ``read_html`` now correctly skips tests (:issue:`3741`)
@@ -475,6 +476,8 @@ Bug Fixes
475476
- Fixed bug in initializing ``DatetimeIndex`` with an array of strings
476477
in a certain time zone (:issue:`4229`)
477478
- Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
479+
- Fixed bug where get_data_famafrench wasn't using the correct file edges
480+
(:issue:`4281`)
478481

479482
See the :ref:`full release notes
480483
<release>` or issue tracker

pandas/index.pyx

+16-2
Original file line numberDiff line numberDiff line change
@@ -278,14 +278,19 @@ cdef class IndexEngine:
278278
dict d = {}
279279
object val
280280
int count = 0, count_missing = 0
281-
Py_ssize_t i, j, n, n_t
281+
Py_ssize_t i, j, n, n_t, n_alloc
282282

283283
self._ensure_mapping_populated()
284284
values = self._get_index_values()
285285
stargets = set(targets)
286286
n = len(values)
287287
n_t = len(targets)
288-
result = np.empty(n*n_t, dtype=np.int64)
288+
if n > 10000:
289+
n_alloc = 10000
290+
else:
291+
n_alloc = n
292+
293+
result = np.empty(n_alloc, dtype=np.int64)
289294
missing = np.empty(n_t, dtype=np.int64)
290295

291296
# form the set of the results (like ismember)
@@ -304,12 +309,21 @@ cdef class IndexEngine:
304309
# found
305310
if val in d:
306311
for j in d[val]:
312+
313+
# realloc if needed
314+
if count >= n_alloc:
315+
n_alloc += 10000
316+
result = np.resize(result, n_alloc)
317+
307318
result[count] = j
308319
count += 1
309320

310321
# value not found
311322
else:
312323

324+
if count >= n_alloc:
325+
n_alloc += 10000
326+
result = np.resize(result, n_alloc)
313327
result[count] = -1
314328
count += 1
315329
missing[count_missing] = i

pandas/io/data.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -453,8 +453,8 @@ def get_data_fred(name, start=dt.datetime(2010, 1, 1),
453453
def get_data_famafrench(name):
454454
# path of zip files
455455
zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
456-
'ken.french/ftp/')
457-
zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)
456+
'ken.french/ftp')
457+
zip_file_path = '{0}/{1}.zip'.format(zip_file_url, name)
458458

459459
with urlopen(zip_file_path) as url:
460460
raw = url.read()
@@ -463,13 +463,13 @@ def get_data_famafrench(name):
463463
tmpf.write(raw)
464464

465465
with ZipFile(tmpf, 'r') as zf:
466-
data = zf.read(name + '.txt').splitlines()
466+
data = zf.open(name + '.txt').readlines()
467467

468468
line_lengths = np.array(map(len, data))
469-
file_edges = np.where(line_lengths)[0]
469+
file_edges = np.where(line_lengths == 2)[0]
470470

471471
datasets = {}
472-
edges = itertools.izip(file_edges[:-1], file_edges[1:])
472+
edges = itertools.izip(file_edges + 1, file_edges[1:])
473473
for i, (left_edge, right_edge) in enumerate(edges):
474474
dataset = [d.split() for d in data[left_edge:right_edge]]
475475
if len(dataset) > 10:
@@ -479,14 +479,15 @@ def get_data_famafrench(name):
479479
header = dataset[header_index]
480480
ds_header = dataset[header_index + 1:]
481481
# to ensure the header is unique
482-
header = ['{0} {1}'.format(*items) for items in enumerate(header,
483-
start=1)]
484-
index = np.fromiter((d[0] for d in ds_header), dtype=int)
485-
dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
482+
header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
483+
start=1)]
484+
index = np.array([d[0] for d in ds_header], dtype=int)
485+
dataset = np.array([d[1:] for d in ds_header], dtype=float)
486486
datasets[i] = DataFrame(dataset, index, columns=header)
487487

488488
return datasets
489489

490+
490491
# Items needed for options class
491492
CUR_MONTH = dt.datetime.now().month
492493
CUR_YEAR = dt.datetime.now().year

pandas/io/tests/test_data.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas.io import data as web
1111
from pandas.io.data import DataReader, SymbolWarning
1212
from pandas.util.testing import (assert_series_equal, assert_produces_warning,
13-
assert_frame_equal, network)
13+
network)
1414
from numpy.testing import assert_array_equal
1515

1616

@@ -343,6 +343,7 @@ def test_read_famafrench(self):
343343
"F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
344344
"F-F_ST_Reversal_Factor"):
345345
ff = DataReader(name, "famafrench")
346+
assert ff
346347
assert isinstance(ff, dict)
347348

348349

pandas/io/tests/test_html.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def _skip_if_none_of(module_names):
5050
else:
5151
not_found = [module_name for module_name in module_names if not
5252
_have_module(module_name)]
53-
if not_found == module_names:
53+
if set(not_found) & set(module_names):
5454
raise nose.SkipTest("{0} not found".format(not_found))
5555
if 'bs4' in module_names:
5656
import bs4

pandas/io/tests/test_json/test_pandas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
145145
_check_all_orients(DataFrame(biggie, dtype=np.float64),
146146
dtype=np.float64, convert_axes=False)
147147
_check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False)
148-
_check_all_orients(DataFrame(biggie, dtype='<U3'), dtype='<U3', convert_axes=False,
148+
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', convert_axes=False,
149149
raise_ok=ValueError)
150150

151151
# empty

pandas/tests/test_indexing.py

+34
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,40 @@ def test_mi_access(self):
11021102
result = df2['A']['B2']
11031103
assert_frame_equal(result,expected)
11041104

1105+
def test_non_unique_loc_memory_error(self):
1106+
1107+
# GH 4280
1108+
# non_unique index with a large selection triggers a memory error
1109+
1110+
columns = list('ABCDEFG')
1111+
def gen_test(l,l2):
1112+
return pd.concat([ DataFrame(randn(l,len(columns)),index=range(l),columns=columns),
1113+
DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ])
1114+
1115+
1116+
def gen_expected(df,mask):
1117+
l = len(mask)
1118+
return pd.concat([
1119+
df.take([0],convert=False),
1120+
DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns),
1121+
df.take(mask[1:],convert=False) ])
1122+
1123+
df = gen_test(900,100)
1124+
self.assert_(not df.index.is_unique)
1125+
1126+
mask = np.arange(100)
1127+
result = df.loc[mask]
1128+
expected = gen_expected(df,mask)
1129+
assert_frame_equal(result,expected)
1130+
1131+
df = gen_test(900000,100000)
1132+
self.assert_(not df.index.is_unique)
1133+
1134+
mask = np.arange(100000)
1135+
result = df.loc[mask]
1136+
expected = gen_expected(df,mask)
1137+
assert_frame_equal(result,expected)
1138+
11051139
if __name__ == '__main__':
11061140
import nose
11071141
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)