Merge commit 'v0.12.0rc1-112-gb79996c' into debian

yarikoptic · yarikoptic · commit 5ffb6c094ada · 2013-07-18T13:09:46.000-04:00
* commit 'v0.12.0rc1-112-gb79996c': TST: ujson dont force endianness pandas-dev#4274; BUG: Fixed non-unique indexing memory allocation issue with .ix/.loc (GH4280); BUG: fix data.py regression; DOC: cookbook example; TST/CI: remove html5lib from 3.2 build; TST: properly skip html5lib
diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt
@@ -2,7 +2,6 @@ python-dateutil==2.1
 pytz==2013b
 openpyxl==1.6.2
 xlrd==0.9.2
-html5lib==1.0b2
 numpy==1.6.2
 cython==0.19.1
 numexpr==2.1
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
@@ -149,6 +149,9 @@ The :ref:`grouping <groupby>` docs.
 `Create multiple aggregated columns
 <http://stackoverflow.com/questions/14897100/create-multiple-columns-in-pandas-aggregation-function>`__
 
+`Create a value counts column and reassign back to the DataFrame
+<http://stackoverflow.com/questions/17709270/i-want-to-create-a-column-of-value-counts-in-my-pandas-dataframe>`__
+
 Expanding Data
 ~~~~~~~~~~~~~~
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -235,7 +235,8 @@ pandas 0.12
       names (:issue:`3873`)
     - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
       ``reindex`` for location-based taking
-    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
+    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
+    - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
 
   - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
   - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)
@@ -342,6 +343,8 @@ pandas 0.12
   - Fixed bug in initializing ``DatetimeIndex`` with an array of strings
     in a certain time zone (:issue:`4229`)
   - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
+  - Fixed bug where ``get_data_famafrench`` wasn't using the correct file edges
+    (:issue:`4281`)
 
 pandas 0.11.0
 =============
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
@@ -437,7 +437,8 @@ Bug Fixes
       names (:issue:`3873`)
     - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
       ``reindex`` for location-based taking
-    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
+    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
+    - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
 
   - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
   - ``read_html`` now correctly skips tests (:issue:`3741`)
@@ -475,6 +476,8 @@ Bug Fixes
   - Fixed bug in initializing ``DatetimeIndex`` with an array of strings
     in a certain time zone (:issue:`4229`)
   - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
+  - Fixed bug where ``get_data_famafrench`` wasn't using the correct file edges
+    (:issue:`4281`)
 
 See the :ref:`full release notes
 <release>` or issue tracker
diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -278,14 +278,19 @@ cdef class IndexEngine:
             dict d = {}
             object val
             int count = 0, count_missing = 0
-            Py_ssize_t i, j, n, n_t
+            Py_ssize_t i, j, n, n_t, n_alloc
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
         stargets = set(targets)
         n = len(values)
         n_t = len(targets)
-        result  = np.empty(n*n_t, dtype=np.int64)
+        if n > 10000:
+            n_alloc = 10000
+        else:
+            n_alloc = n
+
+        result  = np.empty(n_alloc, dtype=np.int64)
         missing = np.empty(n_t, dtype=np.int64)
 
         # form the set of the results (like ismember)
@@ -304,12 +309,21 @@ cdef class IndexEngine:
             # found
             if val in d:
                 for j in d[val]:
+
+                   # realloc if needed
+                   if count >= n_alloc:
+                      n_alloc += 10000
+                      result = np.resize(result, n_alloc)
+
                    result[count] = j
                    count += 1
 
             # value not found
             else:
 
+                if count >= n_alloc:
+                     n_alloc += 10000
+                     result = np.resize(result, n_alloc)
                 result[count] = -1
                 count += 1
                 missing[count_missing] = i
diff --git a/pandas/io/data.py b/pandas/io/data.py
@@ -453,8 +453,8 @@ def get_data_fred(name, start=dt.datetime(2010, 1, 1),
 def get_data_famafrench(name):
     # path of zip files
     zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
-                    'ken.french/ftp/')
-    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)
+                    'ken.french/ftp')
+    zip_file_path = '{0}/{1}.zip'.format(zip_file_url, name)
 
     with urlopen(zip_file_path) as url:
         raw = url.read()
@@ -463,13 +463,13 @@ def get_data_famafrench(name):
         tmpf.write(raw)
 
         with ZipFile(tmpf, 'r') as zf:
-            data = zf.read(name + '.txt').splitlines()
+            data = zf.open(name + '.txt').readlines()
 
     line_lengths = np.array(map(len, data))
-    file_edges = np.where(line_lengths)[0]
+    file_edges = np.where(line_lengths == 2)[0]
 
     datasets = {}
-    edges = itertools.izip(file_edges[:-1], file_edges[1:])
+    edges = itertools.izip(file_edges + 1, file_edges[1:])
     for i, (left_edge, right_edge) in enumerate(edges):
         dataset = [d.split() for d in data[left_edge:right_edge]]
         if len(dataset) > 10:
@@ -479,14 +479,15 @@ def get_data_famafrench(name):
             header = dataset[header_index]
             ds_header = dataset[header_index + 1:]
             # to ensure the header is unique
-            header = ['{0} {1}'.format(*items) for items in enumerate(header,
-                                                                      start=1)]
-            index = np.fromiter((d[0] for d in ds_header), dtype=int)
-            dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
+            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
+                                                                     start=1)]
+            index = np.array([d[0] for d in ds_header], dtype=int)
+            dataset = np.array([d[1:] for d in ds_header], dtype=float)
             datasets[i] = DataFrame(dataset, index, columns=header)
 
     return datasets
 
+
 # Items needed for options class
 CUR_MONTH = dt.datetime.now().month
 CUR_YEAR = dt.datetime.now().year
diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
@@ -10,7 +10,7 @@
 from pandas.io import data as web
 from pandas.io.data import DataReader, SymbolWarning
 from pandas.util.testing import (assert_series_equal, assert_produces_warning,
-                                 assert_frame_equal, network)
+                                 network)
 from numpy.testing import assert_array_equal
 
 
@@ -343,6 +343,7 @@ def test_read_famafrench(self):
                      "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
                      "F-F_ST_Reversal_Factor"):
             ff = DataReader(name, "famafrench")
+            assert ff
             assert isinstance(ff, dict)
 
 
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
@@ -50,7 +50,7 @@ def _skip_if_none_of(module_names):
     else:
         not_found = [module_name for module_name in module_names if not
                      _have_module(module_name)]
-        if not_found == module_names:
+        if set(not_found) & set(module_names):
             raise nose.SkipTest("{0} not found".format(not_found))
         if 'bs4' in module_names:
             import bs4
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
@@ -145,7 +145,7 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
         _check_all_orients(DataFrame(biggie, dtype=np.float64),
                            dtype=np.float64, convert_axes=False)
         _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False)
-        _check_all_orients(DataFrame(biggie, dtype='<U3'), dtype='<U3', convert_axes=False,
+        _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', convert_axes=False,
                            raise_ok=ValueError)
 
         # empty
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -1102,6 +1102,40 @@ def test_mi_access(self):
         result = df2['A']['B2']
         assert_frame_equal(result,expected)
 
+    def test_non_unique_loc_memory_error(self):
+
+        # GH 4280
+        # a non-unique index with a large selection used to trigger a MemoryError
+
+        columns = list('ABCDEFG')
+        def gen_test(l,l2):
+            return pd.concat([ DataFrame(randn(l,len(columns)),index=range(l),columns=columns),
+                               DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ])
+
+
+        def gen_expected(df,mask):
+            l = len(mask)
+            return pd.concat([
+                df.take([0],convert=False),
+                DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns),
+                df.take(mask[1:],convert=False) ])
+
+        df = gen_test(900,100)
+        self.assert_(not df.index.is_unique)
+
+        mask = np.arange(100)
+        result = df.loc[mask]
+        expected = gen_expected(df,mask)
+        assert_frame_equal(result,expected)
+
+        df = gen_test(900000,100000)
+        self.assert_(not df.index.is_unique)
+
+        mask = np.arange(100000)
+        result = df.loc[mask]
+        expected = gen_expected(df,mask)
+        assert_frame_equal(result,expected)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],