From 712cb6d66c6a5c7f1a5e93d6b751c9161888de79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 11 Mar 2015 17:56:58 +0100 Subject: [PATCH 1/7] fixed header=list (to create a MultiIndex) for read_excel --- pandas/io/excel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index cab342dc339f4..5a2255dc8d41d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -419,7 +419,11 @@ def _parse_cell(cell_contents,cell_typ): data.append(row) if header is not None: - data[header] = _trim_excel_header(data[header]) + if isinstance(header, (list, tuple, np.ndarray)): + for rownum in header: + data[rownum] = _trim_excel_header(data[rownum]) + else: + data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, From 97a104cd47e4a4a57b6877964c58c4d9e8c1d6b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 10:40:26 +0200 Subject: [PATCH 2/7] remove unused variables --- pandas/hashtable.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 8bdcfb44242ff..d6a396f988ae7 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -211,7 +211,6 @@ cdef class StringHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val char *buf @@ -225,7 +224,6 @@ cdef class StringHashTable(HashTable): if k == self.table.n_buckets: k = kh_put_str(self.table, buf, &ret) # print 'putting %s, %s' % (val, count) - count += 1 uniques.append(val) # return None @@ -319,7 +317,6 @@ cdef class Int32HashTable(HashTable): def lookup(self, ndarray[int32_t] values): cdef: Py_ssize_t i, n = len(values) - int ret = 0 int32_t val khiter_t k ndarray[int32_t] locs = np.empty(n, dtype=np.int64) @@ -518,7 +515,6 @@ cdef class Int64HashTable: #(HashTable): def unique(self, ndarray[int64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 ndarray result int64_t val @@ -531,7 +527,6 @@ cdef class Int64HashTable: #(HashTable): if k == self.table.n_buckets: k = kh_put_int64(self.table, val, &ret) uniques.append(val) - count += 1 result = uniques.to_array() @@ -644,7 +639,6 @@ cdef class Float64HashTable(HashTable): def unique(self, ndarray[float64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 float64_t val khiter_t k @@ -659,7 +653,6 @@ cdef class Float64HashTable(HashTable): if k == self.table.n_buckets: k = kh_put_float64(self.table, val, &ret) uniques.append(val) - count += 1 elif not seen_na: seen_na = 1 uniques.append(ONAN) @@ -786,7 +779,6 @@ cdef class PyObjectHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val ndarray result @@ -938,7 +930,6 @@ cpdef value_count_int64(ndarray[int64_t] values): cdef: Py_ssize_t i kh_int64_t *table - int ret = 0 int k table = kh_init_int64() @@ -1008,9 +999,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): int count, max_count = 2 int j = -1 # so you can do += int k - Py_ssize_t i, n = len(values) kh_pymap_t *table - int ret = 0 table = kh_init_pymap() build_count_table_object(values, mask, table) @@ -1040,7 +1029,6 @@ def mode_int64(ndarray[int64_t] values): int j = -1 # so you can do += int k kh_int64_t *table - list uniques = [] table = kh_init_int64() From 6e06e5dadff0847d148ab1c5c0d2b398f7b3d818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 10:57:42 +0200 Subject: [PATCH 3/7] do not store result of kh_put* when it is not used --- pandas/hashtable.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index d6a396f988ae7..fc2dd65409b19 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -222,7 +222,7 @@ cdef class StringHashTable(HashTable): buf = util.get_c_string(val) k = kh_get_str(self.table, buf) if k == self.table.n_buckets: - k = kh_put_str(self.table, buf, &ret) + kh_put_str(self.table, buf, &ret) # print 'putting %s, %s' % (val, count) uniques.append(val) @@ -525,7 +525,7 @@ cdef class Int64HashTable: #(HashTable): val = values[i] k = kh_get_int64(self.table, val) if k == self.table.n_buckets: - k = kh_put_int64(self.table, val, &ret) + kh_put_int64(self.table, val, &ret) uniques.append(val) result = uniques.to_array() @@ -651,7 +651,7 @@ cdef class Float64HashTable(HashTable): if val == val: k = kh_get_float64(self.table, val) if k == self.table.n_buckets: - k = kh_put_float64(self.table, val, &ret) + kh_put_float64(self.table, val, &ret) uniques.append(val) elif not seen_na: seen_na = 1 @@ -792,7 +792,7 @@ cdef class PyObjectHashTable(HashTable): if not _checknan(val): k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: - k = kh_put_pymap(self.table, val, &ret) + kh_put_pymap(self.table, val, &ret) uniques.append(val) elif not seen_na: seen_na = 1 From c71933baeb0396f75aaec906e638e726b327b109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 10:58:07 +0200 Subject: [PATCH 4/7] remove debug (commented) code --- pandas/hashtable.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index fc2dd65409b19..39268237aa618 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -223,10 +223,8 @@ cdef class StringHashTable(HashTable): k = kh_get_str(self.table, buf) if k == self.table.n_buckets: kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) uniques.append(val) - # return None return uniques.to_array() def factorize(self, ndarray[object] values): @@ -256,7 +254,6 @@ cdef class StringHashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int32HashTable(HashTable): @@ -354,7 +351,6 @@ cdef class Int32HashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int64HashTable: #(HashTable): From 8d2df65cd802d9e11e62059286bd375d46aaed36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 10:58:41 +0200 Subject: [PATCH 5/7] use khiter_t instead of int --- pandas/hashtable.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 39268237aa618..2890dc73165dd 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -906,7 +906,7 @@ cdef class Int64Factorizer: cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 @@ -948,7 +948,7 @@ cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 From 739b4ba99d9bbf2547cbd55f67258ef8cfd26db1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 11:00:08 +0200 Subject: [PATCH 6/7] rename val & max_val to count & max_count it makes more sense and it is more consistent with mode_object --- pandas/hashtable.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 2890dc73165dd..d89722e41d56c 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1021,7 +1021,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): def mode_int64(ndarray[int64_t] values): cdef: - int val, max_val = 2 + int count, max_count = 2 int j = -1 # so you can do += int k kh_int64_t *table @@ -1033,12 +1033,12 @@ def mode_int64(ndarray[int64_t] values): modes = np.empty(table.n_buckets, dtype=np.int64) for k in range(table.n_buckets): if kh_exist_int64(table, k): - val = table.vals[k] + count = table.vals[k] - if val == max_val: + if count == max_count: j += 1 - elif val > max_val: - max_val = val + elif count > max_count: + max_count = count j = 0 else: continue From 5b28597f0d361d0325822350ab9e4940ee2b85fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 4 May 2015 11:00:51 +0200 Subject: [PATCH 7/7] do not needlessly initialize a variable --- pandas/hashtable.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index d89722e41d56c..c4cd788216018 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -970,7 +970,7 @@ cdef build_count_table_object(ndarray[object] values, cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: - Py_ssize_t i = len(values) + Py_ssize_t i kh_pymap_t *table int k