pandas-dev
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎.travis.yml
+1-1 b/‎.travis.yml
+1-1
diff --git a/‎appveyor.yml
+13-5 b/‎appveyor.yml
+13-5
diff --git a/‎asv_bench/asv.conf.json
+3-3 b/‎asv_bench/asv.conf.json
+3-3
diff --git a/‎asv_bench/benchmarks/algorithms.py
+31 b/‎asv_bench/benchmarks/algorithms.py
+31
diff --git a/‎asv_bench/benchmarks/frame_methods.py
+16 b/‎asv_bench/benchmarks/frame_methods.py
+16
diff --git a/‎asv_bench/benchmarks/index_object.py
+55 b/‎asv_bench/benchmarks/index_object.py
+55
diff --git a/‎asv_bench/benchmarks/indexing.py
-20 b/‎asv_bench/benchmarks/indexing.py
-20
diff --git a/‎asv_bench/benchmarks/inference.py
+27-1 b/‎asv_bench/benchmarks/inference.py
+27-1
diff --git a/‎asv_bench/benchmarks/join_merge.py
+37-16 b/‎asv_bench/benchmarks/join_merge.py
+37-16
diff --git a/‎asv_bench/benchmarks/parser_vb.py
+21 b/‎asv_bench/benchmarks/parser_vb.py
+21
@@ -18,6 +18,7 @@
 .vagrant
 .noseids
 .ipynb_checkpoints
+.tags
 
 # Compiled source #
 ###################
 
@@ -24,7 +24,7 @@ env:
 
 git:
     # for cloning
-    depth: 500
+    depth: 1000
 
 matrix:
     fast_finish: true
 
@@ -16,11 +16,13 @@ environment:
     CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd"
 
   matrix:
-    - PYTHON: "C:\\Python34_64"
-      PYTHON_VERSION: "3.4"
-      PYTHON_ARCH: "64"
-      CONDA_PY: "34"
-      CONDA_NPY: "19"
+
+    # disable python 3.4 ATM
+    #- PYTHON: "C:\\Python34_64"
+    #  PYTHON_VERSION: "3.4"
+    #  PYTHON_ARCH: "64"
+    #  CONDA_PY: "34"
+    #  CONDA_NPY: "19"
 
     - PYTHON: "C:\\Python27_64"
       PYTHON_VERSION: "2.7"
@@ -62,6 +64,12 @@ install:
   # install our build environment
   - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false
   - cmd: conda update -q conda
+
+  # pin the conda-build version
+  # https://github.com/conda/conda-build/issues/1001
+  # Python 3.4 is disabled because Windows complains when compiling
+  # byte code
+  - cmd: conda install conda-build=1.21.7
   - cmd: conda config --set ssl_verify false
 
   # add the pandas channel *before* defaults to have defaults take priority
 
@@ -77,11 +77,11 @@
         // On conda install pytables, otherwise tables
         {"environment_type": "conda", "tables": ""},
         {"environment_type": "conda", "pytables": null},
-        {"environment_type": "virtualenv", "tables": null},
-        {"environment_type": "virtualenv", "pytables": ""},
+        {"environment_type": "(?!conda).*", "tables": null},
+        {"environment_type": "(?!conda).*", "pytables": ""},
         // On conda&win32, install libpython
         {"sys_platform": "(?!win32).*", "libpython": ""},
-        {"sys_platform": "win32", "libpython": null},
+        {"environment_type": "conda", "sys_platform": "win32", "libpython": null},
         {"environment_type": "(?!conda).*", "libpython": ""}
     ],
     "include": [],
 
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+
+
+class algorithm(object):
+    """Benchmarks for ``factorize`` and ``duplicated`` on numeric indexes."""
+    goal_time = 0.2
+
+    def setup(self):
+        """Build the int and float indexes used by the timing methods."""
+        N = 100000
+
+        self.int_unique = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.int_unique.is_unique
+
+        # every value appears five times in the non-unique indexes
+        self.int = pd.Int64Index(np.arange(N).repeat(5))
+        self.float = pd.Float64Index(np.random.randn(N).repeat(5))
+
+    def time_int_factorize(self):
+        self.int.factorize()
+
+    def time_float_factorize(self):
+        # must use the float index; the int index has its own benchmark
+        self.float.factorize()
+
+    def time_int_unique_duplicated(self):
+        self.int_unique.duplicated()
+
+    def time_int_duplicated(self):
+        self.int.duplicated()
+
+    def time_float_duplicated(self):
+        self.float.duplicated()
@@ -1,4 +1,5 @@
 from .pandas_vb_common import *
+import string
 
 
 class frame_apply_axis_1(object):
@@ -606,6 +607,21 @@ def time_frame_isnull(self):
         isnull(self.df)
 
 
+class frame_isnull_strings(object):
+    """Time ``isnull`` on a 1000 x 1000 frame of single-character strings."""
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        # pool to draw from: ASCII letters followed by whitespace characters
+        characters = list(string.ascii_lowercase)
+        characters = characters + list(string.ascii_uppercase)
+        characters = characters + list(string.whitespace)
+        self.sample = np.array(characters)
+        self.data = np.random.choice(self.sample, (1000, 1000))
+        self.df = DataFrame(self.data)
+
+    def time_frame_isnull(self):
+        isnull(self.df)
+
+
 class frame_isnull_obj(object):
     goal_time = 0.2
 
 
@@ -63,6 +63,27 @@ def time_index_datetime_union(self):
         self.rng.union(self.rng2)
 
 
+class index_datetime_set_difference(object):
+    """Time set-difference operations between ``DatetimeIndex`` objects."""
+    goal_time = 0.2
+
+    def setup(self):
+        size, shift = 100000, 20000
+        self.N = size
+        self.A = size - shift
+        self.B = size + shift
+        # idx2 overlaps the end of idx1; idx3 begins where idx1 stops
+        self.idx1 = DatetimeIndex(range(size))
+        self.idx2 = DatetimeIndex(range(size - shift, size + shift))
+        self.idx3 = DatetimeIndex(range(size, size + shift))
+
+    def time_index_datetime_difference(self):
+        self.idx1.difference(self.idx2)
+
+    def time_index_datetime_difference_disjoint(self):
+        self.idx1.difference(self.idx3)
+
+    def time_index_datetime_symmetric_difference(self):
+        self.idx1.symmetric_difference(self.idx2)
+
+
 class index_float64_boolean_indexer(object):
     goal_time = 0.2
 
@@ -183,6 +204,40 @@ def time_index_int64_union(self):
         self.left.union(self.right)
 
 
+class index_int64_set_difference(object):
+    """Time set-difference operations between two integer ``Index`` objects."""
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.options = np.arange(self.N)
+        half = self.N // 2
+        # each side takes the first half of its own random permutation
+        left_positions = np.random.permutation(self.N)[:half]
+        self.left = Index(self.options.take(left_positions))
+        right_positions = np.random.permutation(self.N)[:half]
+        self.right = Index(self.options.take(right_positions))
+
+    def time_index_int64_difference(self):
+        self.left.difference(self.right)
+
+    def time_index_int64_symmetric_difference(self):
+        self.left.symmetric_difference(self.right)
+
+
+class index_str_set_difference(object):
+    """Time set-difference operations between two string ``Index`` objects."""
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.strs = tm.rands_array(10, self.N)
+        # left keeps the first two thirds, right drops the first third,
+        # so the two indexes share the middle third
+        left_stop = self.N * 2 // 3
+        right_start = self.N // 3
+        self.left = Index(self.strs[:left_stop])
+        self.right = Index(self.strs[right_start:])
+
+    def time_str_difference(self):
+        self.left.difference(self.right)
+
+    def time_str_symmetric_difference(self):
+        self.left.symmetric_difference(self.right)
+
+
 class index_str_boolean_indexer(object):
     goal_time = 0.2
 
 
@@ -19,24 +19,6 @@ def time_dataframe_getitem_scalar(self):
         self.df[self.col][self.idx]
 
 
-class datamatrix_getitem_scalar(object):
-    goal_time = 0.2
-
-    def setup(self):
-        try:
-            self.klass = DataMatrix
-        except:
-            self.klass = DataFrame
-        self.index = tm.makeStringIndex(1000)
-        self.columns = tm.makeStringIndex(30)
-        self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns)
-        self.idx = self.index[100]
-        self.col = self.columns[10]
-
-    def time_datamatrix_getitem_scalar(self):
-        self.df[self.col][self.idx]
-
-
 class series_get_value(object):
     goal_time = 0.2
 
@@ -498,5 +480,3 @@ def setup(self):
 
     def time_float_loc(self):
         self.ind.get_loc(0)
-
-
@@ -135,4 +135,30 @@ def setup(self):
         self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
 
     def time_dtype_infer_uint32(self):
-        (self.df_uint32['A'] + self.df_uint32['B'])
+        (self.df_uint32['A'] + self.df_uint32['B'])
+
+
+class to_numeric(object):
+    """Benchmark ``pd.to_numeric`` over input kinds and ``downcast`` values."""
+
+    param_names = ['dtype', 'downcast']
+    params = [['string-float', 'string-int', 'string-nint', 'datetime64',
+               'int-list', 'int32'],
+              [None, 'integer', 'signed', 'unsigned', 'float']]
+
+    N = 500000
+
+    # Repeat counts use floor division: ``N / 2`` is a float on Python 3
+    # and a list cannot be multiplied by a float.
+    data_dict = {
+        'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)),
+        'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)),
+        'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
+                                         dtype='datetime64[D]'), N),
+        'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)),
+        'int-list': ([1] * (N // 2)) + ([2] * (N // 2)),
+        'int32': np.repeat(np.int32(1), N)
+        }
+
+    def setup(self, dtype, downcast):
+        """Select the prepared input for the current ``dtype`` parameter."""
+        self.data = self.data_dict[dtype]
+
+    def time_downcast(self, dtype, downcast):
+        pd.to_numeric(self.data, downcast=downcast)
@@ -179,10 +179,6 @@ def setup(self):
             self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
         except:
             pass
-        try:
-            self.DataFrame = DataMatrix
-        except:
-            pass
         self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
         self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
         self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -210,10 +206,6 @@ def setup(self):
             self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
         except:
             pass
-        try:
-            self.DataFrame = DataMatrix
-        except:
-            pass
         self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
         self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
         self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -241,10 +233,6 @@ def setup(self):
             self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
         except:
             pass
-        try:
-            self.DataFrame = DataMatrix
-        except:
-            pass
         self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
         self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
         self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -272,10 +260,6 @@ def setup(self):
             self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
         except:
             pass
-        try:
-            self.DataFrame = DataMatrix
-        except:
-            pass
         self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
         self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
         self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -309,6 +293,43 @@ def time_join_dataframe_integer_key(self):
         merge(self.df, self.df2, on='key1')
 
 
+class merge_asof_noby(object):
+    """Time ``merge_asof`` on two frames joined on ``time`` only."""
+
+    def setup(self):
+        """Build two frames of random data, each sorted by ``time``."""
+        np.random.seed(0)
+        one_count = 200000
+        two_count = 1000000
+        # floor division keeps the randint upper bound an int on Python 3
+        self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count // 20, one_count),
+                                 'value1': np.random.randn(one_count)})
+        self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count // 20, two_count),
+                                 'value2': np.random.randn(two_count)})
+        self.df1 = self.df1.sort_values('time')
+        self.df2 = self.df2.sort_values('time')
+
+    def time_merge_asof_noby(self):
+        merge_asof(self.df1, self.df2, on='time')
+
+
+class merge_asof_by(object):
+    """Time ``merge_asof`` on two frames joined on ``time``, grouped by ``key``."""
+
+    def setup(self):
+        """Build two frames of random data, each sorted by ``time``."""
+        import string
+        np.random.seed(0)
+        one_count = 200000
+        two_count = 1000000
+        # ``string.uppercase`` only exists on Python 2; ``ascii_uppercase``
+        # is available on both Python 2 and 3
+        letters = list(string.ascii_uppercase)
+        # floor division keeps the randint upper bound an int on Python 3
+        self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count // 20, one_count),
+                                 'key': np.random.choice(letters, one_count),
+                                 'value1': np.random.randn(one_count)})
+        self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count // 20, two_count),
+                                 'key': np.random.choice(letters, two_count),
+                                 'value2': np.random.randn(two_count)})
+        self.df1 = self.df1.sort_values('time')
+        self.df2 = self.df2.sort_values('time')
+
+    def time_merge_asof_by(self):
+        merge_asof(self.df1, self.df2, on='time', by='key')
+
+
 class join_non_unique_equal(object):
     goal_time = 0.2
 
 
@@ -114,6 +114,27 @@ def teardown(self):
         os.remove('test.csv')
 
 
+class read_csv_categorical(object):
+    """Compare reading string columns as categoricals directly vs. afterwards."""
+    goal_time = 0.2
+
+    def setup(self):
+        n_rows = 100000
+        labels = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
+        # three object columns, each drawn separately from the same labels
+        columns = {}
+        for name in ('a', 'b', 'c'):
+            columns[name] = np.random.choice(labels, n_rows).astype('object')
+        frame = DataFrame(columns)
+        frame.to_csv('strings.csv', index=False)
+
+    def time_read_csv_categorical_post(self):
+        read_csv('strings.csv').apply(pd.Categorical)
+
+    def time_read_csv_categorical_direct(self):
+        read_csv('strings.csv', dtype='category')
+
+    def teardown(self):
+        os.remove('strings.csv')
+
+
 class read_table_multiple_date(object):
     goal_time = 0.2