TST: benchmark scripts

wesm · wesm · commit a3bbdf2052a2 · 2011-11-22T00:05:58.000-05:00
diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R
@@ -1,25 +1,71 @@
 library(zoo)
 library(xts)
+library(fts)
+library(tseries)
+library(its)
+library(xtable)
+
+## indices = rep(NA, 100000)
+## for (i in 1:100000)
+##   indices[i] <- paste(sample(letters, 10), collapse="")
 
-indices = rep(NA, 100000)
-for (i in 1:100000)
-  indices[i] <- paste(sample(letters, 10), collapse="")
 
-timings <- numeric()
 
 ## x <- zoo(rnorm(100000), indices)
 ## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)])
 
 ## indices <- as.POSIXct(1:100000)
 
-indices <- as.POSIXct(Sys.Date()) + 1:1000000
+indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100)  # pool of 1e6 timestamps spaced 100 s apart
+
+sz <- 500000  # length of every benchmark series; also how many timestamps are drawn from `indices`
+
+## x <- xts(rnorm(sz), sample(indices, sz))
+## y <- xts(rnorm(sz), sample(indices, sz))
+
+zoo.bench <- function() {  # timeit() result for `+` on two zoo series with sampled indices
+    lhs <- zoo(rnorm(sz), sample(indices, sz))
+    rhs <- zoo(rnorm(sz), sample(indices, sz))
+    timeit(function() lhs + rhs)
+}
+
+xts.bench <- function() {  # timeit() result for `+` on two xts series with sampled indices
+    lhs <- xts(rnorm(sz), sample(indices, sz))
+    rhs <- xts(rnorm(sz), sample(indices, sz))
+    timeit(function() lhs + rhs)
+}
+
+fts.bench <- function() {  # timeit() result for `+` on two fts series; sampled timestamps are sorted first
+    lhs <- fts(rnorm(sz), sort(sample(indices, sz)))
+    rhs <- fts(rnorm(sz), sort(sample(indices, sz)))
+    timeit(function() lhs + rhs)
+}
+
+its.bench <- function() {  # timeit() result for `+` on two its series; sampled timestamps are sorted first
+    lhs <- its(rnorm(sz), sort(sample(indices, sz)))
+    rhs <- its(rnorm(sz), sort(sample(indices, sz)))
+    timeit(function() lhs + rhs)
+}
 
-x <- xts(rnorm(1000000), indices)
-y <- xts(rnorm(900000), indices[sample(1:1000000, 900000)])
+irts.bench <- function() {  # timeit() result for `+` on two irts series (timestamps first, values second)
+    lhs <- irts(sort(sample(indices, sz)), rnorm(sz))
+    rhs <- irts(sort(sample(indices, sz)), rnorm(sz))
+    timeit(function() lhs + rhs)
+}
+
+# Mean elapsed seconds of f() over `reps` runs (default 10); gc() runs before each timing.
+timeit <- function(f, reps = 10L) {
+  elapsed <- vapply(seq_len(reps), function(i) {
+    gc()  # collect up front so garbage from earlier runs is not charged to this one
+    unname(system.time(f())[3])  # element 3 of system.time() is the elapsed time
+  }, numeric(1))
+  mean(elapsed)
+}
 
-for (i in 1:10) {
-  gc()
-  timings[i] = system.time(x + y)[3]
+bench <- function() {  # data frame with one row per package: timing in `results`, label in `names`
+  elapsed <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench())
+  labels <- c("xts", "fts", "its", "zoo")
+  data.frame(results = elapsed, names = labels)
 }
 
-mean(timings)
+result <- bench()  # runs the xts, fts, its and zoo benchmarks as soon as the script is sourced
diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py
@@ -1,15 +1,13 @@
 from pandas import *
 from pandas.util.testing import rands
 
-from la import larry
+# from la import larry
 
-n = 100000
-indices = Index([rands(10) for _ in xrange(n)])
+n = 1000000
+# indices = Index([rands(10) for _ in xrange(n)])
 
 def sample(values, k):
-    from random import shuffle
-    sampler = np.arange(len(values))
-    shuffle(sampler)
+    sampler = np.random.permutation(len(values))  # random ordering of the positions 0..len(values)-1; the first k are taken below
     return values.take(sampler[:k])
 
 subsample_size = 90000
@@ -22,19 +20,24 @@ def sample(values, k):
 # lx = larry(np.random.randn(100000), [list(indices)])
 # ly = larry(np.random.randn(subsample_size), [list(y.index)])
 
-stamps = np.random.randint(1000000000, 1000000000000, 2000000)
+sz = 500000  # number of observations in each benchmark series
 
-idx1 = np.sort(sample(stamps, 1000000))
-idx2 = np.sort(sample(stamps, 1000000))
+rng = np.arange(0, 10000000000000, 10000000)  # 1e6 evenly spaced offsets: 0, 1e7, ..., below 1e13
+stamps = np.datetime64(datetime.now()).view('i8') + rng  # current time viewed as int64, shifted by each offset
 
-ts1 = Series(np.random.randn(1000000), idx1)
-ts2 = Series(np.random.randn(1000000), idx2)
+# stamps = np.random.randint(1000000000, 1000000000000, 2000000)
+
+idx1 = np.sort(sample(stamps, sz))  # sz stamps picked via sample(), then sorted ascending
+idx2 = np.sort(sample(stamps, sz))  # independent second draw from the same pool
+
+ts1 = Series(np.random.randn(sz), idx1)  # random normal values indexed by idx1
+ts2 = Series(np.random.randn(sz), idx2)  # random normal values indexed by idx2
 
 # Benchmark 1: Two 1-million length time series (int64-based index) with
 # randomly chosen timestamps
 
 # Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join)
 
-df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5))
-df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10))
+# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5))
+# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10))
 
diff --git a/pandas/src/sandbox.pyx b/pandas/src/sandbox.pyx
@@ -17,3 +17,11 @@ def foo(object o):
 
 def foo2():
     print sizeof(PyObject*)
+
+def bench_dict():
+    """Fill a new dict with the keys 0..999999, each mapped to itself."""
+    cdef dict table = {}
+
+    # The loop variable has no C type declaration.
+    for key in range(1000000):
+        table[key] = key
diff --git a/scripts/groupby_sample.py b/scripts/groupby_sample.py
@@ -13,9 +13,9 @@
                                   "banana","lemon","guava","blackberry",
                                   "grape"]})
 value = df_small['value'].values.repeat(3)
-df = DataFrame({'group1' : g1.repeat(40000),
-                'group2' : np.tile(g2, 400),
-                'value' : value.repeat(40000)})
+df = DataFrame({'group1' : g1.repeat(4000 * 5),
+                'group2' : np.tile(g2, 400 * 5),  # NOTE(review): the three columns have equal length only if len(g2) == 10 * len(g1) == 10 * len(value) — confirm
+                'value' : value.repeat(4000 * 5)})
 
 
 def random_sample():
@@ -32,3 +32,18 @@ def random_sample_v2():
     indices = [choice(v) for k, v in grouped.groups.iteritems()]
     return df.reindex(indices)
 
+def do_shuffle(arr):
+   from random import shuffle
+   result = arr.copy().values
+   shuffle(result)
+   return result
+
+def shuffle_uri(df,grouped):  # adds a 'state_permuted' column to df in place; returns None
+    perm = np.r_[tuple([np.random.permutation(idxs) for idxs in grouped.groups.itervalues()])]  # each entry of grouped.groups permuted independently, then concatenated in dict iteration order
+    df['state_permuted'] = np.asarray(df.ix[perm]['value'])  # NOTE(review): assigned positionally, so values stay within their group only if df's row order matches the concatenation order — confirm
+
+df2 = df.copy()  # work on a copy so the new columns are not added to df
+grouped = df2.groupby('group1')
+shuffle_uri(df2, grouped)  # approach 1: fills df2['state_permuted']
+
+df2['state_perm'] = grouped['value'].transform(do_shuffle)  # approach 2: per-group shuffle of 'value' via transform