- These notes contain translations from R into Pandas/Numpy/Scipy.
- Notes are based upon Paul Teetor's R Cookbook published by O'Reilly
- http://shop.oreilly.com/product/9780596809164.do
The following imports are assumed, as well as the use of ipython as your python interpreter:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
The combination of numpy, pandas, and matplotlib is python's answer to R. Pandas is built on top of the numpy library, which is the basis for a large amount of scientific computing today.
References below to python really mean python along with pandas, numpy, as well as matplotlib if any graphing is needed.
A quote from 'Python for Data Analysis' by Wes McKinney:
For users of the R language for statistical computing, the DataFrame name will be familiar, as the object was named after the similar R data.frame object. They are not the same, however; the functionality provided by data.frame in R is essentially a strict subset of that provided by the pandas DataFrame.
R:
# r code here
Python:
# python code here.
R:
x <- 3 print(x)
Python:
x = 3 print(x)
R:
# r code here
Python:
# python code here.
R:
rm(x)
Python:
del x
R:
x <- c(1, 2, 3, 4, 4, 5)
Python:
x = [1, 2, 3, 4, 4, 5]
Numpy:
x = np.array([1, 2, 3, 4, 4, 5])
Pandas:
x = pd.Series([1, 2, 3, 4, 4, 5])
R:
x <- c(0,1,1,2,3,5,8,13,21,34) y <- log(x+1) mean(x) median(x) sd(x) var(x) cor(x, y) cov(x, y) # na.rm=TRUE argument to skip NA values. mean(dataframe) sd(dataframe)
Python:
x = np.array([0, 1, 1, 2, 3, 5, 8, 13, 21, 34])
y = np.log(x + 1)
np.mean(x)
x.mean()
np.median(x)
# R divides by (N - 1), numpy defaults to N, ddof changes this.
x.std(ddof=1)
np.std(x, ddof=1)
x.var(ddof=1)
np.var(x, ddof=1)
# numpy gives matrix, R just gives coefficients.
np.cov(x, y)
np.corrcoef(x, y)
dataframe.mean()
dataframe.std(ddof=1)
R:
# [1, 5] 1:5 # 1, 3, 5 seq(from=1, to=5, by=2) # 1, 1, 1, 1, 1 rep(1, times=5)
Python:
# [1, 5] or [1, 6)
np.arange(1, 6)
range(1, 6) # python 2: builds the entire list; python 3: lazy range object
xrange(1, 6) # python 2 only: iterator (doesn't create entire list)
# 1, 3, 5
np.arange(1, 6, step=2)
# 1, 1, 1, 1, 1
np.ones(5)
# 3, 3, 3, 3
x = np.empty(4)
x.fill(3)
R:
== != < > <= >= any(x == 3) all(x == 3)
Python:
# Basically identical. Same comparison operators here.
np.any(x == 3)
np.all(x == 3)
R:
# 3rd element.
v[3]
# 1st, 2nd, 3rd
v[1:3]
# 1, 3, 5.
v[c(1, 3, 5)]
v[v < 10]
v[v > median(v)]
v[v > median(v) | v == 5]
years <- c(1960, 1964, 1976, 1994)
names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton")
years['Carter'] # 1976
Python:
# 3rd element. Python is zero-indexed, R starts at 1.
# e.g. first element in R is v[1], in python is v[0]
v[2]
v[3 - 1]
# 1st, 2nd, 3rd
v[0:3]
# 1, 3, 5.
v[[0, 2, 4]]
np.take(v, [0, 2, 4])
# Same in R & Python.
v[v < 10]
v[v > np.median(v)]
v[(v > np.median(v)) | (v == 5)]
years = [1960, 1964, 1976, 1994]
names = ['Kennedy', 'Johnson', 'Carter', 'Clinton']
presidents = pd.Series(data=years, index=names)
# OR, passing the same arguments positionally
presidents = pd.Series(years, names)
presidents['Carter'] # 1976
R:
v <- c(4, 2, 3, 5, 4) w <- c(5, 2, 3, 4, 5) v + w v - w v * w v / w w ^ v w + 2 w - 2 w * 2 w ^ 2 w / 2 2 ^ w sqrt(w) log(w) sin(w)
Python:
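# Basically identical, except exponentiation is ** (in python ^ is bitwise XOR).
v = np.array([4, 2, 3, 5, 4])
w = np.array([5, 2, 3, 4, 5])
v + w
v - w
v * w
# on python 2, / floor-divides integer arrays unless `from __future__ import division` is used.
v / w
w ** v
w + 2
w - 2
w * 2
w ** 2
w / 2
2 ** w
np.sqrt(w)
np.log(w)
np.sin(w)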
R:
# r code here
Python:
# python code here.
R:
cv <- function(x) sd(x) / mean(x) # OR cv <- function(x) { sd(x) / mean(x) } cv(1:10) # .55048
Python:
def cv(x):
    return np.std(x, ddof=1) / np.mean(x)

cv(np.arange(1, 11)) # .55048
R:
# r code here
Python:
# python code here.
R:
# r code here
Python:
# python code here.
R:
# r code here
Python:
# python code here.
R:
# r code here
Python:
# python code here.
R:
history()
Python:
%history
R:
.Last.value
Python:
_
R:
# r code here
Python:
# python code here.
R:
library(packagename) some_func()
Python:
import packagename packagename.some_func() # OR from packagename import some_func some_func()
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
source('somefilename.R')
Python:
%run somefilename.py
R:
Rscript somefilename.R
Python:
ipython -- somefilename.py # OR python somefilename.py
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
# digits defaults to 7. print(pi) print(pi, digits=2) # discouraged by the author. # this changes it everywhere. options(digits=5)
Python:
#
R:
#
Python:
#
R:
list.files()
Python:
!ls
R:
#
Python:
#
R:
records <- read.fwf('my-data-file.txt', widths=c(10, 10, 4, -1, 4), col.names=c('Last', 'First', 'Born', 'Died'))
Python:
# widths if contiguous, else use colspecs.
# colspecs, half open intervals.
records = pd.read_fwf('my-data-file.txt', colspecs=[(0, 10), (10, 20), (20, 24), (25, 29)], names=['Last', 'First', 'Born', 'Died'])
# if no space between 'Born' & 'Died'
records = pd.read_fwf('my-data-file.txt', widths=[10, 10, 4, 4], names=['Last', 'First', 'Born', 'Died'])
R:
# default is white-space separator. records <- read.table('my-filename.txt') # ':' separated. records <- read.table('my-filename.txt', sep=':') # With column headings. records <- read.table('my-filename.txt', header=TRUE)
Python:
# default is '\t'; pass sep=r'\s+' to match R's white-space default.
records = pd.read_table('my-filename.txt', header=None)
# ':' separated.
records = pd.read_table('my-filename.txt', sep=':', header=None)
# With column headings (the first row is used as the header by default).
records = pd.read_table('my-filename.txt')
R:
# defaults to reading in headings.
records <- read.csv('myfile.csv')
records <- read.csv('myfile.csv', header=FALSE)
Python:
records = pd.read_csv('myfile.csv') records = pd.read_csv('myfile.csv', header=None)
R:
# data in variable 'df' write.csv(df, file='some-file.csv', row.names=FALSE)
Python:
df.to_csv('some-file.csv', index=False)
R:
df <- read.csv('http://www.justinmrao.com/salary_data.csv')
Python:
df = pd.read_csv('http://www.justinmrao.com/salary_data.csv')
R:
#
Python:
#
R:
#
Python:
#
R:
library(RMySQL) con <- dbConnect(MySQL(), user="userid", password="pswd", host="hostname", client.flag=CLIENT_MULTI_RESULTS) sql <- 'SELECT * FROM someTable WHERE City = "Melbourne"' df <- dbGetQuery(con, sql)
Python:
import MySQLdb
con = MySQLdb.connect(host=HOST, passwd=PASSWD, db=DB, user=USER)
sql = 'SELECT * FROM someTable WHERE City = "Melbourne"'
# pd.read_sql replaces the older pandas.io.sql.read_frame helper.
df = pd.read_sql(sql, con)
R:
#
Python:
#
R:
v <- c(v, newItems) # OR if single item. v[length(v) + 1] <- newItem
Python:
v = np.append(v, newItems) v = np.append(v, newItem)
R:
# append(v, newValues, after=n) append(1:10, 99, after=5)
Python:
# np.insert(arr, index, values) inserts before the given zero-based index.
x = np.arange(1, 11)
x = np.insert(x, 5, 99)
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
vec <- 1:6
# matrix() fills column by column: rows are (1, 3, 5) and (2, 4, 6).
matrix(vec, 2, 3)
Python:
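x = np.arange(1, 7)
# order='F' fills column by column like R's matrix(); the default (order='C') fills row by row.
x.reshape(2, 3, order='F')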
R:
# transpose. t(A) # inverse solve(A) # identity matrix. diag(n)
Python:
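# transpose.
A.T
# inverse of numpy array
np.linalg.inv(A)
# inverse of matrix (np.matrix objects only)
A.I
# identity matrix.
np.eye(n)
# same result; np.eye additionally supports non-square shapes and offset diagonals.
np.identity(n)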
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#
R:
#
Python:
#