Skip to content

Commit 36097bc

Browse files
committed
Update to BindingDB_All_2015m10
For `process.ipynb`: + Improve documentation with markdown cells. + Switch to commit specific links for dhimmel/uniprot. + Adopt pandas 0.17.0 gzipped url support. See pandas-dev/pandas#8685 + Exclude rows 192304-192473 (one-indexed) where `BindingDB Reactant_set_id` was missing. + Handle affinities that cannot be converted to floats. For `collapse.Rmd`: + Use readr for tsv io. + Retain pubmed_ids and sources when collapsing.
1 parent a34aa51 commit 36097bc

6 files changed

+42724
-45063
lines changed

collapse.Rmd

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,20 @@ library(dplyr)
1111
library(ggplot2)
1212
library(DT)
1313
library(scales)
14+
library(readr)
1415
1516
options(stringsAsFactors=FALSE)
16-
17-
write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
18-
write.table(x = x, file = file, sep=sep, quote=quote, row.names=row.names, na=na, ...)
19-
}
2017
```
2118

2219
```{r}
2320
# Read bindingdb and remove non-human interactions
2421
binding.db <- file.path('data', 'binding.tsv.gz') %>%
25-
read.delim(stringsAsFactors=FALSE) %>%
26-
dplyr::filter(organism == 'Homo sapiens')
22+
readr::read_tsv() %>%
23+
dplyr::filter(organism == 'Homo sapiens') %>%
24+
dplyr::filter(! is.na(affinity_nM)) %>%
25+
dplyr::mutate(
26+
source=plyr::mapvalues(source, c('Curated from the literature by BindingDB'), c('BindingDB'))
27+
)
2728
2829
# View a subset of the data.frame
2930
binding.db %>% dplyr::sample_n(200) %>% dplyr::select(-c(pubmed, doi)) %>% DT::datatable()
@@ -32,11 +33,9 @@ binding.db %>% dplyr::sample_n(200) %>% dplyr::select(-c(pubmed, doi)) %>% DT::d
3233

3334
```{r}
3435
# Read the drugbank to bindingDB fuzzy mappings produced using UniChem
35-
map.df <- 'http://git.dhimmel.com/drugbank/data/mapping/bindingdb.tsv' %>%
36-
read.delim(stringsAsFactors=FALSE)
37-
3836
# Restrict to compounds in drugbank
39-
joined.df <- map.df %>%
37+
joined.df <- 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/bindingdb.tsv' %>%
38+
readr::read_tsv() %>%
4039
dplyr::inner_join(binding.db)
4140
```
4241

@@ -51,14 +50,15 @@ geom.mean <- function(x) {
5150
ResolveAffinity <- function(df) {
5251
# Preferentially selects the affinity measure. If multiple measurements
5352
# exist for the same compound-protein pair, the geometric mean is taken.
54-
measures <- df$measure
5553
for (measure in c('Kd', 'Ki', 'IC50')) {
56-
if (is.element(measure, measures)) {
57-
values <- df$affinity_nM[measures == measure]
54+
if (is.element(measure, df$measure)) {
55+
measure.df <- df[df$measure == measure, ]
5856
return.df <- data.frame(
5957
measure = measure,
60-
affinity_nM = round(geom.mean(values), 5),
61-
n_measures = length(values))
58+
affinity_nM = round(geom.mean(measure.df$affinity_nM), 5),
59+
n_measures = nrow(measure.df),
60+
sources = paste(unique(na.omit(measure.df$source)), collapse=','),
61+
pubmeds = paste(unique(na.omit(measure.df$pubmed)), collapse=','))
6262
return(return.df)
6363
}
6464
}
@@ -71,7 +71,7 @@ collapse.df <- joined.df %>%
7171
dplyr::ungroup()
7272
7373
collapse.df %>%
74-
write.delim('data/bindings-drugbank-collapsed.tsv')
74+
readr::write_tsv('data/bindings-drugbank-collapsed.tsv')
7575
7676
# View a subset of the data.frame
7777
collapse.df %>% dplyr::sample_n(200) %>% DT::datatable()
@@ -80,26 +80,29 @@ collapse.df %>% dplyr::sample_n(200) %>% DT::datatable()
8080
`r nrow(collapse.df)` compound--protein pairs were assayed.
8181

8282
```{r}
83-
drugbank.df <- 'http://git.dhimmel.com/drugbank/data/drugbank.tsv' %>%
84-
read.delim() %>%
83+
drugbank.df <- 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv' %>%
84+
readr::read_tsv() %>%
8585
dplyr::mutate(drugbank_approved = as.integer(grepl('approved', groups))) %>%
8686
dplyr::transmute(drugbank_id, drugbank_name = name, drugbank_approved)
8787
88-
entrez.df <- 'http://git.dhimmel.com/entrez-gene/data/symbols-human.tsv' %>%
89-
read.delim() %>%
88+
entrez.df <- 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv' %>%
89+
readr::read_tsv() %>%
9090
dplyr::transmute(entrez_gene = GeneID, gene_symbol = Symbol)
9191
9292
gene.df <- collapse.df %>%
9393
dplyr::group_by(drugbank_id, entrez_gene) %>%
9494
dplyr::summarize(
9595
affinity_nM = min(affinity_nM),
96-
n_pairs = n()) %>%
96+
n_pairs = n(),
97+
sources = paste(unique(sources), collapse=','),
98+
pubmeds = paste(unique(pubmeds), collapse=',')
99+
) %>%
97100
dplyr::ungroup() %>%
98101
dplyr::left_join(drugbank.df) %>%
99102
dplyr::left_join(entrez.df)
100103
101104
gene.df %>%
102-
write.delim('data/bindings-drugbank-gene.tsv')
105+
readr::write_tsv('data/bindings-drugbank-gene.tsv')
103106
104107
# View bindings for approved drugs
105108
gene.df %>%

collapse.html

Lines changed: 97 additions & 142 deletions
Large diffs are not rendered by default.

data/binding.tsv.gz

-396 KB
Binary file not shown.

0 commit comments

Comments
 (0)