
Commit da2152c

Improve shootout-reverse-complement

Julian Orth authored and committed
1 parent 00cc6d2 · commit da2152c

1 file changed: +259 −88 lines

src/test/bench/shootout-reverse-complement.rs
@@ -38,113 +38,284 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 // OF THE POSSIBILITY OF SUCH DAMAGE.

-// ignore-pretty very bad with line comments
-// ignore-android doesn't terminate?
-
-#![feature(slicing_syntax)]
-
-use std::iter::range_step;
-use std::io::{stdin, stdout, File};
-
-static LINE_LEN: uint = 60;
-
-fn make_complements() -> [u8, ..256] {
-    let transforms = [
-        ('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A'),
-        ('U', 'A'), ('M', 'K'), ('R', 'Y'), ('W', 'W'),
-        ('S', 'S'), ('Y', 'R'), ('K', 'M'), ('V', 'B'),
-        ('H', 'D'), ('D', 'H'), ('B', 'V'), ('N', 'N'),
-        ('\n', '\n')];
-    let mut complements: [u8, ..256] = [0, ..256];
-    for (i, c) in complements.iter_mut().enumerate() {
-        *c = i as u8;
+#![feature(slicing_syntax, asm, if_let, tuple_indexing)]
+
+extern crate libc;
+
+use std::io::stdio::{stdin_raw, stdout_raw};
+use std::sync::{Future};
+use std::num::{div_rem};
+use std::ptr::{copy_memory};
+use std::io::{IoResult, EndOfFile};
+use std::slice::raw::{mut_buf_as_slice};
+
+use shared_memory::{SharedMemory};
+
+mod tables {
+    use std::sync::{Once, ONCE_INIT};
+
+    /// Lookup tables.
+    static mut CPL16: [u16, ..1 << 16] = [0, ..1 << 16];
+    static mut CPL8: [u8, ..1 << 8] = [0, ..1 << 8];
+
+    /// Generates the tables.
+    pub fn get() -> Tables {
+        /// To make sure we initialize the tables only once.
+        static INIT: Once = ONCE_INIT;
+        INIT.doit(|| {
+            unsafe {
+                for i in range(0, 1 << 8) {
+                    CPL8[i] = match i as u8 {
+                        b'A' | b'a' => b'T',
+                        b'C' | b'c' => b'G',
+                        b'G' | b'g' => b'C',
+                        b'T' | b't' => b'A',
+                        b'U' | b'u' => b'A',
+                        b'M' | b'm' => b'K',
+                        b'R' | b'r' => b'Y',
+                        b'W' | b'w' => b'W',
+                        b'S' | b's' => b'S',
+                        b'Y' | b'y' => b'R',
+                        b'K' | b'k' => b'M',
+                        b'V' | b'v' => b'B',
+                        b'H' | b'h' => b'D',
+                        b'D' | b'd' => b'H',
+                        b'B' | b'b' => b'V',
+                        b'N' | b'n' => b'N',
+                        i => i,
+                    };
+                }
+
+                for (i, v) in CPL16.iter_mut().enumerate() {
+                    *v = *CPL8.unsafe_get(i & 255) as u16 << 8 |
+                         *CPL8.unsafe_get(i >> 8) as u16;
+                }
+            }
+        });
+        Tables { _dummy: () }
+    }
+
+    /// Accessor for the static arrays.
+    ///
+    /// To make sure that the tables can't be accessed without having been initialized.
+    pub struct Tables {
+        _dummy: ()
     }
-    let lower = 'A' as u8 - 'a' as u8;
-    for &(from, to) in transforms.iter() {
-        complements[from as uint] = to as u8;
-        complements[(from as u8 - lower) as uint] = to as u8;
+
+    impl Tables {
+        /// Retrieves the complement for `i`.
+        pub fn cpl8(self, i: u8) -> u8 {
+            // Not really unsafe.
+            unsafe { CPL8[i as uint] }
+        }
+
+        /// Retrieves the complement for `i`.
+        pub fn cpl16(self, i: u16) -> u16 {
+            unsafe { CPL16[i as uint] }
+        }
     }
-    complements
 }

-fn main() {
-    let complements = make_complements();
-    let data = if std::os::getenv("RUST_BENCH").is_some() {
-        File::open(&Path::new("shootout-k-nucleotide.data")).read_to_end()
-    } else {
-        stdin().read_to_end()
-    };
-    let mut data = data.unwrap();
+mod shared_memory {
+    use std::sync::{Arc};
+    use std::mem::{transmute};
+    use std::raw::{Slice};

-    for seq in data.as_mut_slice().split_mut(|c| *c == '>' as u8) {
-        // skip header and last \n
-        let begin = match seq.iter().position(|c| *c == '\n' as u8) {
-            None => continue,
-            Some(c) => c
-        };
-        let len = seq.len();
-        let seq = seq[mut begin+1..len-1];
-
-        // arrange line breaks
-        let len = seq.len();
-        let off = LINE_LEN - len % (LINE_LEN + 1);
-        for i in range_step(LINE_LEN, len, LINE_LEN + 1) {
-            for j in std::iter::count(i, -1).take(off) {
-                seq[j] = seq[j - 1];
+    /// Structure for sharing disjoint parts of a vector mutably across tasks.
+    pub struct SharedMemory {
+        ptr: Arc<Vec<u8>>,
+        start: uint,
+        len: uint,
+    }
+
+    impl SharedMemory {
+        pub fn new(ptr: Vec<u8>) -> SharedMemory {
+            let len = ptr.len();
+            SharedMemory {
+                ptr: Arc::new(ptr),
+                start: 0,
+                len: len,
+            }
+        }
+
+        pub fn as_mut_slice(&mut self) -> &mut [u8] {
+            unsafe {
+                transmute(Slice {
+                    data: self.ptr.as_ptr().offset(self.start as int) as *const u8,
+                    len: self.len,
+                })
             }
-            seq[i - off] = '\n' as u8;
         }

-        // reverse complement, as
-        // seq.reverse(); for c in seq.iter_mut() { *c = complements[*c] }
-        // but faster:
-        for (front, back) in two_side_iter(seq) {
-            let tmp = complements[*front as uint];
-            *front = complements[*back as uint];
-            *back = tmp;
+        pub fn len(&self) -> uint {
+            self.len
        }
-        if seq.len() % 2 == 1 {
-            let middle = &mut seq[seq.len() / 2];
-            *middle = complements[*middle as uint];
+
+        pub fn split_at(self, mid: uint) -> (SharedMemory, SharedMemory) {
+            assert!(mid <= self.len);
+            let left = SharedMemory {
+                ptr: self.ptr.clone(),
+                start: self.start,
+                len: mid,
+            };
+            let right = SharedMemory {
+                ptr: self.ptr,
+                start: self.start + mid,
+                len: self.len - mid,
+            };
+            (left, right)
         }
-    }

-    stdout().write(data.as_slice()).unwrap();
+        /// Resets the object so that it covers the whole range of the contained vector.
+        ///
+        /// You must not call this method if `self` is not the only reference to the
+        /// shared memory.
+        ///
+        /// FIXME: If `Arc` had a method to check if the reference is unique, then we
+        /// wouldn't need the `unsafe` here.
+        ///
+        /// FIXME: If `Arc` had a method to unwrap the contained value, then we could
+        /// simply unwrap here.
+        pub unsafe fn reset(self) -> SharedMemory {
+            let len = self.ptr.len();
+            SharedMemory {
+                ptr: self.ptr,
+                start: 0,
+                len: len,
+            }
+        }
+    }
 }

-pub struct TwoSideIter<'a, T: 'a> {
-    first: *mut T,
-    last: *mut T,
-    marker: std::kinds::marker::ContravariantLifetime<'a>,
-    marker2: std::kinds::marker::NoCopy
+
+/// Reads all remaining bytes from the stream.
+fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> {
+    const CHUNK: uint = 64 * 1024;
+
+    let mut vec = Vec::with_capacity(1024 * 1024);
+    loop {
+        if vec.capacity() - vec.len() < CHUNK {
+            let cap = vec.capacity();
+            let mult = if cap < 256 * 1024 * 1024 {
+                // FIXME (mahkoh): Temporary workaround for jemalloc on linux. Replace
+                // this by 2x once the jemalloc performance issue has been fixed.
+                16
+            } else {
+                2
+            };
+            vec.reserve_exact(mult * cap);
+        }
+        unsafe {
+            let ptr = vec.as_mut_ptr().offset(vec.len() as int);
+            match mut_buf_as_slice(ptr, CHUNK, |s| r.read(s)) {
+                Ok(n) => {
+                    let len = vec.len();
+                    vec.set_len(len + n);
+                },
+                Err(ref e) if e.kind == EndOfFile => break,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+    Ok(vec)
 }

-pub fn two_side_iter<'a, T>(slice: &'a mut [T]) -> TwoSideIter<'a, T> {
-    let len = slice.len();
-    let first = slice.as_mut_ptr();
-    let last = if len == 0 {
-        first
-    } else {
-        unsafe { first.offset(len as int - 1) }
+/// Finds the first position at which `b` occurs in `s`.
+fn memchr(h: &[u8], n: u8) -> Option<uint> {
+    use libc::{c_void, c_int, size_t};
+    extern {
+        fn memchr(h: *const c_void, n: c_int, s: size_t) -> *mut c_void;
+    }
+    let res = unsafe {
+        memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t)
     };
-
-    TwoSideIter {
-        first: first,
-        last: last,
-        marker: std::kinds::marker::ContravariantLifetime,
-        marker2: std::kinds::marker::NoCopy
+    if res.is_null() {
+        None
+    } else {
+        Some(res as uint - h.as_ptr() as uint)
     }
 }

-impl<'a, T> Iterator<(&'a mut T, &'a mut T)> for TwoSideIter<'a, T> {
-    fn next(&mut self) -> Option<(&'a mut T, &'a mut T)> {
-        if self.first < self.last {
-            let result = unsafe { (&mut *self.first, &mut *self.last) };
-            self.first = unsafe { self.first.offset(1) };
-            self.last = unsafe { self.last.offset(-1) };
-            Some(result)
-        } else {
-            None
+/// Length of a normal line without the terminating \n.
+const LINE_LEN: uint = 60;
+
+/// Compute the reverse complement.
+fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) {
+    // Drop the last newline
+    let seq = view.as_mut_slice().init_mut();
+    let len = seq.len();
+    let off = LINE_LEN - len % (LINE_LEN + 1);
+    let mut i = LINE_LEN;
+    while i < len {
+        unsafe {
+            copy_memory(seq.as_mut_ptr().offset((i - off + 1) as int),
+                        seq.as_ptr().offset((i - off) as int), off);
+            *seq.unsafe_mut(i - off) = b'\n';
         }
+        i += LINE_LEN + 1;
     }
+
+    let (div, rem) = div_rem(len, 4);
+    unsafe {
+        let mut left = seq.as_mut_ptr() as *mut u16;
+        // This is slow if len % 2 != 0 but still faster than bytewise operations.
+        let mut right = seq.as_mut_ptr().offset(len as int - 2) as *mut u16;
+        let end = left.offset(div as int);
+        while left != end {
+            let tmp = tables.cpl16(*left);
+            *left = tables.cpl16(*right);
+            *right = tmp;
+            left = left.offset(1);
+            right = right.offset(-1);
+        }
+
+        let end = end as *mut u8;
+        match rem {
+            1 => *end = tables.cpl8(*end),
+            2 => {
+                let tmp = tables.cpl8(*end);
+                *end = tables.cpl8(*end.offset(1));
+                *end.offset(1) = tmp;
+            },
+            3 => {
+                *end.offset(1) = tables.cpl8(*end.offset(1));
+                let tmp = tables.cpl8(*end);
+                *end = tables.cpl8(*end.offset(2));
+                *end.offset(2) = tmp;
+            },
+            _ => { },
+        }
+    }
+}
+
+fn main() {
+    let mut data = SharedMemory::new(read_to_end(&mut stdin_raw()).unwrap());
+    let tables = tables::get();
+
+    let mut futures = vec!();
+    loop {
+        let (_, mut tmp_data) = match memchr(data.as_mut_slice(), b'\n') {
+            Some(i) => data.split_at(i + 1),
+            _ => break,
+        };
+        let (view, tmp_data) = match memchr(tmp_data.as_mut_slice(), b'>') {
+            Some(i) => tmp_data.split_at(i),
+            None => {
+                let len = tmp_data.len();
+                tmp_data.split_at(len)
+            },
+        };
+        futures.push(Future::spawn(proc() reverse_complement(view, tables)));
+        data = tmp_data;
+    }
+
+    for f in futures.iter_mut() {
+        f.get();
+    }
+
+    // Not actually unsafe. If Arc had a way to check uniqueness then we could do that in
+    // `reset` and it would tell us that, yes, it is unique at this point.
+    data = unsafe { data.reset() };
+
+    stdout_raw().write(data.as_mut_slice()).unwrap();
 }
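
For reference, here is a minimal standalone sketch of the core idea behind the new code, written in current Rust rather than the pre-1.0 dialect of the diff: build a complement lookup table once, then reverse-complement a sequence in place by walking it from both ends and swapping complemented bytes. The names `complement_table` and `reverse_complement` below are illustrative and the sketch works byte-wise; the committed version goes further and uses the 16-bit `CPL16` table so each swap moves two bytes per side.

// Sketch only (current Rust): table-driven in-place reverse complement,
// byte-wise rather than with the commit's 16-bit CPL16 table.
fn complement_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    for (i, b) in table.iter_mut().enumerate() {
        *b = i as u8; // bytes without a defined complement map to themselves
    }
    for &(from, to) in &[
        (b'A', b'T'), (b'C', b'G'), (b'G', b'C'), (b'T', b'A'),
        (b'U', b'A'), (b'M', b'K'), (b'R', b'Y'), (b'W', b'W'),
        (b'S', b'S'), (b'Y', b'R'), (b'K', b'M'), (b'V', b'B'),
        (b'H', b'D'), (b'D', b'H'), (b'B', b'V'), (b'N', b'N'),
    ] {
        table[from as usize] = to;        // upper case
        table[(from + 32) as usize] = to; // ASCII lower case
    }
    table
}

// Walk the slice from both ends, writing the complement of each end into the
// opposite end, so the reversal and the complement happen in a single sweep.
fn reverse_complement(seq: &mut [u8], table: &[u8; 256]) {
    let (mut i, mut j) = (0, seq.len());
    while i + 1 < j {
        j -= 1;
        let (a, b) = (seq[i], seq[j]);
        seq[i] = table[b as usize];
        seq[j] = table[a as usize];
        i += 1;
    }
    if i < j {
        // odd length: complement the middle byte in place
        seq[i] = table[seq[i] as usize];
    }
}

fn main() {
    let table = complement_table();
    let mut seq = b"ACGTacgtN".to_vec();
    reverse_complement(&mut seq, &table);
    assert_eq!(&seq[..], &b"NACGTACGT"[..]);
}

Doing the complement and the reversal in the same pass keeps the work to one traversal of the buffer with no temporary copy, which is exactly what the `left`/`right` pointer loop in the committed `reverse_complement` does with `u16` loads.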

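The new `main` also parallelizes across FASTA records: `memchr` finds the end of each `>` header and the start of the next record, `SharedMemory::split_at` hands out disjoint mutable views, and `Future::spawn` runs `reverse_complement` on each view. Below is a hypothetical sketch of the same splitting strategy in current Rust; `for_each_record_body` is an invented name, scoped threads stand in for `Future::spawn`, plain `split_at_mut` stands in for `SharedMemory`, and the `body.reverse()` worker in `main` is only a placeholder for the real reverse-complement work.

use std::thread;

// Sketch only: carve the FASTA buffer into disjoint record bodies and process
// each body on its own thread; scoped threads play the role of Future::spawn,
// and slice::split_at_mut plays the role of SharedMemory::split_at.
fn for_each_record_body<F>(mut data: &mut [u8], work: F)
where
    F: Fn(&mut [u8]) + Sync,
{
    thread::scope(|s| {
        loop {
            // End of the '>' header line, as memchr(data, b'\n') in the commit.
            let Some(header_end) = data.iter().position(|&b| b == b'\n') else {
                break;
            };
            let buf = std::mem::take(&mut data);
            let (_, rest) = buf.split_at_mut(header_end + 1);
            // The body runs until the next '>' or the end of the buffer.
            let body_len = rest.iter().position(|&b| b == b'>').unwrap_or(rest.len());
            let (body, tail) = rest.split_at_mut(body_len);
            data = tail;
            s.spawn(|| work(body));
        }
    }); // the scope joins every worker, like the f.get() loop over the futures
}

fn main() {
    let mut fasta = b">one\nACGT\n>two\nGGCC\n".to_vec();
    // Stand-in worker; the benchmark reverse-complements each body instead.
    for_each_record_body(&mut fasta, |body: &mut [u8]| body.reverse());
    assert_eq!(&fasta[..], &b">one\n\nTGCA>two\n\nCCGG"[..]);
}

Because every record body is a disjoint slice of the same buffer, the workers need no locking; the commit achieves the same thing with `SharedMemory` views over an `Arc<Vec<u8>>`.
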
Comments (0)