Skip to content

Commit cb98c0c

Browse files
committed
Implement a C API for the tokenizer
1 parent 89f2f45 commit cb98c0c

File tree

6 files changed

+357
-0
lines changed

6 files changed

+357
-0
lines changed

Makefile.in

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ $(LIB): $(CARGO_SOURCES)
3838
(cd $(VPATH) && cargo build)
3939
touch $(LIB)
4040

41+
# `make for_c` is a convenience alias for building the static library below.
.PHONY: for_c
42+
for_c: libhtml5ever_for_c.a
43+
44+
# Compiles src/lib.rs as a static library with the `for_c` cfg switched on,
# which is what enables the `for_c` module (the C API) in the crate.
# Depends on $(LIB) as well as the crate sources.
libhtml5ever_for_c.a: $(LIB) $(CARGO_SOURCES)
45+
$(RUSTC_CMD) -o $@ --cfg for_c --crate-type staticlib $(VPATH)/src/lib.rs
46+
4147
define DEF_EXAMPLE
4248
$(1): $$(VPATH)/examples/$(1).rs $$(LIB)
4349
$$(RUSTC_CMD) $$<

capi/html5ever.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright 2014 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
// Include guard. Identifiers that begin with a double underscore are reserved
// for the C implementation, so the guard uses an unreserved name.
#ifndef HTML5EVER_H
#define HTML5EVER_H
12+
13+
#include <stdlib.h>
14+
15+
// A run of bytes described by a pointer plus an explicit byte count.
// Code in this project reads exactly `len` bytes and never looks for a
// terminating NUL.
struct h5e_buf {
16+
// First byte of the run; NULL (with len 0) is used for "no value".
unsigned char *data;
17+
// Number of bytes available at `data`.
size_t len;
18+
};
19+
20+
// Wraps a NUL-terminated C string as an h5e_buf. The implementation stores
// `str` itself (no copy is made) and sets `len` to strlen(str).
struct h5e_buf h5e_buf_from_cstr(const char *str);
21+
22+
// Table of callbacks, one per kind of token the tokenizer emits. Every
// callback receives the sink's `user` pointer as its first argument. The
// tokenizer checks each entry for NULL before calling it, so callbacks that
// are not needed may be left NULL.
struct h5e_token_ops {
23+
// Doctype token. `name`, `pub` and `sys` are null buffers (data NULL, len 0)
// when the corresponding part of the doctype is absent; `force_quirks` is
// 0 or 1.
void (*do_doctype)(void *user, struct h5e_buf name,
24+
struct h5e_buf pub, struct h5e_buf sys, int force_quirks);
25+
// Start tag. `self_closing` is 0 or 1. The call is followed by `num_attrs`
// calls to do_tag_attr, one per attribute of this tag.
void (*do_start_tag)(void *user, struct h5e_buf name,
26+
int self_closing, size_t num_attrs);
27+
// One attribute of the most recently reported start tag.
void (*do_tag_attr)(void *user, struct h5e_buf name, struct h5e_buf value);
28+
// End tag.
void (*do_end_tag)(void *user, struct h5e_buf name);
29+
// Comment token.
void (*do_comment)(void *user, struct h5e_buf text);
30+
// A run of character data.
void (*do_chars)(void *user, struct h5e_buf text);
31+
// A null-character token.
void (*do_null_char)(void *user);
32+
// End-of-file token.
void (*do_eof)(void *user);
33+
// Parse error, with a message.
void (*do_error)(void *user, struct h5e_buf message);
34+
};
35+
36+
// Destination for tokens: a callback table plus a caller-defined pointer.
struct h5e_token_sink {
37+
// Callbacks to invoke for each token.
struct h5e_token_ops *ops;
38+
// Passed unchanged as the first argument of every callback.
void *user;
39+
};
40+
41+
// Opaque tokenizer handle; only ever used through a pointer.
struct h5e_tokenizer;
42+
43+
// Creates a tokenizer that reports tokens to `sink`.
// NOTE(review): the implementation turns `sink` into a reference that is
// handed to the tokenizer, so `sink` presumably has to stay valid until
// h5e_tokenizer_free is called -- confirm.
struct h5e_tokenizer *h5e_tokenizer_new(struct h5e_token_sink *sink);
44+
// Destroys a tokenizer obtained from h5e_tokenizer_new.
void h5e_tokenizer_free(struct h5e_tokenizer *tok);
45+
// Feeds more input. The bytes of `buf` are copied into a new string before
// being handed to the tokenizer.
// NOTE(review): the bytes appear to be treated as UTF-8 without validation
// (core::str::raw::from_utf8) -- callers should pass valid UTF-8; confirm.
void h5e_tokenizer_feed(struct h5e_tokenizer *tok, struct h5e_buf buf);
46+
// Tells the tokenizer that there is no more input.
void h5e_tokenizer_end(struct h5e_tokenizer *tok);
47+
48+
#endif

examples/capi/tokenize.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright 2014 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
#include <stdio.h>
11+
12+
#include "html5ever.h"
13+
14+
// Writes the NUL-terminated string `x` to standard output, without adding a
// newline.
void put_str(const char *x) {
    printf("%s", x);
}
17+
18+
// Writes the `text.len` bytes starting at `text.data` to standard output.
// The buffer is not NUL-terminated, so the length is passed explicitly.
void put_buf(struct h5e_buf text) {
    const unsigned char *bytes = text.data;
    size_t count = text.len;
    fwrite(bytes, sizeof(unsigned char), count, stdout);
}
21+
22+
// Callback for character data: prints the text on its own labelled line.
void do_chars(void *user, struct h5e_buf text) {
    (void)user;  // this example keeps no per-sink state

    put_str("CHARS : ");
    put_buf(text);
    put_str("\n");
}
27+
28+
// Callback for a start tag: prints `<name>`, or `<name/>` when the tag is
// self-closing. Attributes are printed separately by do_tag_attr.
void do_start_tag(void *user, struct h5e_buf name, int self_closing, size_t num_attrs) {
    (void)user;       // this example keeps no per-sink state
    (void)num_attrs;  // the attribute count is not printed

    put_str("TAG : <");
    put_buf(name);
    if (self_closing != 0) {
        putchar('/');
    }
    put_str(">\n");
}
36+
37+
// Callback for one attribute of a start tag: prints `name="value"` on its
// own line. The value is printed as-is, without escaping.
void do_tag_attr(void *user, struct h5e_buf name, struct h5e_buf value) {
    (void)user;  // this example keeps no per-sink state

    put_str(" ATTR: ");
    put_buf(name);
    put_str("=\"");
    put_buf(value);
    put_str("\"\n");
}
44+
45+
// Callback for an end tag: prints `</name>` on its own labelled line.
void do_end_tag(void *user, struct h5e_buf name) {
    (void)user;  // this example keeps no per-sink state

    put_str("TAG : </");
    put_buf(name);
    put_str(">\n");
}
50+
51+
// Callback table for this example. Only four callbacks are set; the members
// not named here (doctype, comment, null char, EOF, error) are left as null
// pointers, and the tokenizer skips null callbacks.
struct h5e_token_ops ops = {
52+
.do_chars = do_chars,
53+
.do_start_tag = do_start_tag,
54+
.do_tag_attr = do_tag_attr,
55+
.do_end_tag = do_end_tag,
56+
};
57+
58+
// The sink handed to the tokenizer in main(): the callback table above and
// no user data, since the callbacks ignore their `user` argument.
struct h5e_token_sink sink = {
59+
.ops = &ops,
60+
.user = NULL,
61+
};
62+
63+
int main(int argc, char *argv[]) {
64+
if (argc < 2) {
65+
printf("Usage: %s 'HTML fragment'\n", argv[0]);
66+
return 1;
67+
}
68+
69+
struct h5e_tokenizer *tok = h5e_tokenizer_new(&sink);
70+
h5e_tokenizer_feed(tok, h5e_buf_from_cstr(argv[1]));
71+
h5e_tokenizer_end(tok);
72+
h5e_tokenizer_free(tok);
73+
return 0;
74+
}

src/for_c/common.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright 2014 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
use core::prelude::*;
11+
12+
use core::slice::raw::buf_as_slice;
13+
use core::str::raw::from_utf8;
14+
use core::kinds::marker::ContravariantLifetime;
15+
16+
use libc::{size_t, c_int, c_char, strlen};
17+
18+
/// A pointer/length pair with C layout, mirroring `struct h5e_buf` in
/// `capi/html5ever.h`. It carries no lifetime, so nothing ties it to the
/// memory it points at.
#[repr(C)]
19+
pub struct h5e_buf {
20+
/// First byte of the run; null in the buffer returned by `h5e_buf::null()`.
data: *const u8,
21+
/// Number of bytes at `data`.
len: size_t,
22+
}
23+
24+
impl h5e_buf {
25+
pub fn null() -> h5e_buf {
26+
h5e_buf {
27+
data: RawPtr::null(),
28+
len: 0,
29+
}
30+
}
31+
32+
pub unsafe fn with_slice<R>(&self, f: |&str| -> R) -> R {
33+
buf_as_slice(self.data, self.len as uint,
34+
|bytes| f(from_utf8(bytes)))
35+
}
36+
}
37+
38+
/// An `h5e_buf` tagged with the lifetime `'a` of the string it was built
/// from, so the raw pointer cannot be held (in this wrapped form) longer
/// than that string is borrowed.
pub struct LifetimeBuf<'a> {
39+
/// The raw pointer/length pair.
buf: h5e_buf,
40+
/// Zero-sized marker that makes the struct carry the lifetime `'a`.
marker: ContravariantLifetime<'a>,
41+
}
42+
43+
impl<'a> LifetimeBuf<'a> {
44+
pub fn from_str<T: Str>(x: &'a T) -> LifetimeBuf<'a> {
45+
let x = x.as_slice();
46+
LifetimeBuf {
47+
buf: h5e_buf {
48+
data: x.as_bytes().as_ptr(),
49+
len: x.len() as size_t,
50+
},
51+
marker: ContravariantLifetime,
52+
}
53+
}
54+
55+
pub fn null() -> LifetimeBuf<'a> {
56+
LifetimeBuf {
57+
buf: h5e_buf::null(),
58+
marker: ContravariantLifetime,
59+
}
60+
}
61+
62+
#[inline]
63+
pub fn get(self) -> h5e_buf {
64+
self.buf
65+
}
66+
}
67+
68+
#[no_mangle]
69+
pub unsafe extern "C" fn h5e_buf_from_cstr(s: *const c_char) -> h5e_buf {
70+
h5e_buf {
71+
data: s as *const u8,
72+
len: strlen(s),
73+
}
74+
}
75+
76+
/// Converts a Rust `bool` to a C truth value: `1` for `true`, `0` for `false`.
pub fn c_bool(x: bool) -> c_int {
    if x { 1 } else { 0 }
}

src/for_c/tokenizer.rs

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Copyright 2014 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
#![allow(non_camel_case_types)]
11+
12+
use core::prelude::*;
13+
14+
use for_c::common::{LifetimeBuf, h5e_buf, c_bool};
15+
16+
use tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
17+
use tokenizer::{CommentToken, CharacterTokens, NullCharacterToken};
18+
use tokenizer::{TagToken, StartTag, EndTag, EOFToken, Tokenizer};
19+
20+
use core::mem;
21+
use core::default::Default;
22+
use alloc::boxed::Box;
23+
use collections::String;
24+
use libc::{c_void, c_int, size_t};
25+
26+
/// Callback table with C layout, mirroring `struct h5e_token_ops` in
/// `capi/html5ever.h`. Every callback takes the sink's `user` pointer first.
///
/// NOTE(review): the fields are declared as plain `extern "C" fn` pointers,
/// yet `process_token` tests each one for null before calling it, because C
/// callers may leave entries unset.
#[repr(C)]
27+
pub struct h5e_token_ops {
28+
/// Doctype token; absent name/public/system ids are passed as null buffers.
do_doctype: extern "C" fn(user: *mut c_void, name: h5e_buf,
29+
public: h5e_buf, system: h5e_buf, force_quirks: c_int),
30+
31+
/// Start tag; followed by `num_attrs` calls to `do_tag_attr`.
do_start_tag: extern "C" fn(user: *mut c_void, name: h5e_buf,
32+
self_closing: c_int, num_attrs: size_t),
33+
34+
/// One attribute of the start tag reported just before.
do_tag_attr: extern "C" fn(user: *mut c_void, name: h5e_buf, value: h5e_buf),
35+
/// End tag.
do_end_tag: extern "C" fn(user: *mut c_void, name: h5e_buf),
36+
/// Comment token.
do_comment: extern "C" fn(user: *mut c_void, text: h5e_buf),
37+
/// A run of character data.
do_chars: extern "C" fn(user: *mut c_void, text: h5e_buf),
38+
/// A null-character token.
do_null_char: extern "C" fn(user: *mut c_void),
39+
/// End-of-file token.
do_eof: extern "C" fn(user: *mut c_void),
40+
/// Parse error with its message.
do_error: extern "C" fn(user: *mut c_void, message: h5e_buf),
41+
}
42+
43+
/// Token sink with C layout, mirroring `struct h5e_token_sink` in
/// `capi/html5ever.h`: a callback table plus an opaque user pointer.
#[repr(C)]
44+
pub struct h5e_token_sink {
45+
/// Callback table supplied by the C caller; dereferenced on every token.
ops: *const h5e_token_ops,
46+
/// Passed unchanged as the first argument of every callback.
user: *mut c_void,
47+
}
48+
49+
// Forwards each token produced by the tokenizer to the matching C callback.
impl TokenSink for h5e_token_sink {
50+
// Dispatches `token` to the corresponding entry of `self.ops`, passing
// `self.user` first. String arguments are passed as buffers that point into
// strings owned by `token`, which is dropped when this function returns, so
// a callback must copy anything it wants to keep.
fn process_token(&mut self, token: Token) {
51+
// call!(name, args...) invokes the callback `name` with `self.user`
// followed by `args`, but only if that table entry is not a null pointer.
macro_rules! call ( ($name:ident $(, $arg:expr)*) => (
52+
unsafe {
53+
if !((*self.ops).$name as *const ()).is_null() {
54+
((*(self.ops)).$name)(self.user $(, $arg)*);
55+
}
56+
}
57+
))
58+
59+
// Maps an optional string to a buffer: `None` becomes the null buffer,
// `Some` becomes a buffer borrowing the string's bytes.
fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
60+
match *s {
61+
None => LifetimeBuf::null(),
62+
Some(ref s) => LifetimeBuf::from_str(s),
63+
}
64+
}
65+
66+
match token {
67+
DoctypeToken(Doctype { name, public_id, system_id, force_quirks }) => {
68+
let name = opt_str_to_buf(&name);
69+
let public_id = opt_str_to_buf(&public_id);
70+
let system_id = opt_str_to_buf(&system_id);
71+
call!(do_doctype, name.get(), public_id.get(), system_id.get(),
72+
c_bool(force_quirks));
73+
}
74+
75+
TagToken(Tag { kind, name, self_closing, attrs }) => {
76+
let name = LifetimeBuf::from_str(&name);
77+
match kind {
78+
StartTag => {
79+
// Report the tag and its attribute count first, then one
// do_tag_attr call per attribute.
call!(do_start_tag, name.get(), c_bool(self_closing),
80+
attrs.len() as size_t);
81+
for attr in attrs.move_iter() {
82+
let name = LifetimeBuf::from_str(&attr.name);
83+
let value = LifetimeBuf::from_str(&attr.value);
84+
call!(do_tag_attr, name.get(), value.get());
85+
}
86+
}
87+
// For end tags only the name is reported; `self_closing` and `attrs`
// are not passed on.
EndTag => call!(do_end_tag, name.get()),
88+
}
89+
}
90+
91+
CommentToken(text) => {
92+
let text = LifetimeBuf::from_str(&text);
93+
call!(do_comment, text.get());
94+
}
95+
96+
CharacterTokens(text) => {
97+
let text = LifetimeBuf::from_str(&text);
98+
call!(do_chars, text.get());
99+
}
100+
101+
NullCharacterToken => call!(do_null_char),
102+
103+
EOFToken => call!(do_eof),
104+
105+
ParseError(msg) => {
106+
let msg = LifetimeBuf::from_str(&msg);
107+
call!(do_error, msg.get());
108+
}
109+
}
110+
}
111+
}
112+
113+
/// Opaque tokenizer handle given to C. It is really a transmuted
/// `Box<Tokenizer<h5e_token_sink>>`; the C header declares it as
/// `struct h5e_tokenizer *`.
pub type h5e_tokenizer_ptr = *const ();
114+
115+
#[no_mangle]
116+
pub unsafe extern "C" fn h5e_tokenizer_new(sink: *mut h5e_token_sink) -> h5e_tokenizer_ptr {
117+
let tok: Box<Tokenizer<h5e_token_sink>>
118+
= box Tokenizer::new(mem::transmute::<_, &mut h5e_token_sink>(sink),
119+
Default::default());
120+
121+
mem::transmute(tok)
122+
}
123+
124+
#[no_mangle]
125+
pub unsafe extern "C" fn h5e_tokenizer_free(tok: h5e_tokenizer_ptr) {
126+
let _: Box<Tokenizer<h5e_token_sink>> = mem::transmute(tok);
127+
}
128+
129+
#[no_mangle]
130+
pub unsafe extern "C" fn h5e_tokenizer_feed(tok: h5e_tokenizer_ptr, buf: h5e_buf) {
131+
let tok: &mut Tokenizer<h5e_token_sink> = mem::transmute(tok);
132+
tok.feed(buf.with_slice(|s| String::from_str(s)));
133+
}
134+
135+
#[no_mangle]
136+
pub unsafe extern "C" fn h5e_tokenizer_end(tok: h5e_tokenizer_ptr) {
137+
let tok: &mut Tokenizer<h5e_token_sink> = mem::transmute(tok);
138+
tok.end();
139+
}

src/lib.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ extern crate core;
2626
#[phase(plugin, link)]
2727
extern crate std;
2828

29+
#[cfg(for_c)]
30+
extern crate libc;
31+
2932
#[phase(plugin, link)]
3033
extern crate collections;
3134

@@ -84,6 +87,12 @@ pub mod sink {
8487

8588
pub mod driver;
8689

90+
/// The C API. Compiled only when the crate is built with `--cfg for_c`.
#[cfg(for_c)]
91+
pub mod for_c {
92+
/// Buffer types and helpers shared by the C bindings.
pub mod common;
93+
/// C bindings for the tokenizer.
pub mod tokenizer;
94+
}
95+
8796
/// A fake `std` module so that `deriving` and other macros will work.
8897
/// See rust-lang/rust#16803.
8998
#[cfg(for_c)]

0 commit comments

Comments
 (0)