Skip to content

Add support for NFC and NFKC #15986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,26 @@ def emit_charwidth_module(f, width_table):
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n\n")

def emit_norm_module(f, canon, compat, combine):
def emit_norm_module(f, canon, compat, combine, norm_props):
canon_keys = canon.keys()
canon_keys.sort()

compat_keys = compat.keys()
compat_keys.sort()

canon_comp = {}
comp_exclusions = norm_props["Full_Composition_Exclusion"]
for char in canon_keys:
if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions):
continue
decomp = canon[char]
if len(decomp) == 2:
if not canon_comp.has_key(decomp[0]):
canon_comp[decomp[0]] = []
canon_comp[decomp[0]].append( (decomp[1], char) )
canon_comp_keys = canon_comp.keys()
canon_comp_keys.sort()

f.write("pub mod normalization {\n")

def mkdata_fun(table):
Expand All @@ -494,6 +507,22 @@ def f(char):
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(compat))

def comp_pfun(char):
data = "(%s,&[" % escape_char(char)
canon_comp[char].sort(lambda x, y: x[0] - y[0])
first = True
for pair in canon_comp[char]:
if not first:
data += ","
first = False
data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
data += "])"
return data

f.write(" // Canonical compositions\n")
emit_table(f, "composition_table", canon_comp_keys,
"&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)

f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
use core::option::{Some, None};
Expand Down Expand Up @@ -579,6 +608,8 @@ def optimize_width_table(wtable):
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
norm_props = load_properties("DerivedNormalizationProps.txt",
["Full_Composition_Exclusion"])

# grapheme cluster category from DerivedCoreProperties
# the rest are defined below
Expand Down Expand Up @@ -612,7 +643,7 @@ def optimize_width_table(wtable):
emit_regex_module(rf, allcats, perl_words)

# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, lowerupper, upperlower)

### character width module
Expand Down
228 changes: 198 additions & 30 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ use core::cmp;
use core::iter::AdditiveIterator;
use core::mem;

use {Collection, MutableSeq};
use {Collection, Deque, MutableSeq};
use hash;
use ringbuf::RingBuf;
use string::String;
use unicode;
use vec::Vec;
Expand Down Expand Up @@ -302,6 +303,106 @@ impl<'a> Iterator<char> for Decompositions<'a> {
}
}

// Phase of the state machine driven by `Recompositions::next`.
#[deriving(Clone)]
enum RecompositionState {
// Pulling characters from the decomposition iterator and trying to
// combine each one with the pending `composee`.
Composing,
// Draining the buffered characters front-to-back; once the buffer is
// empty the iterator goes back to `Composing`.
Purging,
// The decomposition iterator is exhausted; whatever is left in the
// buffer is flushed and then iteration ends.
Finished
}

/// External iterator over the characters of a string's recomposition,
/// i.e. a decomposition whose characters have been canonically composed
/// again. Returned by `nfc_chars` and `nfkc_chars`.
/// Use with the `std::iter` module.
#[deriving(Clone)]
pub struct Recompositions<'a> {
// Source of decomposed characters (`nfd_chars()` or `nfkd_chars()`).
iter: Decompositions<'a>,
// Current phase of the state machine in `next`.
state: RecompositionState,
// Characters that could not be combined with `composee`; they are
// emitted in FIFO order after the composee they follow.
buffer: RingBuf<char>,
// Pending character with combining class 0 that later characters are
// composed into; `None` until such a character has been read, and again
// once it has been handed out at the end of input.
composee: Option<char>,
// Combining class of the most recently buffered character, or `None`
// when nothing has been buffered since the current `composee` was set.
last_ccc: Option<u8>
}

// Yields the recomposed characters one at a time. Each call loops over the
// `Composing` / `Purging` / `Finished` state machine until a character is
// ready to be returned, or returns `None` once everything has been emitted.
impl<'a> Iterator<char> for Recompositions<'a> {
#[inline]
fn next(&mut self) -> Option<char> {
loop {
match self.state {
Composing => {
for ch in self.iter {
let ch_class = unicode::char::canonical_combining_class(ch);
// No pending composee yet: a character with a non-zero combining class
// has nothing to combine with and is emitted unchanged; a class-0
// character becomes the composee.
if self.composee.is_none() {
if ch_class != 0 {
return Some(ch);
}
self.composee = Some(ch);
continue;
}
let k = self.composee.clone().unwrap();

match self.last_ccc {
// Nothing buffered since the composee was set, so `ch` directly
// follows it and composition can be attempted straight away.
None => {
match unicode::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
// Not composable. A class-0 `ch` replaces the composee and the old
// one is emitted; the buffer is empty here (last_ccc is `None`),
// so no purge is needed.
if ch_class == 0 {
self.composee = Some(ch);
return Some(k);
}
// Otherwise keep `ch` for later output and remember its class.
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
}
}
// At least one character is buffered between the composee and `ch`.
Some(l_class) => {
if l_class >= ch_class {
// `ch` is blocked from `composee`
if ch_class == 0 {
// `ch` starts a new composee: emit the old one now and switch to
// `Purging` so the buffered characters come out before `ch` is
// composed with anything.
self.composee = Some(ch);
self.last_ccc = None;
self.state = Purging;
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
continue;
}
// Not blocked: try to fold `ch` into the composee. On success
// `last_ccc` is left alone because the buffer contents are unchanged.
match unicode::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
}
}
}
}
// Input exhausted: emit the pending composee first; the buffered
// characters follow from the `Finished` arm on later iterations/calls.
self.state = Finished;
if self.composee.is_some() {
return self.composee.take();
}
}
Purging => {
// Hand out buffered characters in order; resume composing once empty.
match self.buffer.pop_front() {
None => self.state = Composing,
s => return s
}
}
Finished => {
// Flush the remaining buffer. The composee has already been taken when
// entering this state, so `take()` here yields `None` and ends iteration.
match self.buffer.pop_front() {
None => return self.composee.take(),
s => return s
}
}
}
}
}
}

/// Replace all occurrences of one string with another
///
/// # Arguments
Expand Down Expand Up @@ -744,6 +845,32 @@ pub trait StrAllocating: Str {
kind: Compatible
}
}

/// An Iterator over the string in Unicode Normalization Form C
/// (canonical decomposition followed by canonical composition).
///
/// The returned `Recompositions` wraps `self.nfd_chars()` and starts in the
/// `Composing` state with an empty buffer and no pending composee.
#[inline]
fn nfc_chars<'a>(&'a self) -> Recompositions<'a> {
Recompositions {
iter: self.nfd_chars(),
state: Composing,
buffer: RingBuf::new(),
composee: None,
last_ccc: None
}
}

/// An Iterator over the string in Unicode Normalization Form KC
/// (compatibility decomposition followed by canonical composition).
///
/// The returned `Recompositions` wraps `self.nfkd_chars()` and starts in the
/// `Composing` state with an empty buffer and no pending composee.
#[inline]
fn nfkc_chars<'a>(&'a self) -> Recompositions<'a> {
Recompositions {
iter: self.nfkd_chars(),
state: Composing,
buffer: RingBuf::new(),
composee: None,
last_ccc: None
}
}
}

impl<'a> StrAllocating for &'a str {
Expand Down Expand Up @@ -1754,39 +1881,80 @@ mod tests {

#[test]
fn test_nfd_chars() {
assert_eq!("abc".nfd_chars().collect::<String>(), String::from_str("abc"));
assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<String>(),
String::from_str("d\u0307\u01c4"));
assert_eq!("\u2026".nfd_chars().collect::<String>(), String::from_str("\u2026"));
assert_eq!("\u2126".nfd_chars().collect::<String>(), String::from_str("\u03a9"));
assert_eq!("\u1e0b\u0323".nfd_chars().collect::<String>(),
String::from_str("d\u0323\u0307"));
assert_eq!("\u1e0d\u0307".nfd_chars().collect::<String>(),
String::from_str("d\u0323\u0307"));
assert_eq!("a\u0301".nfd_chars().collect::<String>(), String::from_str("a\u0301"));
assert_eq!("\u0301a".nfd_chars().collect::<String>(), String::from_str("\u0301a"));
assert_eq!("\ud4db".nfd_chars().collect::<String>(),
String::from_str("\u1111\u1171\u11b6"));
assert_eq!("\uac1c".nfd_chars().collect::<String>(), String::from_str("\u1100\u1162"));
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfd_chars().collect::<String>(), $expected.into_string());
}
}
t!("abc", "abc");
t!("\u1e0b\u01c4", "d\u0307\u01c4");
t!("\u2026", "\u2026");
t!("\u2126", "\u03a9");
t!("\u1e0b\u0323", "d\u0323\u0307");
t!("\u1e0d\u0307", "d\u0323\u0307");
t!("a\u0301", "a\u0301");
t!("\u0301a", "\u0301a");
t!("\ud4db", "\u1111\u1171\u11b6");
t!("\uac1c", "\u1100\u1162");
}

#[test]
fn test_nfkd_chars() {
assert_eq!("abc".nfkd_chars().collect::<String>(), String::from_str("abc"));
assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<String>(),
String::from_str("d\u0307DZ\u030c"));
assert_eq!("\u2026".nfkd_chars().collect::<String>(), String::from_str("..."));
assert_eq!("\u2126".nfkd_chars().collect::<String>(), String::from_str("\u03a9"));
assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<String>(),
String::from_str("d\u0323\u0307"));
assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<String>(),
String::from_str("d\u0323\u0307"));
assert_eq!("a\u0301".nfkd_chars().collect::<String>(), String::from_str("a\u0301"));
assert_eq!("\u0301a".nfkd_chars().collect::<String>(),
String::from_str("\u0301a"));
assert_eq!("\ud4db".nfkd_chars().collect::<String>(),
String::from_str("\u1111\u1171\u11b6"));
assert_eq!("\uac1c".nfkd_chars().collect::<String>(), String::from_str("\u1100\u1162"));
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkd_chars().collect::<String>(), $expected.into_string());
}
}
t!("abc", "abc");
t!("\u1e0b\u01c4", "d\u0307DZ\u030c");
t!("\u2026", "...");
t!("\u2126", "\u03a9");
t!("\u1e0b\u0323", "d\u0323\u0307");
t!("\u1e0d\u0307", "d\u0323\u0307");
t!("a\u0301", "a\u0301");
t!("\u0301a", "\u0301a");
t!("\ud4db", "\u1111\u1171\u11b6");
t!("\uac1c", "\u1100\u1162");
}

// Checks that collecting `nfc_chars()` produces the expected string for
// each input.
#[test]
fn test_nfc_chars() {
// `t!(input, expected)` asserts that `input.nfc_chars()` collected into a
// `String` equals `expected`.
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfc_chars().collect::<String>(), $expected.into_string());
}
}
t!("abc", "abc");
t!("\u1e0b\u01c4", "\u1e0b\u01c4");
t!("\u2026", "\u2026");
// The output differs from the input here: the character is decomposed
// and not composed back.
t!("\u2126", "\u03a9");
// Both spellings normalize to the same composed form.
t!("\u1e0b\u0323", "\u1e0d\u0307");
t!("\u1e0d\u0307", "\u1e0d\u0307");
// Base letter followed by a combining mark composes into one character.
t!("a\u0301", "\xe1");
// A combining mark with no preceding base is passed through unchanged.
t!("\u0301a", "\u0301a");
// Hangul syllables survive the decompose/compose round trip.
t!("\ud4db", "\ud4db");
t!("\uac1c", "\uac1c");
// Several marks after one base: the first composes with `a`, the rest are
// emitted afterwards, followed by the next base letter.
t!("a\u0300\u0305\u0315\u05aeb", "\xe0\u05ae\u0305\u0315b");
}

// Checks that collecting `nfkc_chars()` produces the expected string for
// each input.
#[test]
fn test_nfkc_chars() {
// `t!(input, expected)` asserts that `input.nfkc_chars()` collected into a
// `String` equals `expected`.
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkc_chars().collect::<String>(), $expected.into_string());
}
}
t!("abc", "abc");
// Unlike the NFC test, the second character is expanded into two
// characters here.
t!("\u1e0b\u01c4", "\u1e0bD\u017d");
// Unlike the NFC test, the ellipsis character becomes three full stops.
t!("\u2026", "...");
t!("\u2126", "\u03a9");
// Both spellings normalize to the same composed form.
t!("\u1e0b\u0323", "\u1e0d\u0307");
t!("\u1e0d\u0307", "\u1e0d\u0307");
// Base letter followed by a combining mark composes into one character.
t!("a\u0301", "\xe1");
// A combining mark with no preceding base is passed through unchanged.
t!("\u0301a", "\u0301a");
// Hangul syllables survive the decompose/compose round trip.
t!("\ud4db", "\ud4db");
t!("\uac1c", "\uac1c");
// Several marks after one base: the first composes with `a`, the rest are
// emitted afterwards, followed by the next base letter.
t!("a\u0300\u0305\u0315\u05aeb", "\xe0\u05ae\u0305\u0315b");
}

#[test]
Expand Down
4 changes: 2 additions & 2 deletions src/libunicode/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ extern crate core;
// regex module
pub use tables::regex;

mod decompose;
mod normalize;
mod tables;
mod u_char;
mod u_str;
Expand All @@ -61,7 +61,7 @@ pub mod char {
pub use core::char::{from_digit, escape_unicode, escape_default};
pub use core::char::{len_utf8_bytes, Char};

pub use decompose::{decompose_canonical, decompose_compatible};
pub use normalize::{decompose_canonical, decompose_compatible, compose};

pub use tables::normalization::canonical_combining_class;

Expand Down
Loading