Casting Vec<u8> to str problem


#1

I am trying to write a simple URL parser module, but I am having trouble casting a Vec<u8> to a str. Please help.

use std::mem;


// Bytes that `should_escape` reports as "must escape" in host and zone mode.
const HOST_NOT_ALLOWS: [u8; 17] = [
    b'!', b'"', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b':', b';', b'<', b'=', b'>',
    b'[', b']',
];
// Reserved characters: whether one of these is escaped depends on the encode mode.
const RESERVED_CHARACTERS: [u8; 10] = [
    b'$', b'&', b'+', b',', b'/', b':', b';', b'=', b'?', b'@',
];
// Unreserved punctuation: never escaped outside host/zone mode.
const UNRESERVED_CHARACTERS: [u8; 4] = [b'-', b'.', b'_', b'~'];

// Reserved characters that are escaped in user/password mode.
const NOT_ALLOWS_PASSWORD: [u8; 4] = [b'/', b':', b'?', b'@'];

/// Selects which part of a URL is being escaped; `should_escape` uses it to decide
/// which bytes get percent-encoded.
///
/// `Debug`, `Clone`, `Copy` and `Eq` are derived alongside `PartialEq` so a mode can be
/// printed, passed by value and compared with `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EncodeMode {
    /// Of the reserved characters, only `?` is escaped.
    Path,
    /// Only the bytes listed in `HOST_NOT_ALLOWS` are escaped.
    Host,
    /// Handled exactly like `Host` by `should_escape`.
    Zone,
    /// Of the reserved characters, those in `NOT_ALLOWS_PASSWORD` are escaped.
    UserPassword,
    /// Every reserved character is escaped; `escape` writes a space as `+`.
    QueryComponent,
    /// No reserved character is escaped.
    Fragment,
}


/// Reinterprets the byte slice `v` as a `&str` without copying and without checking
/// that the bytes are valid UTF-8.
///
/// NOTE(review): this function is unsound. The signature promises a `'static` result
/// for an input borrowed for an arbitrary lifetime, so the returned `&str` may outlive
/// the buffer it points into. `escape` passes a local `Vec<u8>` here, which leaves the
/// result dangling once that vector is dropped.
fn fast_str(v: &[u8]) -> &'static str {
    // `transmute` only re-types the reference; it neither validates the bytes nor ties
    // the output lifetime to `v`, which is how the borrow checker is bypassed.
    let x: &str = unsafe { mem::transmute(v) };
    x
}



/// Decides whether byte `c` has to be percent-encoded when escaping in `mode`.
///
/// ASCII letters and digits are never escaped. In host and zone mode the answer is
/// simply membership in `HOST_NOT_ALLOWS`. In every other mode, unreserved punctuation
/// is kept, reserved characters are escaped according to the mode, and any remaining
/// byte is escaped.
fn should_escape(c: u8, mode: &EncodeMode) -> bool {
    if is_alpha_numeric(c) {
        return false;
    }

    // Host and zone consult nothing but the host table.
    if *mode == EncodeMode::Host || *mode == EncodeMode::Zone {
        return HOST_NOT_ALLOWS.contains(&c);
    }

    if UNRESERVED_CHARACTERS.contains(&c) {
        return false;
    }

    // Anything that is neither alphanumeric, unreserved nor reserved is always escaped.
    if !RESERVED_CHARACTERS.contains(&c) {
        return true;
    }

    // Reserved characters: the verdict depends on which URL part is being encoded.
    match *mode {
        EncodeMode::Path => c == b'?',
        EncodeMode::UserPassword => NOT_ALLOWS_PASSWORD.contains(&c),
        EncodeMode::QueryComponent => true,
        _ => false,
    }
}

/// Percent-encodes `url` for the URL part selected by `mode`.
///
/// Works in two passes: the first counts how many bytes need escaping so the output
/// buffer can be sized exactly, the second fills that buffer. When nothing needs
/// escaping the input is returned unchanged.
///
/// NOTE(review): on the escaping path the returned `&str` points into the local
/// vector `t`, which is dropped when this function returns. The result is therefore a
/// dangling reference and reading it is undefined behaviour.
fn escape(url: &'static str, mode: &EncodeMode) -> &'static str {
    let mut space_count = 0usize;
    let mut hex_count = 0usize;

    let url_bytes = url.as_bytes();
    // Pass 1: count spaces that become `+` and bytes that become `%XX`.
    for b in url_bytes {
        if should_escape(*b, mode) {
            // 32 is the ASCII space.
            if *b == 32 && mode == &EncodeMode::QueryComponent {
                space_count += 1;
            } else {
                hex_count += 1;
            }
        }
    }
    // Nothing to rewrite: hand back the original string.
    if space_count == 0 && hex_count == 0 {
        return url;
    }
    // Each `%XX` sequence is two bytes longer than the byte it replaces; a space
    // turned into `+` keeps its length.
    let l = url_bytes.len() + 2 * hex_count;

    let mut t: Vec<u8> = vec![0;l as usize];
    // `j` is the write position in `t`.
    let mut j = 0;
    // Lookup table mapping a nibble to its upper-case hex digit.
    let s = "0123456789ABCDEF".as_bytes();

    // Pass 2: write the escaped bytes.
    for v in url_bytes {
        if *v == 32 && mode == &EncodeMode::QueryComponent {
            // 43 is `+`.
            t[j] = 43;
            j += 1;
        }else if should_escape(*v, mode) {

            // 37 is `%`, followed by the high and low nibble as hex digits.
            t[j] = 37;
            t[j + 1] = s[(v >> 4) as usize];
            t[j + 2] = s[(v & 15) as usize];
            j += 3;
        } else {
            t[j] = *v;
            j += 1;
        }
    }

    // NOTE(review): `fast_str` fabricates a `'static` lifetime for a borrow of `t`;
    // `t` is freed at the closing brace below, so `result` dangles in the caller.
    let result: &str = fast_str(&t);
    result
}


/// Returns `true` when `cc` is an ASCII letter (`a`-`z`, `A`-`Z`) or an ASCII digit
/// (`0`-`9`), and `false` for every other byte.
///
/// Delegates to the standard library instead of matching on `...` range patterns,
/// which are deprecated and rejected by newer Rust editions.
fn is_alpha_numeric(cc: u8) -> bool {
    cc.is_ascii_alphanumeric()
}


/// Escapes a sample path containing a `?` and non-ASCII text in `Path` mode and prints
/// the result with `{:?}` formatting.
///
/// NOTE(review): this input needs escaping, so `escape` returns a reference into a
/// buffer it has already dropped; whatever gets printed here is not reliable.
fn main() {
    println!("{:?}",
             escape("/test?id=32&value=我们", &EncodeMode::Path));
}
 

Output:

"\u{1}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}"

But if I add println!("{:?}",t); before the call to fast_str, like this:

 use std::mem;


// Bytes that `should_escape` reports as "must escape" in host and zone mode.
const HOST_NOT_ALLOWS: [u8; 17] = [
    b'!', b'"', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b':', b';', b'<', b'=', b'>',
    b'[', b']',
];
// Reserved characters: whether one of these is escaped depends on the encode mode.
const RESERVED_CHARACTERS: [u8; 10] = [
    b'$', b'&', b'+', b',', b'/', b':', b';', b'=', b'?', b'@',
];
// Unreserved punctuation: never escaped outside host/zone mode.
const UNRESERVED_CHARACTERS: [u8; 4] = [b'-', b'.', b'_', b'~'];

// Reserved characters that are escaped in user/password mode.
const NOT_ALLOWS_PASSWORD: [u8; 4] = [b'/', b':', b'?', b'@'];

/// Selects which part of a URL is being escaped; `should_escape` uses it to decide
/// which bytes get percent-encoded.
///
/// `Debug`, `Clone`, `Copy` and `Eq` are derived alongside `PartialEq` so a mode can be
/// printed, passed by value and compared with `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EncodeMode {
    /// Of the reserved characters, only `?` is escaped.
    Path,
    /// Only the bytes listed in `HOST_NOT_ALLOWS` are escaped.
    Host,
    /// Handled exactly like `Host` by `should_escape`.
    Zone,
    /// Of the reserved characters, those in `NOT_ALLOWS_PASSWORD` are escaped.
    UserPassword,
    /// Every reserved character is escaped; `escape` writes a space as `+`.
    QueryComponent,
    /// No reserved character is escaped.
    Fragment,
}


/// Reinterprets the byte slice `v` as a `&str` without copying and without checking
/// that the bytes are valid UTF-8.
///
/// NOTE(review): this function is unsound. The signature promises a `'static` result
/// for an input borrowed for an arbitrary lifetime, so the returned `&str` may outlive
/// the buffer it points into. `escape` passes a local `Vec<u8>` here, which leaves the
/// result dangling once that vector is dropped.
fn fast_str(v: &[u8]) -> &'static str {
    // `transmute` only re-types the reference; it neither validates the bytes nor ties
    // the output lifetime to `v`, which is how the borrow checker is bypassed.
    let x: &str = unsafe { mem::transmute(v) };
    x
}


/// Decides whether byte `c` has to be percent-encoded when escaping in `mode`.
///
/// ASCII letters and digits are never escaped. In host and zone mode the answer is
/// simply membership in `HOST_NOT_ALLOWS`. In every other mode, unreserved punctuation
/// is kept, reserved characters are escaped according to the mode, and any remaining
/// byte is escaped.
fn should_escape(c: u8, mode: &EncodeMode) -> bool {
    if is_alpha_numeric(c) {
        return false;
    }

    // Host and zone consult nothing but the host table.
    if *mode == EncodeMode::Host || *mode == EncodeMode::Zone {
        return HOST_NOT_ALLOWS.contains(&c);
    }

    if UNRESERVED_CHARACTERS.contains(&c) {
        return false;
    }

    // Anything that is neither alphanumeric, unreserved nor reserved is always escaped.
    if !RESERVED_CHARACTERS.contains(&c) {
        return true;
    }

    // Reserved characters: the verdict depends on which URL part is being encoded.
    match *mode {
        EncodeMode::Path => c == b'?',
        EncodeMode::UserPassword => NOT_ALLOWS_PASSWORD.contains(&c),
        EncodeMode::QueryComponent => true,
        _ => false,
    }
}

/// Percent-encodes `url` for the URL part selected by `mode`.
///
/// Works in two passes: the first counts how many bytes need escaping so the output
/// buffer can be sized exactly, the second fills that buffer. When nothing needs
/// escaping the input is returned unchanged.
///
/// NOTE(review): on the escaping path the returned `&str` points into the local
/// vector `t`, which is dropped when this function returns. The result is therefore a
/// dangling reference and reading it is undefined behaviour, even if a particular
/// build happens to print the expected text.
fn escape(url: &'static str, mode: &EncodeMode) -> &'static str {
    let mut space_count = 0usize;
    let mut hex_count = 0usize;

    let url_bytes = url.as_bytes();
    // Pass 1: count spaces that become `+` and bytes that become `%XX`.
    for b in url_bytes {
        if should_escape(*b, mode) {
            // 32 is the ASCII space.
            if *b == 32 && mode == &EncodeMode::QueryComponent {
                space_count += 1;
            } else {
                hex_count += 1;
            }
        }
    }
    // Nothing to rewrite: hand back the original string.
    if space_count == 0 && hex_count == 0 {
        return url;
    }
    // Each `%XX` sequence is two bytes longer than the byte it replaces; a space
    // turned into `+` keeps its length.
    let l = url_bytes.len() + 2 * hex_count;

    let mut t: Vec<u8> = vec![0;l as usize];
    // `j` is the write position in `t`.
    let mut j = 0;
    // Lookup table mapping a nibble to its upper-case hex digit.
    let s = "0123456789ABCDEF".as_bytes();

    // Pass 2: write the escaped bytes.
    for v in url_bytes {
        if *v == 32 && mode == &EncodeMode::QueryComponent {
            // 43 is `+`.
            t[j] = 43;
            j += 1;
        }else if should_escape(*v, mode) {

            // 37 is `%`, followed by the high and low nibble as hex digits.
            t[j] = 37;
            t[j + 1] = s[(v >> 4) as usize];
            t[j + 2] = s[(v & 15) as usize];
            j += 3;
        } else {
            t[j] = *v;
            j += 1;
        }
    }
    // Debug print of the encoded bytes while `t` is still alive.
    println!("{:?}",t);
    // NOTE(review): `fast_str` fabricates a `'static` lifetime for a borrow of `t`;
    // `t` is freed at the closing brace below, so `result` dangles in the caller.
    let result: &str = fast_str(&t);
    result
}


/// Returns `true` when `cc` is an ASCII letter (`a`-`z`, `A`-`Z`) or an ASCII digit
/// (`0`-`9`), and `false` for every other byte.
///
/// Delegates to the standard library instead of matching on `...` range patterns,
/// which are deprecated and rejected by newer Rust editions.
fn is_alpha_numeric(cc: u8) -> bool {
    cc.is_ascii_alphanumeric()
}


/// Escapes a sample path containing a `?` and non-ASCII text in `Path` mode and prints
/// the result with `{:?}` formatting.
///
/// NOTE(review): this input needs escaping, so `escape` returns a reference into a
/// buffer it has already dropped; whatever gets printed here is not reliable.
fn main() {
    println!("{:?}",
             escape("/test?id=32&value=我们", &EncodeMode::Path));
}

Output

[47, 116, 101, 115, 116, 37, 51, 70, 105, 100, 61, 51, 50, 38, 118, 97, 108, 117, 101, 61, 37, 69, 54, 37, 56, 56, 37, 57, 49, 37, 69, 52, 37, 66, 66, 37, 65, 67]
"/test%3Fid=32&value=%E6%88%91%E4%BB%AC"

Now I get exactly what I wanted. Why would this be?


#2

Your fast_str function takes a buffer with an arbitrary lifetime and produces a str with a static lifetime. The buffer being passed into it does not have a static lifetime, so this is undefined behavior. You are now in nasal demon territory. The compiler can do literally whatever it wants at that point. The insertion of the println apparently makes the stars align so that everything happens to “work”.

I would highly recommend avoiding unsafe code and particularly transmute. Assuming that you have carefully profiled your code and determined that the UTF8 check is significantly impacting performance, str::from_utf8_unchecked is a better method to use as it will properly associate the input and output lifetimes.


#3

:relaxed:thanks