-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from danielparks/bytes
Add support for operating on byte strings
- Loading branch information
Showing
4 changed files
with
308 additions
and
127 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
// Copyright 2015 Nicholas Allegra (comex). | ||
// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or | ||
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be | ||
// copied, modified, or distributed except according to those terms. | ||
|
||
//! [`Shlex`] and friends for byte strings.
//!
//! This is used internally by the [outer module](crate), and may be more
//! convenient if you are working with byte slices (`[u8]`) or types that are
//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
//!
//! ```rust
//! #[cfg(unix)] {
//! use shlex::bytes::quote;
//! use std::ffi::OsStr;
//! use std::os::unix::ffi::OsStrExt;
//!
//! // `\x80` is invalid in UTF-8.
//! let os_str = OsStr::from_bytes(b"a\x80b c");
//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]);
//! }
//! ```
//!
//! (On Windows, `OsStr` uses 16-bit wide characters, so this will not work.)
|
||
extern crate alloc; | ||
use alloc::vec::Vec; | ||
use alloc::borrow::Cow; | ||
#[cfg(test)] | ||
use alloc::vec; | ||
#[cfg(test)] | ||
use alloc::borrow::ToOwned; | ||
|
||
/// An iterator that takes an input byte string and splits it into the words using the same syntax as
/// the POSIX shell.
pub struct Shlex<'a> {
    /// The input bytes that have not been consumed yet.
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
    /// true; best to check it after you're done iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a splitter over `in_bytes`, starting at line 1 with no error recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex {
            in_iter: in_bytes.iter(),
            line_no: 1,
            had_error: false,
        }
    }

    /// Parses one word whose first byte, `ch`, has already been consumed.
    ///
    /// Reading stops at unquoted whitespace (which is consumed) or at the end of the input.
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or directly
    /// after a backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
        let mut result: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => {
                    if self.parse_double(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\'' => {
                    if self.parse_single(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\\' => match self.next_char() {
                    // \<newline> => nothing (line continuation)
                    Some(b'\n') => {}
                    // \x => x
                    Some(escaped) => result.push(escaped),
                    None => {
                        self.had_error = true;
                        return None;
                    }
                },
                b' ' | b'\t' | b'\n' => break,
                _ => result.push(ch),
            }
            match self.next_char() {
                Some(next) => ch = next,
                None => break,
            }
        }
        Some(result)
    }

    /// Appends the contents of a double-quoted section to `result`; the opening quote has
    /// already been consumed. Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'\\' => match self.next_char().ok_or(())? {
                    // \<newline> => nothing
                    b'\n' => {}
                    escaped => {
                        match escaped {
                            // \$ => $
                            b'$' | b'`' | b'"' | b'\\' => {}
                            // \x => \x (the backslash is kept literally)
                            _ => result.push(b'\\'),
                        }
                        result.push(escaped);
                    }
                },
                b'"' => return Ok(()),
                other => result.push(other),
            }
        }
    }

    /// Appends the contents of a single-quoted section to `result`; the opening quote has
    /// already been consumed. Every byte up to the closing quote is copied verbatim. Returns
    /// `Err(())` if the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char() {
                Some(b'\'') => return Ok(()),
                Some(other) => result.push(other),
                None => return Err(()),
            }
        }
    }

    /// Consumes and returns the next input byte, bumping `line_no` whenever it is a newline.
    fn next_char(&mut self) -> Option<u8> {
        let res = self.in_iter.next().copied();
        if res == Some(b'\n') {
            self.line_no += 1;
        }
        res
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Returns the next word, or `None` once the input is exhausted or an error was hit
    /// (check `had_error` to tell the two apart).
    fn next(&mut self) -> Option<Self::Item> {
        let mut ch = self.next_char()?;
        // Skip leading whitespace and `#` comments; a comment runs through the next newline.
        loop {
            match ch {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
            ch = self.next_char()?;
        }
        self.parse_word(ch)
    }
}
|
||
/// Convenience function that consumes the whole byte string at once. Returns None if the input was | ||
/// erroneous. | ||
pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> { | ||
let mut shl = Shlex::new(in_bytes); | ||
let res = shl.by_ref().collect(); | ||
if shl.had_error { None } else { Some(res) } | ||
} | ||
|
||
/// Given a single word, return a byte string suitable to encode it as a shell argument.
///
/// The input is returned unchanged (borrowed) when it contains no byte that is special to the
/// shell. Otherwise it is wrapped in double quotes, with `$`, `` ` ``, `"` and `\` escaped by a
/// backslash. An empty input becomes `""`.
///
/// `{` and `}` count as special so that words such as `{a,b}` are not subject to brace
/// expansion in shells that implement it.
///
/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
/// ever inserts valid ASCII characters before or after existing ASCII characters (or
/// returns two double quotes if the input was an empty string). It will never modify a
/// multibyte UTF-8 character.
pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
    if in_bytes.is_empty() {
        return Cow::Borrowed(&b"\"\""[..]);
    }

    let needs_quoting = in_bytes.iter().any(|&c| match c {
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
        | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%'
        | b'{' | b'}' => true,
        _ => false,
    });
    if !needs_quoting {
        return Cow::Borrowed(in_bytes);
    }

    // Two surrounding quotes plus the input; escapes may grow the buffer further.
    let mut out: Vec<u8> = Vec::with_capacity(in_bytes.len() + 2);
    out.push(b'"');
    for &c in in_bytes {
        // Only these four bytes keep a special meaning inside double quotes.
        match c {
            b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'),
            _ => (),
        }
        out.push(c);
    }
    out.push(b'"');
    Cow::Owned(out)
}
|
||
/// Convenience function that consumes an iterable of words and turns it into a single byte string, | ||
/// quoting words when necessary. Consecutive words will be separated by a single space. | ||
pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> { | ||
words.into_iter() | ||
.map(quote) | ||
.collect::<Vec<_>>() | ||
.join(&b' ') | ||
} | ||
|
||
/// A byte string that is not valid UTF-8, used to check that non-UTF-8 input passes through.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";

/// Guards the fixture itself: the other tests rely on it being invalid UTF-8.
#[test]
fn test_invalid_utf8() {
    // Check that our test string is actually invalid UTF-8.
    assert!(core::str::from_utf8(INVALID_UTF8).is_err());
}
|
||
/// Table of `(input, expected)` pairs for `test_split`; `None` marks input that must be
/// rejected as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b"   foo \nbar", Some(&[b"foo", b"bar"])),
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];
|
||
/// Runs `split` over every entry of `SPLIT_TEST_ITEMS` and compares against the expectation.
#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        let expected: Option<Vec<Vec<u8>>> =
            output.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        // Include the input so that a failing table entry can be identified.
        assert_eq!(split(input), expected, "input: {:?}", input);
    }
}
|
||
/// Checks that `line_no` reflects the newlines consumed by the time `bar` is yielded.
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    let mut saw_bar = false;
    while let Some(word) = sh.next() {
        if word == b"bar" {
            assert_eq!(sh.line_no, 3);
            saw_bar = true;
        }
    }
    // Without this the test would pass vacuously if `bar` were never produced.
    assert!(saw_bar, "expected the word `bar` to be yielded");
}
|
||
/// Covers `quote` on plain, space-containing, quote-containing, empty and non-UTF-8 input.
#[test]
fn test_quote() {
    assert_eq!(quote(b"foobar"), &b"foobar"[..]);
    assert_eq!(quote(b"foo bar"), &b"\"foo bar\""[..]);
    assert_eq!(quote(b"\""), &b"\"\\\"\""[..]);
    assert_eq!(quote(b""), &b"\"\""[..]);
    assert_eq!(quote(INVALID_UTF8), INVALID_UTF8);
}
|
||
/// Covers `join` on no words, an empty word, plain words, a word needing quotes and
/// non-UTF-8 input.
#[test]
fn test_join() {
    assert_eq!(join(vec![]), &b""[..]);
    assert_eq!(join(vec![&b""[..]]), &b"\"\""[..]);
    assert_eq!(join(vec![&b"a"[..], &b"b"[..]]), &b"a b"[..]);
    assert_eq!(join(vec![&b"foo bar"[..], &b"baz"[..]]), &b"\"foo bar\" baz"[..]);
    assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8);
}
Oops, something went wrong.