remove allocation for int parsing

2026-08-02 20:34:49 +02:00 · 2020-12-08 20:13:50 +01:00 · 2020-12-08 20:13:50 +01:00 · f7ae4135d6
commit f7ae4135d6
parent 2227135357
6 changed files with 85 additions and 18 deletions
--- a/src/error.rs
+++ b/src/error.rs
@ -1,4 +1,5 @@
 use crate::lexer::Token;
+use crate::num::ParseIntError;
 use crate::string::UnescapeError;
 use crate::Value;
 use logos::Span;
@ -8,7 +9,7 @@ use source_span::{
 };
 use std::error::Error;
 use std::fmt::{self, Debug, Display};
-use std::num::{ParseFloatError, ParseIntError};
+use std::num::ParseFloatError;
 use std::str::ParseBoolError;
 use thiserror::Error;

--- a/src/lib.rs
+++ b/src/lib.rs
@ -20,6 +20,7 @@
 //!
 mod error;
 mod lexer;
+mod num;
 mod parser;
 mod string;

--- a/src/num.rs
+++ b/src/num.rs
@ -0,0 +1,62 @@
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum ParseIntError {
+    #[error("cannot parse integer from empty string")]
+    Empty,
+    #[error("invalid digit found in string")]
+    InvalidDigit,
+    #[error("number too large or small to fit in target type")]
+    Overflow,
+}
+
+/// Parse an integer literal into an `i64` without allocating.
+///
+/// Mostly copied from std's integer parsing, extended with radix prefixes and
+/// `_` digit separators.
+///
+/// Accepted shape: an optional `+` or `-` sign, then an optional radix prefix,
+/// then the digits. `0x` selects hexadecimal, `0b` binary, a leading `0`
+/// followed by at least one more byte selects octal, anything else is decimal.
+/// Underscores between digits are skipped.
+///
+/// # Errors
+///
+/// * [`ParseIntError::Empty`] if there are no digits after the sign and prefix.
+/// * [`ParseIntError::InvalidDigit`] if a byte is not a digit of the radix.
+/// * [`ParseIntError::Overflow`] if the value does not fit in an `i64`.
+pub fn parse_int(src: &str) -> Result<i64, ParseIntError> {
+    // All valid digits are ASCII, so iterate over the raw UTF-8 bytes and cast
+    // them to chars. `to_digit` returns `None` for anything that is not an
+    // ASCII digit of the radix, including every byte of a multi-byte sequence.
+    let src = src.as_bytes();
+
+    let (negative, digits) = match src {
+        [] => return Err(ParseIntError::Empty),
+        [b'+', tail @ ..] => (false, tail),
+        [b'-', tail @ ..] => (true, tail),
+        _ => (false, src),
+    };
+
+    let (radix, digits): (u32, &[u8]) = match digits {
+        [b'0', b'x', tail @ ..] => (16, tail),
+        [b'0', b'b', tail @ ..] => (2, tail),
+        [b'0', tail @ ..] if !tail.is_empty() => (8, tail),
+        tail => (10, tail),
+    };
+
+    let mut result: i64 = 0;
+    // Tracks whether a real digit was consumed, so that input made up solely
+    // of `_` separators is rejected instead of silently parsing as 0.
+    let mut seen_digit = false;
+
+    for &c in digits {
+        if c == b'_' {
+            continue;
+        }
+        let digit = (c as char)
+            .to_digit(radix)
+            .ok_or(ParseIntError::InvalidDigit)?;
+        let digit = i64::from(digit);
+        let shifted = result
+            .checked_mul(i64::from(radix))
+            .ok_or(ParseIntError::Overflow)?;
+        // Accumulate negative numbers downwards: `i64::MIN` has no positive
+        // counterpart, so building the magnitude first would reject it.
+        let next = if negative {
+            shifted.checked_sub(digit)
+        } else {
+            shifted.checked_add(digit)
+        };
+        result = next.ok_or(ParseIntError::Overflow)?;
+        seen_digit = true;
+    }
+
+    if seen_digit {
+        Ok(result)
+    } else {
+        Err(ParseIntError::Empty)
+    }
+}
--- a/src/parser.rs
+++ b/src/parser.rs
@ -1,11 +1,12 @@
 use crate::error::UnexpectedTokenError;
 use crate::error::{ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError};
 use crate::lexer::Token;
+use crate::num::parse_int;
 use crate::string::{unescape_double, unescape_single, UnescapeError};
 use crate::{Key, Value};
 use logos::{Lexer, Logos};
 use std::collections::HashMap;
-use std::num::{ParseFloatError, ParseIntError};
+use std::num::ParseFloatError;

 /// Parse a php literal
 ///
@ -79,18 +80,6 @@ fn parse_string(literal: &str) -> Result<String, UnescapeError> {
    }
 }

-fn parse_int(literal: &str) -> Result<i64, ParseIntError> {
-    let stripped = literal.replace('_', "");
-    match stripped.as_bytes() {
-        [b'0', b'x', tail @ ..] => i64::from_str_radix(std::str::from_utf8(tail).unwrap(), 16),
-        [b'0', b'b', tail @ ..] => i64::from_str_radix(std::str::from_utf8(tail).unwrap(), 2),
-        [b'0', tail @ ..] if tail.len() > 0 => {
-            i64::from_str_radix(std::str::from_utf8(tail).unwrap(), 8)
-        }
-        tail => i64::from_str_radix(std::str::from_utf8(tail).unwrap(), 10),
-    }
-}
-
 fn parse_float(literal: &str) -> Result<f64, ParseFloatError> {
    let stripped = literal.replace('_', "");
    stripped.parse()
--- a/src/string.rs
+++ b/src/string.rs
@ -1,6 +1,6 @@
 /// unescaping php string literals borrowed mostly from `escape8259`
 use std::char::decode_utf16;
-use std::iter::Peekable;
+use std::iter::{once, Peekable};

 #[derive(Debug, Clone, Eq, PartialEq)]

@ -41,10 +41,9 @@ impl UnescapeState {
        match (self.stash, surrogate) {
            (0, false) => {
                // The std library only provides utf16 decode of an iterator,
-                // so to decode a single character we wrap it in an array.
+                // so to decode a single character we wrap it in a `once`.
                // Hopefully the compiler will elide most of this extra work.
-                let words = [x];
-                match decode_utf16(words.iter().copied()).next() {
+                match decode_utf16(once(x)).next() {
                    Some(Ok(c)) => {
                        self.out.push(c);
                    }