buffer string read improvements

2026-08-02 20:24:49 +02:00 · 2025-07-14 20:06:45 +02:00 · 2025-07-14 20:06:45 +02:00 · 191a5a7cb9
commit 191a5a7cb9
parent 8a96a5dc41
1 changed files with 167 additions and 160 deletions
--- a/src/readbuffer.rs
+++ b/src/readbuffer.rs
@ -154,6 +154,25 @@ where
            slice,
        }
    }
+
+    /// Check that `bits_requested` bits can be read starting at bit `position`.
+    ///
+    /// # Errors
+    ///
+    /// - `BitError::IndexOutOfBounds` when `position` itself lies past the end of the buffer
+    /// - `BitError::NotEnoughData` when `position` is inside the buffer but fewer than
+    ///   `bits_requested` bits are left after it
+    #[inline(always)]
+    fn bounds_check(&self, position: usize, bits_requested: usize) -> Result<()> {
+        let bit_len = self.bit_len();
+
+        if position > bit_len {
+            return Err(BitError::IndexOutOfBounds {
+                pos: position,
+                size: bit_len,
+            });
+        }
+
+        // Compare against the remaining bits instead of computing `position + bits_requested`:
+        // that sum can overflow (panic in debug builds, wrap around in release builds), and a
+        // wrapped sum would let an out-of-range request pass this check.
+        let bits_left = bit_len - position;
+        if bits_requested > bits_left {
+            return Err(BitError::NotEnoughData {
+                requested: bits_requested,
+                bits_left,
+            });
+        }
+
+        Ok(())
+    }
 }

 impl<E> BitReadBuffer<'static, E>
@ -372,19 +391,7 @@ where
        }

        if position + count + USIZE_BIT_SIZE > self.bit_len() {
-            if position + count > self.bit_len() {
-                return if position > self.bit_len() {
-                    Err(BitError::IndexOutOfBounds {
-                        pos: position,
-                        size: self.bit_len(),
-                    })
-                } else {
-                    Err(BitError::NotEnoughData {
-                        requested: count,
-                        bits_left: self.bit_len() - position,
-                    })
-                };
-            }
+            self.bounds_check(position, count)?;
            Ok(unsafe { self.read_int_unchecked(position, count, true) })
        } else {
            Ok(unsafe { self.read_int_unchecked(position, count, false) })
@ -501,9 +508,8 @@ where
    /// [`ReadError::NotEnoughData`]: enum.ReadError.html#variant.NotEnoughData
    #[inline]
    pub fn read_bytes(&self, position: usize, byte_count: usize) -> Result<Cow<'a, [u8]>> {
-        let mut output = Vec::new();
-        let result = self.read_bytes_into(position, byte_count, &mut output)?;
-        Ok(result.into_cow(output))
+        // `byte_count * 8` can overflow `usize` (easily reachable on 32-bit targets); a wrapped
+        // bit count would pass the bounds check and reach the unchecked read below.
+        // A request that large can never be satisfied, so report it as missing data.
+        let bit_count = byte_count
+            .checked_mul(8)
+            .ok_or_else(|| BitError::NotEnoughData {
+                requested: usize::MAX,
+                bits_left: self.bit_len().saturating_sub(position),
+            })?;
+        self.bounds_check(position, bit_count)?;
+
+        // SAFETY: the requested range was validated by the bounds check above
+        Ok(unsafe { self.read_bytes_unchecked(position, byte_count) })
    }

    /// Read a series of bytes from the buffer, using an existing buffer
@ -549,48 +555,19 @@ where
        byte_count: usize,
        output: &mut Vec<u8>,
    ) -> Result<MaybeBorrowed<'a, [u8]>> {
-        if position + byte_count * 8 > self.bit_len() {
-            if position > self.bit_len() {
-                return Err(BitError::IndexOutOfBounds {
-                    pos: position,
-                    size: self.bit_len(),
-                });
-            } else {
-                return Err(BitError::NotEnoughData {
-                    requested: byte_count * 8,
-                    bits_left: self.bit_len() - position,
-                });
-            }
-        }
+        self.bounds_check(position, byte_count * 8)?;

        Ok(unsafe { self.read_bytes_unchecked_into(position, byte_count, output) })
    }

-    #[doc(hidden)]
-    #[inline]
-    pub unsafe fn read_bytes_unchecked(&self, position: usize, byte_count: usize) -> Cow<'a, [u8]> {
-        let mut output = Vec::new();
-        let result = self.read_bytes_unchecked_into(position, byte_count, &mut output);
-        result.into_cow(output)
-    }
-
-    #[doc(hidden)]
-    #[inline]
-    pub unsafe fn read_bytes_unchecked_into(
+    #[inline(always)]
+    unsafe fn read_bytes_unchecked_owned(
        &self,
        position: usize,
        byte_count: usize,
        output: &mut Vec<u8>,
-    ) -> MaybeBorrowed<'a, [u8]> {
+    ) {
        let shift = position & 7;
-
-        if shift == 0 {
-            let byte_pos = position / 8;
-            return MaybeBorrowed::Borrowed(&self.slice[byte_pos..byte_pos + byte_count]);
-        }
-
-        output.clear();
-        output.reserve(byte_count);
        let mut byte_left = byte_count;
        let mut read_pos = position / 8;

@ -617,6 +594,42 @@ where
                pos += 8;
            }
        }
+    }
+
+    /// Read `byte_count` bytes starting at bit `position`, without checking the range against
+    /// the length of the buffer.
+    ///
+    /// A byte-aligned `position` borrows straight from the underlying slice, any other position
+    /// is read into a newly allocated vector.
+    ///
+    /// # Safety
+    ///
+    /// NOTE(review): the caller is expected to have validated the range beforehand (the checked
+    /// readers do so with `bounds_check`) - confirm the exact contract against
+    /// `read_bytes_unchecked_owned`.
+    #[doc(hidden)]
+    #[inline]
+    pub unsafe fn read_bytes_unchecked(&self, position: usize, byte_count: usize) -> Cow<'a, [u8]> {
+        if position % 8 == 0 {
+            // byte aligned: no shifting needed, hand out a view into the source data
+            let start = position / 8;
+            let end = start + byte_count;
+            Cow::Borrowed(&self.slice[start..end])
+        } else {
+            let mut bytes = Vec::with_capacity(byte_count);
+            self.read_bytes_unchecked_owned(position, byte_count, &mut bytes);
+            Cow::Owned(bytes)
+        }
+    }
+
+    /// Read `byte_count` bytes starting at bit `position` without bounds checking, using
+    /// `output` as storage when the data cannot be borrowed.
+    ///
+    /// A byte-aligned `position` returns `MaybeBorrowed::Borrowed` and leaves `output`
+    /// untouched. Otherwise `output` is cleared, filled with the bytes read and
+    /// `MaybeBorrowed::Owned` is returned.
+    ///
+    /// # Safety
+    ///
+    /// NOTE(review): the caller is expected to have validated the range beforehand (the checked
+    /// readers do so with `bounds_check`) - confirm the exact contract against
+    /// `read_bytes_unchecked_owned`.
+    #[doc(hidden)]
+    #[inline]
+    pub unsafe fn read_bytes_unchecked_into(
+        &self,
+        position: usize,
+        byte_count: usize,
+        output: &mut Vec<u8>,
+    ) -> MaybeBorrowed<'a, [u8]> {
+        if position % 8 == 0 {
+            // byte aligned: no shifting needed, hand out a view into the source data
+            let start = position / 8;
+            let aligned = &self.slice[start..start + byte_count];
+            return MaybeBorrowed::Borrowed(aligned);
+        }
+
+        // drop whatever the caller left in the buffer before the read fills it
+        output.clear();
+        output.reserve(byte_count);
+        self.read_bytes_unchecked_owned(position, byte_count, output);

        MaybeBorrowed::Owned
    }
@ -660,9 +673,14 @@ where
    /// [`ReadError::Utf8Error`]: enum.ReadError.html#variant.Utf8Error
    #[inline]
    pub fn read_string(&self, position: usize, byte_len: Option<usize>) -> Result<Cow<'a, str>> {
-        let mut output = String::new();
-        let result = self.read_string_into(position, byte_len, &mut output)?;
-        Ok(result.into_cow(output))
+        let shift = position & 7;
+        if shift == 0 {
+            return Ok(Cow::Borrowed(self.read_string_aligned(position, byte_len)?));
+        }
+
+        let output = Vec::with_capacity(64);
+        let string = self.read_string_unaligned(position, byte_len, output)?;
+        Ok(Cow::Owned(string))
    }

    /// Read a series of bytes from the buffer as string.
@ -713,103 +731,114 @@ where
        byte_len: Option<usize>,
        output: &mut String,
    ) -> Result<MaybeBorrowed<'a, str>> {
-        let mut taken_output = take(output).into_bytes();
-        match byte_len {
-            Some(byte_len) => {
-                let bytes = self.read_bytes_into(position, byte_len, &mut taken_output)?;
-
-                let result = match bytes {
-                    MaybeBorrowed::Owned => {
-                        *output = String::from_utf8(taken_output)?
-                            .trim_end_matches(char::from(0))
-                            .to_string();
-                        MaybeBorrowed::Owned
-                    }
-                    MaybeBorrowed::Borrowed(bytes) => MaybeBorrowed::Borrowed(
-                        std::str::from_utf8(bytes)
-                            .map_err(|err| BitError::Utf8Error(err, bytes.len()))?
-                            .trim_end_matches(char::from(0)),
-                    ),
-                };
-                Ok(result)
-            }
-            None => {
-                let bytes = self.read_string_bytes(position, &mut taken_output)?;
-                let result = match bytes {
-                    MaybeBorrowed::Owned => {
-                        *output = String::from_utf8(taken_output)?;
-                        MaybeBorrowed::Owned
-                    }
-                    MaybeBorrowed::Borrowed(bytes) => MaybeBorrowed::Borrowed(
-                        std::str::from_utf8(bytes)
-                            .map_err(|err| BitError::Utf8Error(err, bytes.len()))?,
-                    ),
-                };
-                Ok(result)
-            }
-        }
-    }
-
-    #[inline]
-    fn find_null_byte(&self, byte_index: usize) -> usize {
-        memchr::memchr(0, &self.slice[byte_index..])
-            .map(|index| index + byte_index)
-            .unwrap_or(self.slice.len()) // due to padding we always have 0 bytes at the end
-    }
-
-    #[inline]
-    fn read_string_bytes<'output>(
-        &self,
-        position: usize,
-        buffer: &'output mut Vec<u8>,
-    ) -> Result<MaybeBorrowed<'a, [u8]>> {
        let shift = position & 7;
        if shift == 0 {
-            let byte_index = position / 8;
-            Ok(MaybeBorrowed::Borrowed(
-                &self.slice[byte_index..self.find_null_byte(byte_index)],
-            ))
-        } else {
-            buffer.clear();
-            buffer.reserve(64);
-            if E::is_le() {
-                let mut byte_index = position / 8;
-                loop {
-                    // note: if less then a usize worth of data is left in the buffer, read_usize_bytes
-                    // will automatically pad with null bytes, triggering the loop termination
-                    // thus no separate logic for dealing with the end of the bytes is required
-                    //
-                    // This is safe because the final usize is filled with 0's, thus triggering the exit clause
-                    // before reading any out of bounds
-                    let shifted = unsafe { self.read_shifted_usize(byte_index, shift, true) };
+            return Ok(MaybeBorrowed::Borrowed(
+                self.read_string_aligned(position, byte_len)?,
+            ));
+        }

-                    let has_null = contains_zero_byte_non_top(shifted);
-                    let bytes: [u8; USIZE_SIZE] = shifted.to_le_bytes();
-                    let usable_bytes = &bytes[0..USIZE_SIZE - 1];
+        let taken_output = take(output).into_bytes();
+        *output = self.read_string_unaligned(position, byte_len, taken_output)?;

-                    if has_null {
-                        for i in 0..USIZE_SIZE - 1 {
-                            if usable_bytes[i] == 0 {
-                                buffer.extend_from_slice(&usable_bytes[0..i]);
-                                return Ok(MaybeBorrowed::Owned);
-                            }
+        Ok(MaybeBorrowed::Owned)
+    }
+
+    /// Read a string from a position that is not byte aligned, using `output` as storage.
+    ///
+    /// Any previous content of `output` is discarded, only its allocation is reused.
+    ///
+    /// - `Some(byte_len)`: read exactly `byte_len` bytes and strip trailing NUL characters
+    /// - `None`: read up to the terminating NUL byte
+    ///
+    /// # Errors
+    ///
+    /// Fails when the requested range does not fit in the buffer or when the bytes read are not
+    /// valid UTF-8.
+    #[inline(always)]
+    fn read_string_unaligned(
+        &self,
+        position: usize,
+        byte_len: Option<usize>,
+        mut output: Vec<u8>,
+    ) -> Result<String> {
+        Ok(match byte_len {
+            Some(byte_len) => {
+                // `byte_len * 8` can overflow `usize`; a wrapped bit count would pass the bounds
+                // check and reach the unchecked read below
+                let bit_count = byte_len
+                    .checked_mul(8)
+                    .ok_or_else(|| BitError::NotEnoughData {
+                        requested: usize::MAX,
+                        bits_left: self.bit_len().saturating_sub(position),
+                    })?;
+                self.bounds_check(position, bit_count)?;
+
+                // The caller may hand in a buffer that still holds the bytes of an earlier
+                // string (see `read_string_into`); reset it so they don't end up in the result.
+                output.clear();
+                output.reserve(byte_len);
+                // SAFETY: the requested range was validated by the bounds check above
+                unsafe { self.read_bytes_unchecked_owned(position, byte_len, &mut output) };
+
+                // strip the NUL padding in place to keep the (possibly reused) allocation
+                let mut string = String::from_utf8(output)?;
+                let trimmed_len = string.trim_end_matches(char::from(0)).len();
+                string.truncate(trimmed_len);
+                string
+            }
+            None => {
+                self.read_string_bytes(position, &mut output)?;
+                String::from_utf8(output)?
+            }
+        })
+    }
+
+    /// Read a string from a byte aligned position, borrowing it from the underlying slice.
+    ///
+    /// - `Some(byte_len)`: read exactly `byte_len` bytes and strip trailing NUL characters
+    /// - `None`: read up to (not including) the first NUL byte, or up to the end of the
+    ///   underlying slice when it contains no NUL byte after `position`
+    ///
+    /// # Errors
+    ///
+    /// Fails when the requested range does not fit in the buffer or when the bytes read are not
+    /// valid UTF-8.
+    #[inline(always)]
+    fn read_string_aligned(&self, position: usize, byte_len: Option<usize>) -> Result<&'a str> {
+        let byte_pos = position / 8;
+        let string = match byte_len {
+            Some(byte_len) => {
+                // `byte_len * 8` can overflow `usize`; a wrapped bit count would pass the bounds
+                // check and make the unchecked slicing below read out of bounds
+                let bit_count = byte_len
+                    .checked_mul(8)
+                    .ok_or_else(|| BitError::NotEnoughData {
+                        requested: usize::MAX,
+                        bits_left: self.bit_len().saturating_sub(position),
+                    })?;
+                self.bounds_check(position, bit_count)?;
+
+                // SAFETY: relies on the bounds check above; assumes `self.slice` holds at least
+                // `bit_len()` bits - TODO confirm
+                let bytes = unsafe { self.slice.get_unchecked(byte_pos..byte_pos + byte_len) };
+                str::from_utf8(bytes)
+                    .map_err(|err| BitError::Utf8Error(err, byte_len))?
+                    .trim_end_matches(char::from(0))
+            }
+            None => {
+                if byte_pos > self.byte_len() {
+                    return Err(BitError::IndexOutOfBounds {
+                        pos: position,
+                        size: self.bit_len(),
+                    });
+                }
+
+                // SAFETY: relies on the check above; assumes `self.slice` holds at least
+                // `byte_len()` bytes - TODO confirm
+                let slice = unsafe { self.slice.get_unchecked(byte_pos..) };
+                // Without a terminating NUL the string runs until the end of the data.
+                // Falling back to 0 here would silently turn the string into an empty one.
+                let byte_len = memchr::memchr(0, slice).unwrap_or(slice.len());
+
+                // SAFETY: `byte_len <= slice.len()`, so the end of the range is within `self.slice`
+                let bytes = unsafe { self.slice.get_unchecked(byte_pos..byte_pos + byte_len) };
+                str::from_utf8(bytes).map_err(|err| BitError::Utf8Error(err, byte_len))?
+            }
+        };
+
+        Ok(string)
+    }
+
+    #[inline]
+    fn read_string_bytes(&self, position: usize, buffer: &mut Vec<u8>) -> Result<()> {
+        let shift = position & 7;
+        buffer.clear();
+        buffer.reserve(64);
+        if E::is_le() {
+            let mut byte_index = position / 8;
+            loop {
+                // note: if less than a usize worth of data is left in the buffer, read_usize_bytes
+                // will automatically pad with null bytes, triggering the loop termination,
+                // thus no separate logic for dealing with the end of the bytes is required
+                //
+                // This is safe because the final usize is filled with 0's, thus triggering the exit clause
+                // before reading any out of bounds
+                let shifted = unsafe { self.read_shifted_usize(byte_index, shift, true) };
+
+                let has_null = contains_zero_byte_non_top(shifted);
+                let bytes: [u8; USIZE_SIZE] = shifted.to_le_bytes();
+                let usable_bytes = &bytes[0..USIZE_SIZE - 1];
+
+                if has_null {
+                    for i in 0..USIZE_SIZE - 1 {
+                        if usable_bytes[i] == 0 {
+                            buffer.extend_from_slice(&usable_bytes[0..i]);
+                            return Ok(());
                        }
                    }
-
-                    buffer.extend_from_slice(&usable_bytes[0..USIZE_SIZE - 1]);
-
-                    byte_index += USIZE_SIZE - 1;
                }
-            } else {
-                let mut pos = position;
-                loop {
-                    let byte = self.read_int::<u8>(pos, 8)?;
-                    pos += 8;
-                    if byte == 0 {
-                        return Ok(MaybeBorrowed::Owned);
-                    } else {
-                        buffer.push(byte);
-                    }
+
+                buffer.extend_from_slice(&usable_bytes[0..USIZE_SIZE - 1]);
+
+                byte_index += USIZE_SIZE - 1;
+            }
+        } else {
+            let mut pos = position;
+            loop {
+                let byte = self.read_int::<u8>(pos, 8)?;
+                pos += 8;
+                if byte == 0 {
+                    return Ok(());
+                } else {
+                    buffer.push(byte);
                }
            }
        }
@ -846,19 +875,7 @@ where
    {
        let type_bit_size = size_of::<T>() * 8;
        if position + type_bit_size + USIZE_BIT_SIZE > self.bit_len() {
-            if position + type_bit_size > self.bit_len() {
-                if position > self.bit_len() {
-                    return Err(BitError::IndexOutOfBounds {
-                        pos: position,
-                        size: self.bit_len(),
-                    });
-                } else {
-                    return Err(BitError::NotEnoughData {
-                        requested: size_of::<T>() * 8,
-                        bits_left: self.bit_len() - position,
-                    });
-                }
-            }
+            self.bounds_check(position, type_bit_size)?;
            Ok(unsafe { self.read_float_unchecked(position, true) })
        } else {
            Ok(unsafe { self.read_float_unchecked(position, false) })
@ -884,12 +901,7 @@ where
    }

    pub(crate) fn get_sub_buffer(&self, bit_len: usize) -> Result<Self> {
-        if bit_len > self.bit_len() {
-            return Err(BitError::NotEnoughData {
-                requested: bit_len,
-                bits_left: self.bit_len(),
-            });
-        }
+        self.bounds_check(0, bit_len)?;

        Ok(BitReadBuffer {
            bytes: self.bytes.clone(),
@ -901,12 +913,7 @@ where

    /// Truncate the buffer to a given bit length
    pub fn truncate(&mut self, bit_len: usize) -> Result<()> {
-        if bit_len > self.bit_len() {
-            return Err(BitError::NotEnoughData {
-                requested: bit_len,
-                bits_left: self.bit_len(),
-            });
-        }
+        self.bounds_check(bit_len, 0)?;

        self.bit_len = bit_len;
        Ok(())