Arm NEON SIMD method for fast packing of RGB8 textures to RGB565 textures
rgb565.rs
63 lines 2.1 kB view raw
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
use std::arch::aarch64::{
    uint16x8_t, uint8x8x3_t, vld3_u8, vmovl_u8, vorrq_u16, vreinterpretq_u8_u16, vshlq_n_u16,
    vshrq_n_u16, vst1q_u8,
};

/// Bytes per input pixel (8-bit R, G, B).
const RGB8_BYTES: usize = 3;
/// Bytes per output pixel (one 16-bit RGB565 word).
const RGB565_BYTES: usize = 2;

/// Packs tightly interleaved RGB8 pixel data into RGB565.
///
/// Each input pixel is three bytes (`R`, `G`, `B`). Each output pixel is one
/// 16-bit word laid out as `RRRRRGGG GGGBBBBB` (red in the top five bits),
/// written to the returned buffer in little-endian byte order. The low bits of
/// every channel are truncated, not rounded.
///
/// Trailing input bytes that do not form a complete pixel are ignored, so the
/// result always holds `(rgb_data.len() / 3) * 2` bytes.
///
/// On little-endian AArch64, groups of eight pixels are converted with NEON;
/// any leftover pixels, and every pixel on other targets, go through an
/// equivalent scalar path that produces identical bytes.
pub fn pack_rgb565_neon(rgb_data: &[u8]) -> Vec<u8> {
    let pixels = rgb_data.len() / RGB8_BYTES;

    // Allocate as bytes from the start: the returned `Vec<u8>` must be freed
    // with the same layout it was allocated with, which rules out building a
    // `Vec<u16>` and reinterpreting its allocation afterwards.
    let mut packed: Vec<u8> = Vec::with_capacity(pixels * RGB565_BYTES);

    // Drop any partial trailing pixel.
    let rgb = &rgb_data[..pixels * RGB8_BYTES];

    // Vectorised prefix; returns the pixels that did not fill a whole block.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    let rgb = pack_blocks_neon(rgb, &mut packed);

    for px in rgb.chunks_exact(RGB8_BYTES) {
        packed.extend_from_slice(&pack_pixel(px[0], px[1], px[2]).to_le_bytes());
    }

    packed
}

/// Packs one RGB8 pixel into a single RGB565 word by truncating each channel.
fn pack_pixel(r: u8, g: u8, b: u8) -> u16 {
    (u16::from(r >> 3) << 11) | (u16::from(g >> 2) << 5) | u16::from(b >> 3)
}

/// Converts as many whole 8-pixel blocks of `rgb` as possible with NEON,
/// appending the packed bytes to `out`, and returns the unconverted remainder
/// of `rgb` (fewer than eight pixels).
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
fn pack_blocks_neon<'a>(rgb: &'a [u8], out: &mut Vec<u8>) -> &'a [u8] {
    const LANES: usize = 8;
    const SRC_BLOCK: usize = LANES * RGB8_BYTES;
    const DST_BLOCK: usize = LANES * RGB565_BYTES;

    let mut blocks = rgb.chunks_exact(SRC_BLOCK);
    let written = blocks.len() * DST_BLOCK;
    out.reserve(written);

    // SAFETY:
    // - every `block` is exactly `SRC_BLOCK` (24) bytes, which is what
    //   `vld3_u8` reads;
    // - `reserve` guarantees at least `written` bytes of spare capacity past
    //   `out.len()`, and `dst` advances by `DST_BLOCK` (16) bytes per block, so
    //   each 16-byte `vst1q_u8` store stays inside that spare capacity;
    // - all `written` bytes have been initialised by the stores before
    //   `set_len` exposes them.
    unsafe {
        let mut dst = out.as_mut_ptr().add(out.len());

        for block in blocks.by_ref() {
            // De-interleave 8 pixels into separate R, G and B vectors.
            let px: uint8x8x3_t = vld3_u8(block.as_ptr());

            // Widen to 16 bits so the channels can be shifted into place.
            let r: uint16x8_t = vmovl_u8(px.0);
            let g: uint16x8_t = vmovl_u8(px.1);
            let b: uint16x8_t = vmovl_u8(px.2);

            // Truncate to 5/6/5 bits, then position red and green.
            let r_bits = vshlq_n_u16(vshrq_n_u16(r, 3), 11);
            let g_bits = vshlq_n_u16(vshrq_n_u16(g, 2), 5);
            let b_bits = vshrq_n_u16(b, 3);

            let word = vorrq_u16(vorrq_u16(r_bits, g_bits), b_bits);

            // Byte-wise store: no alignment requirement on the destination.
            vst1q_u8(dst, vreinterpretq_u8_u16(word));
            dst = dst.add(DST_BLOCK);
        }

        out.set_len(out.len() + written);
    }

    blocks.remainder()
}