// rgb565.rs
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::{
    uint16x8_t, uint8x8x3_t,
    vld3_u8, vmovl_u8, vorrq_u16, vreinterpretq_u8_u16,
    vshlq_n_u16, vshrq_n_u16, vst1q_u16, vst1q_u8,
};

/// Packs interleaved 8-bit RGB pixels into 16-bit RGB565 using NEON.
///
/// `rgb_data` is read as consecutive `[r, g, b]` byte triples. Trailing bytes
/// that do not form a whole pixel (`rgb_data.len() % 3`) are ignored.
///
/// Each pixel becomes the 16-bit value
/// `((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)`.
///
/// Returns a byte vector of length `2 * (rgb_data.len() / 3)`. Each pixel
/// occupies two bytes in native byte order (low byte first on little-endian
/// AArch64).
#[cfg(target_arch = "aarch64")]
pub fn pack_rgb565_neon(rgb_data: &[u8]) -> Vec<u8> {
    /// Pixels handled per NEON iteration (one 128-bit register of u16 lanes).
    const LANES: usize = 8;
    const SRC_BYTES_PER_PIXEL: usize = 3;
    const DST_BYTES_PER_PIXEL: usize = 2;

    let pixels = rgb_data.len() / SRC_BYTES_PER_PIXEL;
    let out_len = pixels * DST_BYTES_PER_PIXEL;

    // Allocate the output as bytes from the start. Allocating a `Vec<u16>` and
    // rebuilding it as a `Vec<u8>` with `Vec::from_raw_parts` is undefined
    // behaviour: the allocation would later be freed with alignment 1 although
    // it was created with alignment 2.
    let mut packed_bytes: Vec<u8> = Vec::with_capacity(out_len);
    let mut src = rgb_data.as_ptr();
    let mut dst = packed_bytes.as_mut_ptr();
    let mut remaining = pixels;

    // SAFETY:
    // - `src` starts at the beginning of `rgb_data` and advances by exactly
    //   3 bytes per processed pixel; a total of `pixels * 3 <= rgb_data.len()`
    //   bytes is read, and each 24-byte NEON load only happens while at least
    //   8 whole pixels remain.
    // - `dst` starts at the beginning of an allocation with capacity of at
    //   least `out_len` bytes and advances by exactly 2 bytes per pixel, so
    //   every store stays inside the allocation. Neither `vst1q_u8` nor
    //   `write_unaligned` requires more than byte alignment.
    // - By the time `set_len(out_len)` runs, all `out_len` bytes have been
    //   written.
    // - The intrinsics need the `neon` target feature, which is assumed to be
    //   enabled (it is part of the default feature set of standard AArch64
    //   targets).
    unsafe {
        // Process 8 pixels at a time with NEON.
        while remaining >= LANES {
            // De-interleaving load: rgb.0 = 8 reds, rgb.1 = 8 greens, rgb.2 = 8 blues.
            let rgb: uint8x8x3_t = vld3_u8(src);

            // Widen each channel to 16 bits, drop the low bits, and move the
            // channel into its RGB565 field.
            let r_bits: uint16x8_t = vshlq_n_u16(vshrq_n_u16(vmovl_u8(rgb.0), 3), 11);
            let g_bits: uint16x8_t = vshlq_n_u16(vshrq_n_u16(vmovl_u8(rgb.1), 2), 5);
            let b_bits: uint16x8_t = vshrq_n_u16(vmovl_u8(rgb.2), 3);
            let packed: uint16x8_t = vorrq_u16(vorrq_u16(r_bits, g_bits), b_bits);

            // Store as 16 raw bytes so the byte buffer needs no u16 alignment.
            vst1q_u8(dst, vreinterpretq_u8_u16(packed));

            src = src.add(LANES * SRC_BYTES_PER_PIXEL);
            dst = dst.add(LANES * DST_BYTES_PER_PIXEL);
            remaining -= LANES;
        }

        // Handle the remaining (< 8) pixels one at a time.
        while remaining > 0 {
            let r = u16::from(*src >> 3);
            let g = u16::from(*src.add(1) >> 2);
            let b = u16::from(*src.add(2) >> 3);
            let pixel = (r << 11) | (g << 5) | b;
            dst.cast::<u16>().write_unaligned(pixel);

            src = src.add(SRC_BYTES_PER_PIXEL);
            dst = dst.add(DST_BYTES_PER_PIXEL);
            remaining -= 1;
        }

        // Set the length once, after every byte has been initialised.
        packed_bytes.set_len(out_len);
    }

    packed_bytes
}