// Experimental SHA-256 hashing primitives (x86 SHA-NI) with OxCaml bindings.
1#include <immintrin.h> 2#include <stdint.h> 3#include <string.h> 4#include <caml/mlvalues.h> 5#include <caml/memory.h> 6#include <caml/alloc.h> 7#include <caml/bigarray.h> 8 9// Aligned storage for round constants 10alignas(64) static const uint32_t K256[64] = { 11 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 12 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 13 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 14 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 15 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 16 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 18 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 19 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 20 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 21 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 22 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 23 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 24 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 25 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 26 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 27}; 28 29// Initial SHA256 state values 30alignas(16) static const uint32_t H256_INIT[8] = { 31 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 32 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 33}; 34 35// Byte swap for endianness 36static const __m128i BSWAP_MASK = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; 37 38// Process a single 512-bit block using SHA-NI instructions 39static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64]) { 40 __m128i msg0, msg1, msg2, msg3; 41 __m128i tmp; 42 __m128i state0, state1; 43 __m128i msg; 44 __m128i abef_save, cdgh_save; 45 46 // Load initial state 47 tmp = _mm_loadu_si128((const __m128i*)&state[0]); 48 state1 = _mm_loadu_si128((const __m128i*)&state[4]); 49 50 // Swap byte order for initial state 51 tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB 52 state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH 53 state0 = _mm_alignr_epi8(tmp, state1, 8); 
// ABEF 54 state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH 55 56 // Save initial state 57 abef_save = state0; 58 cdgh_save = state1; 59 60 // Load message blocks with byte swap 61 msg0 = _mm_loadu_si128((const __m128i*)(block + 0)); 62 msg1 = _mm_loadu_si128((const __m128i*)(block + 16)); 63 msg2 = _mm_loadu_si128((const __m128i*)(block + 32)); 64 msg3 = _mm_loadu_si128((const __m128i*)(block + 48)); 65 66 msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK); 67 msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK); 68 msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK); 69 msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK); 70 71 // Rounds 0-3 72 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0])); 73 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 74 msg = _mm_shuffle_epi32(msg, 0x0E); 75 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 76 77 // Rounds 4-7 78 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4])); 79 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 80 msg = _mm_shuffle_epi32(msg, 0x0E); 81 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 82 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 83 84 // Rounds 8-11 85 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8])); 86 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 87 msg = _mm_shuffle_epi32(msg, 0x0E); 88 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 89 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 90 91 // Rounds 12-15 92 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12])); 93 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 94 tmp = _mm_alignr_epi8(msg3, msg2, 4); 95 msg0 = _mm_add_epi32(msg0, tmp); 96 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 97 msg = _mm_shuffle_epi32(msg, 0x0E); 98 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 99 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 100 101 // Rounds 16-19 102 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16])); 103 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 104 tmp = 
_mm_alignr_epi8(msg0, msg3, 4); 105 msg1 = _mm_add_epi32(msg1, tmp); 106 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 107 msg = _mm_shuffle_epi32(msg, 0x0E); 108 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 109 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 110 111 // Rounds 20-23 112 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20])); 113 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 114 tmp = _mm_alignr_epi8(msg1, msg0, 4); 115 msg2 = _mm_add_epi32(msg2, tmp); 116 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 117 msg = _mm_shuffle_epi32(msg, 0x0E); 118 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 119 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 120 121 // Rounds 24-27 122 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24])); 123 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 124 tmp = _mm_alignr_epi8(msg2, msg1, 4); 125 msg3 = _mm_add_epi32(msg3, tmp); 126 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 127 msg = _mm_shuffle_epi32(msg, 0x0E); 128 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 129 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 130 131 // Rounds 28-31 132 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28])); 133 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 134 tmp = _mm_alignr_epi8(msg3, msg2, 4); 135 msg0 = _mm_add_epi32(msg0, tmp); 136 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 137 msg = _mm_shuffle_epi32(msg, 0x0E); 138 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 139 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 140 141 // Rounds 32-35 142 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32])); 143 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 144 tmp = _mm_alignr_epi8(msg0, msg3, 4); 145 msg1 = _mm_add_epi32(msg1, tmp); 146 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 147 msg = _mm_shuffle_epi32(msg, 0x0E); 148 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 149 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 150 151 // Rounds 36-39 152 msg = _mm_add_epi32(msg1, 
_mm_load_si128((const __m128i*)&K256[36])); 153 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 154 tmp = _mm_alignr_epi8(msg1, msg0, 4); 155 msg2 = _mm_add_epi32(msg2, tmp); 156 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 157 msg = _mm_shuffle_epi32(msg, 0x0E); 158 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 159 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 160 161 // Rounds 40-43 162 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40])); 163 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 164 tmp = _mm_alignr_epi8(msg2, msg1, 4); 165 msg3 = _mm_add_epi32(msg3, tmp); 166 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 167 msg = _mm_shuffle_epi32(msg, 0x0E); 168 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 169 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 170 171 // Rounds 44-47 172 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44])); 173 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 174 tmp = _mm_alignr_epi8(msg3, msg2, 4); 175 msg0 = _mm_add_epi32(msg0, tmp); 176 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 177 msg = _mm_shuffle_epi32(msg, 0x0E); 178 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 179 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 180 181 // Rounds 48-51 182 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48])); 183 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 184 tmp = _mm_alignr_epi8(msg0, msg3, 4); 185 msg1 = _mm_add_epi32(msg1, tmp); 186 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 187 msg = _mm_shuffle_epi32(msg, 0x0E); 188 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 189 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 190 191 // Rounds 52-55 192 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52])); 193 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 194 tmp = _mm_alignr_epi8(msg1, msg0, 4); 195 msg2 = _mm_add_epi32(msg2, tmp); 196 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 197 msg = _mm_shuffle_epi32(msg, 0x0E); 198 state0 = _mm_sha256rnds2_epu32(state0, state1, 
msg); 199 200 // Rounds 56-59 201 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56])); 202 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 203 tmp = _mm_alignr_epi8(msg2, msg1, 4); 204 msg3 = _mm_add_epi32(msg3, tmp); 205 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 206 msg = _mm_shuffle_epi32(msg, 0x0E); 207 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 208 209 // Rounds 60-63 210 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60])); 211 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 212 msg = _mm_shuffle_epi32(msg, 0x0E); 213 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 214 215 // Add initial state 216 state0 = _mm_add_epi32(state0, abef_save); 217 state1 = _mm_add_epi32(state1, cdgh_save); 218 219 // Swap byte order back and store 220 tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA 221 state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG 222 state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA 223 state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE 224 225 _mm_storeu_si128((__m128i*)&state[0], state0); 226 _mm_storeu_si128((__m128i*)&state[4], state1); 227} 228 229// OCaml interface functions 230 231// Initialize SHA256 state 232value oxcaml_sha256_init(value unit) { 233 CAMLparam1(unit); 234 CAMLlocal1(state); 235 236 // Allocate bigarray for state (8 x int32) 237 long dims[1] = {8}; 238 state = caml_ba_alloc_dims(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, dims); 239 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 240 241 // Copy initial values 242 memcpy(s, H256_INIT, 32); 243 244 CAMLreturn(state); 245} 246 247// Process a single 512-bit block 248value oxcaml_sha256_process_block(value state, value block) { 249 CAMLparam2(state, block); 250 251 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 252 uint8_t* b = (uint8_t*)Caml_ba_data_val(block); 253 254 sha256_process_block_shani(s, b); 255 256 CAMLreturn(Val_unit); 257} 258 259// Finalize hash with padding and return digest 260value 
oxcaml_sha256_finalize(value state, value data, value len_v) { 261 CAMLparam3(state, data, len_v); 262 CAMLlocal1(result); 263 264 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 265 uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 266 uint64_t len = Int64_val(len_v); 267 268 // Process full blocks 269 uint64_t full_blocks = len / 64; 270 for (uint64_t i = 0; i < full_blocks; i++) { 271 sha256_process_block_shani(s, input + i * 64); 272 } 273 274 // Handle final block with padding 275 uint8_t final_block[128] = {0}; // Max 2 blocks for padding 276 uint64_t remaining = len % 64; 277 278 // Copy remaining bytes 279 if (remaining > 0) { 280 memcpy(final_block, input + full_blocks * 64, remaining); 281 } 282 283 // Add padding 284 final_block[remaining] = 0x80; 285 286 // Add length in bits at the end 287 uint64_t bit_len = len * 8; 288 if (remaining >= 56) { 289 // Need two blocks 290 sha256_process_block_shani(s, final_block); 291 memset(final_block, 0, 64); 292 } 293 294 // Add bit length (big-endian) 295 final_block[56] = (bit_len >> 56) & 0xFF; 296 final_block[57] = (bit_len >> 48) & 0xFF; 297 final_block[58] = (bit_len >> 40) & 0xFF; 298 final_block[59] = (bit_len >> 32) & 0xFF; 299 final_block[60] = (bit_len >> 24) & 0xFF; 300 final_block[61] = (bit_len >> 16) & 0xFF; 301 final_block[62] = (bit_len >> 8) & 0xFF; 302 final_block[63] = bit_len & 0xFF; 303 304 sha256_process_block_shani(s, final_block); 305 306 // Create result bigarray (32 bytes) 307 long dims[1] = {32}; 308 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 309 uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 310 311 // Convert to big-endian bytes 312 for (int i = 0; i < 8; i++) { 313 res[i*4 + 0] = (s[i] >> 24) & 0xFF; 314 res[i*4 + 1] = (s[i] >> 16) & 0xFF; 315 res[i*4 + 2] = (s[i] >> 8) & 0xFF; 316 res[i*4 + 3] = s[i] & 0xFF; 317 } 318 319 CAMLreturn(result); 320} 321 322// Fast one-shot SHA256 323value oxcaml_sha256_oneshot(value data, value len_v) { 324 
CAMLparam2(data, len_v); 325 CAMLlocal1(result); 326 327 uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 328 uint64_t len = Int64_val(len_v); 329 330 // Local state 331 alignas(16) uint32_t state[8]; 332 memcpy(state, H256_INIT, 32); 333 334 // Process full blocks 335 uint64_t full_blocks = len / 64; 336 for (uint64_t i = 0; i < full_blocks; i++) { 337 sha256_process_block_shani(state, input + i * 64); 338 } 339 340 // Handle final block with padding 341 alignas(64) uint8_t final_block[128] = {0}; 342 uint64_t remaining = len % 64; 343 344 if (remaining > 0) { 345 memcpy(final_block, input + full_blocks * 64, remaining); 346 } 347 348 final_block[remaining] = 0x80; 349 350 uint64_t bit_len = len * 8; 351 if (remaining >= 56) { 352 sha256_process_block_shani(state, final_block); 353 memset(final_block, 0, 64); 354 } 355 356 // Add bit length (big-endian) 357 final_block[56] = (bit_len >> 56) & 0xFF; 358 final_block[57] = (bit_len >> 48) & 0xFF; 359 final_block[58] = (bit_len >> 40) & 0xFF; 360 final_block[59] = (bit_len >> 32) & 0xFF; 361 final_block[60] = (bit_len >> 24) & 0xFF; 362 final_block[61] = (bit_len >> 16) & 0xFF; 363 final_block[62] = (bit_len >> 8) & 0xFF; 364 final_block[63] = bit_len & 0xFF; 365 366 sha256_process_block_shani(state, final_block); 367 368 // Create result bigarray 369 long dims[1] = {32}; 370 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 371 uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 372 373 // Convert to big-endian bytes 374 for (int i = 0; i < 8; i++) { 375 res[i*4 + 0] = (state[i] >> 24) & 0xFF; 376 res[i*4 + 1] = (state[i] >> 16) & 0xFF; 377 res[i*4 + 2] = (state[i] >> 8) & 0xFF; 378 res[i*4 + 3] = state[i] & 0xFF; 379 } 380 381 CAMLreturn(result); 382}