// Experimental hashing with oxcaml: SHA-NI accelerated SHA-256 stubs for OCaml.
#include <immintrin.h>
#include <stdalign.h>
#include <stdint.h>
#include <string.h>

#include <caml/mlvalues.h>
#include <caml/memory.h>
#include <caml/alloc.h>
#include <caml/bigarray.h>
8
// SHA-256 round constants K[0..63]: the first 32 bits of the fractional
// parts of the cube roots of the first 64 primes (FIPS 180-4, sec. 4.2.2).
// 64-byte aligned so every 16-byte group of four constants can be fetched
// with an aligned _mm_load_si128 in the compression function.
alignas(64) static const uint32_t K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
28
// Initial SHA-256 hash values H(0): the first 32 bits of the fractional
// parts of the square roots of the first 8 primes (FIPS 180-4, sec. 5.3.3),
// stored in standard a..h order.
alignas(16) static const uint32_t H256_INIT[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
34
// _mm_shuffle_epi8 control mask that byte-swaps each 32-bit word of an XMM
// register (big-endian message words <-> little-endian register lanes).
// In GCC/Clang vector-initializer order ({low64, high64}) this is the
// canonical SHA-NI mask _mm_set_epi64x(0x0c0d0e0f08090a0bULL,
// 0x0405060700010203ULL): byte i selects source byte (i & ~3) + (3 - (i & 3)).
// BUG FIX: the previous value {0x0001020304050607, 0x08090a0b0c0d0e0f}
// reversed each 64-bit lane instead of each 32-bit word, corrupting every
// message word loaded and therefore every digest produced.
static const __m128i BSWAP_MASK = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
37
// Compress one 512-bit (64-byte) message block into the running SHA-256
// state using the x86 SHA-NI instructions.
//
//   state - 8 x uint32_t working hash (a..h in FIPS 180-4 order), updated
//           in place; loaded/stored with unaligned accesses, so the caller
//           needs no particular alignment.
//   block - 64 message bytes (unaligned OK).
//
// SHA-NI keeps the state split across two XMM registers in ABEF / CDGH
// word order; the shuffles below convert between that layout and the
// linear a..h layout in memory. Each _mm_sha256rnds2_epu32 performs two
// rounds, so every four-round group issues it twice — the 0x0E dword
// shuffle moves the upper two W+K values into the low lanes for the
// second issue. The msg0..msg3 registers hold a rolling 16-word window of
// the message schedule, extended with sha256msg1/sha256msg2.
//
// NOTE(review): correctness depends on BSWAP_MASK reversing the bytes of
// each 32-bit word (not each 64-bit lane) — verify its initializer.
// Requires a CPU with SHA and SSE4.1 support; TODO confirm build flags or
// runtime dispatch guarantee this before calling.
static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64]) {
    __m128i msg0, msg1, msg2, msg3;   // rolling message-schedule window
    __m128i tmp;
    __m128i state0, state1;           // ABEF / CDGH halves of the state
    __m128i msg;                      // current W[t] + K[t] quad
    __m128i abef_save, cdgh_save;     // input state, re-added at the end

    // Load initial state
    tmp = _mm_loadu_si128((const __m128i*)&state[0]);
    state1 = _mm_loadu_si128((const __m128i*)&state[4]);

    // Rearrange state words into the ABEF/CDGH layout SHA-NI expects
    tmp = _mm_shuffle_epi32(tmp, 0xB1);          // CDAB
    state1 = _mm_shuffle_epi32(state1, 0x1B);    // EFGH
    state0 = _mm_alignr_epi8(tmp, state1, 8);    // ABEF
    state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH

    // Save initial state (for the final Davies-Meyer addition)
    abef_save = state0;
    cdgh_save = state1;

    // Load message blocks with byte swap (message words are big-endian)
    msg0 = _mm_loadu_si128((const __m128i*)(block + 0));
    msg1 = _mm_loadu_si128((const __m128i*)(block + 16));
    msg2 = _mm_loadu_si128((const __m128i*)(block + 32));
    msg3 = _mm_loadu_si128((const __m128i*)(block + 48));

    msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK);
    msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK);
    msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK);
    msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK);

    // Rounds 0-3
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 4-7 (message-schedule extension begins)
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 8-11
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 12-15 (alignr feeds W[t-7] into the sigma computation)
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 16-19
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 20-23
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 24-27
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 28-31
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 32-35
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 36-39
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[36]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 40-43
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 44-47
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 48-51
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 52-55 (schedule is complete; no more msg1 extensions needed)
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 56-59
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 60-63
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Add initial state (Davies-Meyer feed-forward)
    state0 = _mm_add_epi32(state0, abef_save);
    state1 = _mm_add_epi32(state1, cdgh_save);

    // Shuffle words back to linear a..h order and store
    tmp = _mm_shuffle_epi32(state0, 0x1B);       // FEBA
    state1 = _mm_shuffle_epi32(state1, 0xB1);    // DCHG
    state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
    state1 = _mm_alignr_epi8(state1, tmp, 8);    // HGFE

    _mm_storeu_si128((__m128i*)&state[0], state0);
    _mm_storeu_si128((__m128i*)&state[4], state1);
}
228
229// OCaml interface functions
230
231// Initialize SHA256 state
232value oxcaml_sha256_init(value unit) {
233 CAMLparam1(unit);
234 CAMLlocal1(state);
235
236 // Allocate bigarray for state (8 x int32)
237 long dims[1] = {8};
238 state = caml_ba_alloc_dims(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, dims);
239 uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
240
241 // Copy initial values
242 memcpy(s, H256_INIT, 32);
243
244 CAMLreturn(state);
245}
246
247// Process a single 512-bit block
248value oxcaml_sha256_process_block(value state, value block) {
249 CAMLparam2(state, block);
250
251 uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
252 uint8_t* b = (uint8_t*)Caml_ba_data_val(block);
253
254 sha256_process_block_shani(s, b);
255
256 CAMLreturn(Val_unit);
257}
258
259// Finalize hash with padding and return digest
260value oxcaml_sha256_finalize(value state, value data, value len_v) {
261 CAMLparam3(state, data, len_v);
262 CAMLlocal1(result);
263
264 uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
265 uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
266 uint64_t len = Int64_val(len_v);
267
268 // Process full blocks
269 uint64_t full_blocks = len / 64;
270 for (uint64_t i = 0; i < full_blocks; i++) {
271 sha256_process_block_shani(s, input + i * 64);
272 }
273
274 // Handle final block with padding
275 uint8_t final_block[128] = {0}; // Max 2 blocks for padding
276 uint64_t remaining = len % 64;
277
278 // Copy remaining bytes
279 if (remaining > 0) {
280 memcpy(final_block, input + full_blocks * 64, remaining);
281 }
282
283 // Add padding
284 final_block[remaining] = 0x80;
285
286 // Add length in bits at the end
287 uint64_t bit_len = len * 8;
288 if (remaining >= 56) {
289 // Need two blocks
290 sha256_process_block_shani(s, final_block);
291 memset(final_block, 0, 64);
292 }
293
294 // Add bit length (big-endian)
295 final_block[56] = (bit_len >> 56) & 0xFF;
296 final_block[57] = (bit_len >> 48) & 0xFF;
297 final_block[58] = (bit_len >> 40) & 0xFF;
298 final_block[59] = (bit_len >> 32) & 0xFF;
299 final_block[60] = (bit_len >> 24) & 0xFF;
300 final_block[61] = (bit_len >> 16) & 0xFF;
301 final_block[62] = (bit_len >> 8) & 0xFF;
302 final_block[63] = bit_len & 0xFF;
303
304 sha256_process_block_shani(s, final_block);
305
306 // Create result bigarray (32 bytes)
307 long dims[1] = {32};
308 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims);
309 uint8_t* res = (uint8_t*)Caml_ba_data_val(result);
310
311 // Convert to big-endian bytes
312 for (int i = 0; i < 8; i++) {
313 res[i*4 + 0] = (s[i] >> 24) & 0xFF;
314 res[i*4 + 1] = (s[i] >> 16) & 0xFF;
315 res[i*4 + 2] = (s[i] >> 8) & 0xFF;
316 res[i*4 + 3] = s[i] & 0xFF;
317 }
318
319 CAMLreturn(result);
320}
321
322// Fast one-shot SHA256
323value oxcaml_sha256_oneshot(value data, value len_v) {
324 CAMLparam2(data, len_v);
325 CAMLlocal1(result);
326
327 uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
328 uint64_t len = Int64_val(len_v);
329
330 // Local state
331 alignas(16) uint32_t state[8];
332 memcpy(state, H256_INIT, 32);
333
334 // Process full blocks
335 uint64_t full_blocks = len / 64;
336 for (uint64_t i = 0; i < full_blocks; i++) {
337 sha256_process_block_shani(state, input + i * 64);
338 }
339
340 // Handle final block with padding
341 alignas(64) uint8_t final_block[128] = {0};
342 uint64_t remaining = len % 64;
343
344 if (remaining > 0) {
345 memcpy(final_block, input + full_blocks * 64, remaining);
346 }
347
348 final_block[remaining] = 0x80;
349
350 uint64_t bit_len = len * 8;
351 if (remaining >= 56) {
352 sha256_process_block_shani(state, final_block);
353 memset(final_block, 0, 64);
354 }
355
356 // Add bit length (big-endian)
357 final_block[56] = (bit_len >> 56) & 0xFF;
358 final_block[57] = (bit_len >> 48) & 0xFF;
359 final_block[58] = (bit_len >> 40) & 0xFF;
360 final_block[59] = (bit_len >> 32) & 0xFF;
361 final_block[60] = (bit_len >> 24) & 0xFF;
362 final_block[61] = (bit_len >> 16) & 0xFF;
363 final_block[62] = (bit_len >> 8) & 0xFF;
364 final_block[63] = bit_len & 0xFF;
365
366 sha256_process_block_shani(state, final_block);
367
368 // Create result bigarray
369 long dims[1] = {32};
370 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims);
371 uint8_t* res = (uint8_t*)Caml_ba_data_val(result);
372
373 // Convert to big-endian bytes
374 for (int i = 0; i < 8; i++) {
375 res[i*4 + 0] = (state[i] >> 24) & 0xFF;
376 res[i*4 + 1] = (state[i] >> 16) & 0xFF;
377 res[i*4 + 2] = (state[i] >> 8) & 0xFF;
378 res[i*4 + 3] = state[i] & 0xFF;
379 }
380
381 CAMLreturn(result);
382}