// x86 SHA-NI SHA-256 primitives with OCaml stubs.
// Requires SSSE3/SSE4.1 and the SHA extensions; build with e.g. -mssse3 -msse4.1 -msha.
#include <stdint.h>
#include <string.h>
#include <stdalign.h>
#include <immintrin.h>

#include <caml/mlvalues.h>
#include <caml/memory.h>
#include <caml/alloc.h>
#include <caml/bigarray.h>

// Aligned storage for round constants
alignas(64) static const uint32_t K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

// Initial SHA256 state values
alignas(16) static const uint32_t H256_INIT[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

// Byte-swap mask: reverses the bytes within each 32-bit word so that the
// big-endian message words can be converted with _mm_shuffle_epi8.
// Note: brace-initializing __m128i relies on the GCC/Clang vector extension.
static const __m128i BSWAP_MASK = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};

// Process a single 512-bit block using SHA-NI instructions
static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64]) {
    __m128i msg0, msg1, msg2, msg3;
    __m128i msg, tmp;
    __m128i state0, state1;
    __m128i abef_save, cdgh_save;

    // Load the current state (two 128-bit halves)
    tmp = _mm_loadu_si128((const __m128i*)&state[0]);
    state1 = _mm_loadu_si128((const __m128i*)&state[4]);

    // Reorder state words into the ABEF/CDGH layout required by SHA-NI
    tmp = _mm_shuffle_epi32(tmp, 0xB1);          // CDAB
    state1 = _mm_shuffle_epi32(state1, 0x1B);    // EFGH
    state0 = _mm_alignr_epi8(tmp, state1, 8);    // ABEF
    state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH

    // Save initial state
    abef_save = state0;
    cdgh_save = state1;

    // Load message blocks with byte swap
    msg0 = _mm_loadu_si128((const __m128i*)(block + 0));
    msg1 = _mm_loadu_si128((const __m128i*)(block + 16));
    msg2 = _mm_loadu_si128((const __m128i*)(block + 32));
    msg3 = _mm_loadu_si128((const __m128i*)(block + 48));

    msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK);
    msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK);
    msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK);
    msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK);

    // Rounds 0-3
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 4-7
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 8-11
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 12-15
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 16-19
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 20-23
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 24-27
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 28-31
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 32-35
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 36-39
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[36]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0 = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 40-43
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1 = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 44-47
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg3, msg2, 4);
    msg0 = _mm_add_epi32(msg0, tmp);
    msg0 = _mm_sha256msg2_epu32(msg0, msg3);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2 = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 48-51
    msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg0, msg3, 4);
    msg1 = _mm_add_epi32(msg1, tmp);
    msg1 = _mm_sha256msg2_epu32(msg1, msg0);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3 = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 52-55
    msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg1, msg0, 4);
    msg2 = _mm_add_epi32(msg2, tmp);
    msg2 = _mm_sha256msg2_epu32(msg2, msg1);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 56-59
    msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp = _mm_alignr_epi8(msg2, msg1, 4);
    msg3 = _mm_add_epi32(msg3, tmp);
    msg3 = _mm_sha256msg2_epu32(msg3, msg2);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 60-63
    msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Add the saved initial state (Davies-Meyer feed-forward)
    state0 = _mm_add_epi32(state0, abef_save);
    state1 = _mm_add_epi32(state1, cdgh_save);

    // Reorder words back to the linear layout and store
    tmp = _mm_shuffle_epi32(state0, 0x1B);       // FEBA
    state1 = _mm_shuffle_epi32(state1, 0xB1);    // DCHG
    state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
    state1 = _mm_alignr_epi8(state1, tmp, 8);    // HGFE

    _mm_storeu_si128((__m128i*)&state[0], state0);
    _mm_storeu_si128((__m128i*)&state[4], state1);
}
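
/* Not in the original file: these stubs execute SHA-NI instructions
   unconditionally, so callers are expected to gate them on CPU support.
   A minimal sketch of such a check (the helper name is hypothetical;
   __builtin_cpu_supports is a GCC/Clang builtin): */
static inline int oxcaml_sha_ni_available(void) {
    return __builtin_cpu_supports("sha");
}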

// OCaml interface functions

// Initialize SHA256 state
CAMLprim value oxcaml_sha256_init(value unit) {
    CAMLparam1(unit);
    CAMLlocal1(state);

    // Allocate bigarray for state (8 x int32); caml_ba_alloc_dims takes
    // the dimensions as variadic intnat arguments, not as an array
    state = caml_ba_alloc_dims(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, (intnat)8);
    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);

    // Copy initial values
    memcpy(s, H256_INIT, sizeof(H256_INIT));

    CAMLreturn(state);
}

// Process a single 512-bit block
CAMLprim value oxcaml_sha256_process_block(value state, value block) {
    CAMLparam2(state, block);

    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
    uint8_t* b = (uint8_t*)Caml_ba_data_val(block);

    sha256_process_block_shani(s, b);

    CAMLreturn(Val_unit);
}

// Finalize hash with padding and return digest
CAMLprim value oxcaml_sha256_finalize(value state, value data, value len_v) {
    CAMLparam3(state, data, len_v);
    CAMLlocal1(result);

    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
    uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
    uint64_t len = Int64_val(len_v);

    // Process full blocks
    uint64_t full_blocks = len / 64;
    for (uint64_t i = 0; i < full_blocks; i++) {
        sha256_process_block_shani(s, input + i * 64);
    }

    // Handle final block with padding; only the first 64 bytes are used,
    // the buffer is reused when padding spills into a second block
    uint8_t final_block[128] = {0};
    uint64_t remaining = len % 64;

    // Copy remaining bytes
    if (remaining > 0) {
        memcpy(final_block, input + full_blocks * 64, remaining);
    }

    // Append the mandatory 0x80 terminator byte
    final_block[remaining] = 0x80;

    // Add length in bits at the end
    uint64_t bit_len = len * 8;
    if (remaining >= 56) {
        // No room left for the 64-bit length: flush and pad a second block
        sha256_process_block_shani(s, final_block);
        memset(final_block, 0, 64);
    }

    // Add bit length (big-endian)
    final_block[56] = (bit_len >> 56) & 0xFF;
    final_block[57] = (bit_len >> 48) & 0xFF;
    final_block[58] = (bit_len >> 40) & 0xFF;
    final_block[59] = (bit_len >> 32) & 0xFF;
    final_block[60] = (bit_len >> 24) & 0xFF;
    final_block[61] = (bit_len >> 16) & 0xFF;
    final_block[62] = (bit_len >> 8) & 0xFF;
    final_block[63] = bit_len & 0xFF;

    sha256_process_block_shani(s, final_block);

    // Create result bigarray (32 bytes)
    result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, (intnat)32);
    uint8_t* res = (uint8_t*)Caml_ba_data_val(result);

    // Convert to big-endian bytes
    for (int i = 0; i < 8; i++) {
        res[i*4 + 0] = (s[i] >> 24) & 0xFF;
        res[i*4 + 1] = (s[i] >> 16) & 0xFF;
        res[i*4 + 2] = (s[i] >> 8) & 0xFF;
        res[i*4 + 3] = s[i] & 0xFF;
    }

    CAMLreturn(result);
}
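
/* Padding walkthrough (illustrative): for a 3-byte message the single final
   block is laid out as  msg[0..2] | 0x80 | 52 zero bytes | 64-bit big-endian
   bit length (3 * 8 = 24) in bytes 56..63.  When remaining >= 56 the 0x80
   byte leaves no room for the length field, which is why the code above
   flushes one block and emits a second, all-padding block. */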

// Fast one-shot SHA256
CAMLprim value oxcaml_sha256_oneshot(value data, value len_v) {
    CAMLparam2(data, len_v);
    CAMLlocal1(result);

    uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
    uint64_t len = Int64_val(len_v);

    // Hash state lives on the C stack; nothing to allocate until the result
    alignas(16) uint32_t state[8];
    memcpy(state, H256_INIT, sizeof(H256_INIT));

    // Process full blocks
    uint64_t full_blocks = len / 64;
    for (uint64_t i = 0; i < full_blocks; i++) {
        sha256_process_block_shani(state, input + i * 64);
    }

    // Handle final block with padding (buffer reused for a second block)
    alignas(64) uint8_t final_block[128] = {0};
    uint64_t remaining = len % 64;

    if (remaining > 0) {
        memcpy(final_block, input + full_blocks * 64, remaining);
    }

    // Append the mandatory 0x80 terminator byte
    final_block[remaining] = 0x80;

    uint64_t bit_len = len * 8;
    if (remaining >= 56) {
        // No room left for the 64-bit length: flush and pad a second block
        sha256_process_block_shani(state, final_block);
        memset(final_block, 0, 64);
    }

    // Add bit length (big-endian)
    final_block[56] = (bit_len >> 56) & 0xFF;
    final_block[57] = (bit_len >> 48) & 0xFF;
    final_block[58] = (bit_len >> 40) & 0xFF;
    final_block[59] = (bit_len >> 32) & 0xFF;
    final_block[60] = (bit_len >> 24) & 0xFF;
    final_block[61] = (bit_len >> 16) & 0xFF;
    final_block[62] = (bit_len >> 8) & 0xFF;
    final_block[63] = bit_len & 0xFF;

    sha256_process_block_shani(state, final_block);

    // Create result bigarray (32 bytes)
    result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, (intnat)32);
    uint8_t* res = (uint8_t*)Caml_ba_data_val(result);

    // Convert to big-endian bytes
    for (int i = 0; i < 8; i++) {
        res[i*4 + 0] = (state[i] >> 24) & 0xFF;
        res[i*4 + 1] = (state[i] >> 16) & 0xFF;
        res[i*4 + 2] = (state[i] >> 8) & 0xFF;
        res[i*4 + 3] = state[i] & 0xFF;
    }

    CAMLreturn(result);
}
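
/* Hypothetical OCaml-side bindings for these stubs (assumed, not part of the
   original file). CAML_BA_INT32 corresponds to int32 elements and
   CAML_BA_UINT8 to int elements on the OCaml side:

   external sha256_init :
     unit -> (int32, Bigarray.int32_elt, Bigarray.c_layout) Bigarray.Array1.t
     = "oxcaml_sha256_init"

   external sha256_process_block :
     (int32, Bigarray.int32_elt, Bigarray.c_layout) Bigarray.Array1.t ->
     (int, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t -> unit
     = "oxcaml_sha256_process_block"

   external sha256_oneshot :
     (int, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t ->
     int64 -> (int, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t
     = "oxcaml_sha256_oneshot"
*/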