// Experimental SHA-256 hashing primitives (x86 SHA-NI) with OxCaml bindings.
1#include <immintrin.h> 2#include <stdint.h> 3#include <string.h> 4#include <caml/mlvalues.h> 5#include <caml/memory.h> 6#include <caml/alloc.h> 7#include <caml/bigarray.h> 8 9// Aligned storage for round constants 10alignas(64) static const uint32_t K256[64] = { 11 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 12 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 13 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 14 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 15 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 16 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 18 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 19 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 20 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 21 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 22 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 23 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 24 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 25 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 26 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 27}; 28 29// Initial SHA256 state values 30alignas(16) static const uint32_t H256_INIT[8] = { 31 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 32 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 33}; 34 35// Byte swap for endianness 36static const __m128i BSWAP_MASK = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; 37 38// Process a single 512-bit block using SHA-NI instructions 39static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64]) { 40 __m128i msg0, msg1, msg2, msg3; 41 __m128i tmp; 42 __m128i state0, state1; 43 __m128i msg; 44 __m128i abef_save, cdgh_save; 45 46 // Load initial state 47 tmp = _mm_loadu_si128((const __m128i*)&state[0]); 48 state1 = _mm_loadu_si128((const __m128i*)&state[4]); 49 50 // Swap byte order for initial state 51 tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB 52 state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH 53 state0 = _mm_alignr_epi8(tmp, state1, 8); 
// ABEF 54 state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH 55 56 // Save initial state 57 abef_save = state0; 58 cdgh_save = state1; 59 60 // Load message blocks with byte swap 61 msg0 = _mm_loadu_si128((const __m128i*)(block + 0)); 62 msg1 = _mm_loadu_si128((const __m128i*)(block + 16)); 63 msg2 = _mm_loadu_si128((const __m128i*)(block + 32)); 64 msg3 = _mm_loadu_si128((const __m128i*)(block + 48)); 65 66 msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK); 67 msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK); 68 msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK); 69 msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK); 70 71 // Rounds 0-3 72 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0])); 73 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 74 msg = _mm_shuffle_epi32(msg, 0x0E); 75 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 76 77 // Rounds 4-7 78 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4])); 79 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 80 msg = _mm_shuffle_epi32(msg, 0x0E); 81 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 82 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 83 84 // Rounds 8-11 85 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8])); 86 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 87 msg = _mm_shuffle_epi32(msg, 0x0E); 88 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 89 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 90 91 // Rounds 12-15 92 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12])); 93 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 94 tmp = _mm_alignr_epi8(msg3, msg2, 4); 95 msg0 = _mm_add_epi32(msg0, tmp); 96 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 97 msg = _mm_shuffle_epi32(msg, 0x0E); 98 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 99 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 100 101 // Rounds 16-19 102 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16])); 103 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 104 tmp = 
_mm_alignr_epi8(msg0, msg3, 4); 105 msg1 = _mm_add_epi32(msg1, tmp); 106 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 107 msg = _mm_shuffle_epi32(msg, 0x0E); 108 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 109 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 110 111 // Rounds 20-23 112 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20])); 113 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 114 tmp = _mm_alignr_epi8(msg1, msg0, 4); 115 msg2 = _mm_add_epi32(msg2, tmp); 116 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 117 msg = _mm_shuffle_epi32(msg, 0x0E); 118 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 119 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 120 121 // Rounds 24-27 122 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24])); 123 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 124 tmp = _mm_alignr_epi8(msg2, msg1, 4); 125 msg3 = _mm_add_epi32(msg3, tmp); 126 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 127 msg = _mm_shuffle_epi32(msg, 0x0E); 128 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 129 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 130 131 // Rounds 28-31 132 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28])); 133 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 134 tmp = _mm_alignr_epi8(msg3, msg2, 4); 135 msg0 = _mm_add_epi32(msg0, tmp); 136 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 137 msg = _mm_shuffle_epi32(msg, 0x0E); 138 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 139 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 140 141 // Rounds 32-35 142 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32])); 143 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 144 tmp = _mm_alignr_epi8(msg0, msg3, 4); 145 msg1 = _mm_add_epi32(msg1, tmp); 146 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 147 msg = _mm_shuffle_epi32(msg, 0x0E); 148 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 149 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 150 151 // Rounds 36-39 152 msg = _mm_add_epi32(msg1, 
_mm_load_si128((const __m128i*)&K256[36])); 153 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 154 tmp = _mm_alignr_epi8(msg1, msg0, 4); 155 msg2 = _mm_add_epi32(msg2, tmp); 156 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 157 msg = _mm_shuffle_epi32(msg, 0x0E); 158 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 159 msg0 = _mm_sha256msg1_epu32(msg0, msg1); 160 161 // Rounds 40-43 162 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40])); 163 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 164 tmp = _mm_alignr_epi8(msg2, msg1, 4); 165 msg3 = _mm_add_epi32(msg3, tmp); 166 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 167 msg = _mm_shuffle_epi32(msg, 0x0E); 168 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 169 msg1 = _mm_sha256msg1_epu32(msg1, msg2); 170 171 // Rounds 44-47 172 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44])); 173 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 174 tmp = _mm_alignr_epi8(msg3, msg2, 4); 175 msg0 = _mm_add_epi32(msg0, tmp); 176 msg0 = _mm_sha256msg2_epu32(msg0, msg3); 177 msg = _mm_shuffle_epi32(msg, 0x0E); 178 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 179 msg2 = _mm_sha256msg1_epu32(msg2, msg3); 180 181 // Rounds 48-51 182 msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48])); 183 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 184 tmp = _mm_alignr_epi8(msg0, msg3, 4); 185 msg1 = _mm_add_epi32(msg1, tmp); 186 msg1 = _mm_sha256msg2_epu32(msg1, msg0); 187 msg = _mm_shuffle_epi32(msg, 0x0E); 188 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 189 msg3 = _mm_sha256msg1_epu32(msg3, msg0); 190 191 // Rounds 52-55 192 msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52])); 193 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 194 tmp = _mm_alignr_epi8(msg1, msg0, 4); 195 msg2 = _mm_add_epi32(msg2, tmp); 196 msg2 = _mm_sha256msg2_epu32(msg2, msg1); 197 msg = _mm_shuffle_epi32(msg, 0x0E); 198 state0 = _mm_sha256rnds2_epu32(state0, state1, 
msg); 199 200 // Rounds 56-59 201 msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56])); 202 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 203 tmp = _mm_alignr_epi8(msg2, msg1, 4); 204 msg3 = _mm_add_epi32(msg3, tmp); 205 msg3 = _mm_sha256msg2_epu32(msg3, msg2); 206 msg = _mm_shuffle_epi32(msg, 0x0E); 207 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 208 209 // Rounds 60-63 210 msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60])); 211 state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 212 msg = _mm_shuffle_epi32(msg, 0x0E); 213 state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 214 215 // Add initial state 216 state0 = _mm_add_epi32(state0, abef_save); 217 state1 = _mm_add_epi32(state1, cdgh_save); 218 219 // Swap byte order back and store 220 tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA 221 state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG 222 state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA 223 state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE 224 225 _mm_storeu_si128((__m128i*)&state[0], state0); 226 _mm_storeu_si128((__m128i*)&state[4], state1); 227} 228 229// OCaml interface functions 230 231// Initialize SHA256 state 232value oxcaml_sha256_init(value unit) { 233 CAMLparam1(unit); 234 CAMLlocal1(state); 235 236 // Allocate bigarray for state (8 x int32) 237 long dims[1] = {8}; 238 state = caml_ba_alloc_dims(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, dims); 239 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 240 241 // Copy initial values 242 memcpy(s, H256_INIT, 32); 243 244 CAMLreturn(state); 245} 246 247// Process a single 512-bit block 248value oxcaml_sha256_process_block(value state, value block) { 249 CAMLparam2(state, block); 250 251 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 252 uint8_t* b = (uint8_t*)Caml_ba_data_val(block); 253 254 sha256_process_block_shani(s, b); 255 256 CAMLreturn(Val_unit); 257} 258 259// Finalize hash with padding and return digest 260value 
oxcaml_sha256_finalize(value state, value data, value len_v) { 261 CAMLparam3(state, data, len_v); 262 CAMLlocal1(result); 263 264 uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 265 uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 266 uint64_t len = Int64_val(len_v); 267 268 // Process full blocks 269 uint64_t full_blocks = len / 64; 270 for (uint64_t i = 0; i < full_blocks; i++) { 271 sha256_process_block_shani(s, input + i * 64); 272 } 273 274 // Handle final block with padding 275 uint8_t final_block[128] = {0}; // Max 2 blocks for padding 276 uint64_t remaining = len % 64; 277 278 // Copy remaining bytes 279 if (remaining > 0) { 280 memcpy(final_block, input + full_blocks * 64, remaining); 281 } 282 283 // Add padding 284 final_block[remaining] = 0x80; 285 286 // Add length in bits at the end 287 uint64_t bit_len = len * 8; 288 if (remaining >= 56) { 289 // Need two blocks 290 sha256_process_block_shani(s, final_block); 291 memset(final_block, 0, 64); 292 } 293 294 // Add bit length (big-endian) 295 final_block[56] = (bit_len >> 56) & 0xFF; 296 final_block[57] = (bit_len >> 48) & 0xFF; 297 final_block[58] = (bit_len >> 40) & 0xFF; 298 final_block[59] = (bit_len >> 32) & 0xFF; 299 final_block[60] = (bit_len >> 24) & 0xFF; 300 final_block[61] = (bit_len >> 16) & 0xFF; 301 final_block[62] = (bit_len >> 8) & 0xFF; 302 final_block[63] = bit_len & 0xFF; 303 304 sha256_process_block_shani(s, final_block); 305 306 // Create result bigarray (32 bytes) 307 long dims[1] = {32}; 308 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 309 uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 310 311 // Convert to big-endian bytes 312 for (int i = 0; i < 8; i++) { 313 res[i*4 + 0] = (s[i] >> 24) & 0xFF; 314 res[i*4 + 1] = (s[i] >> 16) & 0xFF; 315 res[i*4 + 2] = (s[i] >> 8) & 0xFF; 316 res[i*4 + 3] = s[i] & 0xFF; 317 } 318 319 CAMLreturn(result); 320} 321 322// Fast one-shot SHA256 323value oxcaml_sha256_oneshot(value data, value len_v) { 324 
CAMLparam2(data, len_v); 325 CAMLlocal1(result); 326 327 uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 328 uint64_t len = Int64_val(len_v); 329 330 // Local state 331 alignas(16) uint32_t state[8]; 332 memcpy(state, H256_INIT, 32); 333 334 // Process full blocks 335 uint64_t full_blocks = len / 64; 336 for (uint64_t i = 0; i < full_blocks; i++) { 337 sha256_process_block_shani(state, input + i * 64); 338 } 339 340 // Handle final block with padding 341 alignas(64) uint8_t final_block[128] = {0}; 342 uint64_t remaining = len % 64; 343 344 if (remaining > 0) { 345 memcpy(final_block, input + full_blocks * 64, remaining); 346 } 347 348 final_block[remaining] = 0x80; 349 350 uint64_t bit_len = len * 8; 351 if (remaining >= 56) { 352 sha256_process_block_shani(state, final_block); 353 memset(final_block, 0, 64); 354 } 355 356 // Add bit length (big-endian) 357 final_block[56] = (bit_len >> 56) & 0xFF; 358 final_block[57] = (bit_len >> 48) & 0xFF; 359 final_block[58] = (bit_len >> 40) & 0xFF; 360 final_block[59] = (bit_len >> 32) & 0xFF; 361 final_block[60] = (bit_len >> 24) & 0xFF; 362 final_block[61] = (bit_len >> 16) & 0xFF; 363 final_block[62] = (bit_len >> 8) & 0xFF; 364 final_block[63] = bit_len & 0xFF; 365 366 sha256_process_block_shani(state, final_block); 367 368 // Create result bigarray 369 long dims[1] = {32}; 370 result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 371 uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 372 373 // Convert to big-endian bytes 374 for (int i = 0; i < 8; i++) { 375 res[i*4 + 0] = (state[i] >> 24) & 0xFF; 376 res[i*4 + 1] = (state[i] >> 16) & 0xFF; 377 res[i*4 + 2] = (state[i] >> 8) & 0xFF; 378 res[i*4 + 3] = state[i] & 0xFF; 379 } 380 381 CAMLreturn(result); 382}