#include <stdint.h>
#include <string.h>
#include <stdalign.h>
#include <immintrin.h>

#include <caml/mlvalues.h>
#include <caml/memory.h>
#include <caml/bigarray.h>

// Aligned storage for round constants
alignas(64) static const uint32_t K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

// Initial SHA256 state values
alignas(16) static const uint32_t H256_INIT[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

// Byte-swap mask for endianness: reverses the bytes of each 32-bit word so the
// big-endian message words land in host order. Relies on the GCC/Clang
// __m128i brace-initializer extension; element 0 is the low 64-bit lane.
static const __m128i BSWAP_MASK = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};

// Process a single 512-bit block using SHA-NI instructions
static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64])
{
    __m128i msg0, msg1, msg2, msg3;
    __m128i tmp;
    __m128i state0, state1;
    __m128i msg;
    __m128i abef_save, cdgh_save;

    // Load initial state
    tmp    = _mm_loadu_si128((const __m128i*)&state[0]);
    state1 = _mm_loadu_si128((const __m128i*)&state[4]);

    // Reorder the state words into the ABEF/CDGH layout expected by SHA-NI
    tmp    = _mm_shuffle_epi32(tmp, 0xB1);          // CDAB
    state1 = _mm_shuffle_epi32(state1, 0x1B);       // EFGH
    state0 = _mm_alignr_epi8(tmp, state1, 8);       // ABEF
    state1 = _mm_blend_epi16(state1, tmp, 0xF0);    // CDGH

    // Save initial state
    abef_save = state0;
    cdgh_save = state1;

    // Load message blocks with byte swap
    msg0 = _mm_loadu_si128((const __m128i*)(block + 0));
    msg1 = _mm_loadu_si128((const __m128i*)(block + 16));
    msg2 = _mm_loadu_si128((const __m128i*)(block + 32));
    msg3 = _mm_loadu_si128((const __m128i*)(block + 48));
    msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK);
    msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK);
    msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK);
    msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK);

    // Rounds 0-3
    msg    = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 4-7
    msg    = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0   = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 8-11
    msg    = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1   = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 12-15
    msg    = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg3, msg2, 4);
    msg0   = _mm_add_epi32(msg0, tmp);
    msg0   = _mm_sha256msg2_epu32(msg0, msg3);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2   = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 16-19
    msg    = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg0, msg3, 4);
    msg1   = _mm_add_epi32(msg1, tmp);
    msg1   = _mm_sha256msg2_epu32(msg1, msg0);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3   = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 20-23
    msg    = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg1, msg0, 4);
    msg2   = _mm_add_epi32(msg2, tmp);
    msg2   = _mm_sha256msg2_epu32(msg2, msg1);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0   = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 24-27
    msg    = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg2, msg1, 4);
    msg3   = _mm_add_epi32(msg3, tmp);
    msg3   = _mm_sha256msg2_epu32(msg3, msg2);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1   = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 28-31
    msg    = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg3, msg2, 4);
    msg0   = _mm_add_epi32(msg0, tmp);
    msg0   = _mm_sha256msg2_epu32(msg0, msg3);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2   = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 32-35
    msg    = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg0, msg3, 4);
    msg1   = _mm_add_epi32(msg1, tmp);
    msg1   = _mm_sha256msg2_epu32(msg1, msg0);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3   = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 36-39
    msg    = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[36]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg1, msg0, 4);
    msg2   = _mm_add_epi32(msg2, tmp);
    msg2   = _mm_sha256msg2_epu32(msg2, msg1);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg0   = _mm_sha256msg1_epu32(msg0, msg1);

    // Rounds 40-43
    msg    = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg2, msg1, 4);
    msg3   = _mm_add_epi32(msg3, tmp);
    msg3   = _mm_sha256msg2_epu32(msg3, msg2);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg1   = _mm_sha256msg1_epu32(msg1, msg2);

    // Rounds 44-47
    msg    = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg3, msg2, 4);
    msg0   = _mm_add_epi32(msg0, tmp);
    msg0   = _mm_sha256msg2_epu32(msg0, msg3);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg2   = _mm_sha256msg1_epu32(msg2, msg3);

    // Rounds 48-51
    msg    = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg0, msg3, 4);
    msg1   = _mm_add_epi32(msg1, tmp);
    msg1   = _mm_sha256msg2_epu32(msg1, msg0);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
    msg3   = _mm_sha256msg1_epu32(msg3, msg0);

    // Rounds 52-55
    msg    = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg1, msg0, 4);
    msg2   = _mm_add_epi32(msg2, tmp);
    msg2   = _mm_sha256msg2_epu32(msg2, msg1);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 56-59
    msg    = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    tmp    = _mm_alignr_epi8(msg2, msg1, 4);
    msg3   = _mm_add_epi32(msg3, tmp);
    msg3   = _mm_sha256msg2_epu32(msg3, msg2);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Rounds 60-63
    msg    = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60]));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    msg    = _mm_shuffle_epi32(msg, 0x0E);
    state0 = _mm_sha256rnds2_epu32(state0, state1, msg);

    // Add the saved initial state
    state0 = _mm_add_epi32(state0, abef_save);
    state1 = _mm_add_epi32(state1, cdgh_save);

    // Reorder back to the linear A..H word layout and store
    tmp    = _mm_shuffle_epi32(state0, 0x1B);       // FEBA
    state1 = _mm_shuffle_epi32(state1, 0xB1);       // DCHG
    state0 = _mm_blend_epi16(tmp, state1, 0xF0);    // DCBA
    state1 = _mm_alignr_epi8(state1, tmp, 8);       // HGFE

    _mm_storeu_si128((__m128i*)&state[0], state0);
    _mm_storeu_si128((__m128i*)&state[4], state1);
}

// OCaml interface functions

// Initialize SHA256 state
CAMLprim value oxcaml_sha256_init(value unit)
{
    CAMLparam1(unit);
    CAMLlocal1(state);

    // Allocate a bigarray for the state (8 x int32)
    intnat dims[1] = {8};
    state = caml_ba_alloc(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, dims);
    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);

    // Copy initial values
    memcpy(s, H256_INIT, sizeof(H256_INIT));

    CAMLreturn(state);
}

// Process a single 512-bit block
CAMLprim value oxcaml_sha256_process_block(value state, value block)
{
    CAMLparam2(state, block);
    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
    uint8_t* b = (uint8_t*)Caml_ba_data_val(block);

    sha256_process_block_shani(s, b);

    CAMLreturn(Val_unit);
}

// Finalize hash with padding and return the digest
CAMLprim value oxcaml_sha256_finalize(value state, value data, value len_v)
{
    CAMLparam3(state, data, len_v);
    CAMLlocal1(result);

    uint32_t* s = (uint32_t*)Caml_ba_data_val(state);
    uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
    uint64_t len = Int64_val(len_v);

    // Process full blocks
    uint64_t full_blocks = len / 64;
    for (uint64_t i = 0; i < full_blocks; i++) {
        sha256_process_block_shani(s, input + i * 64);
    }

    // Handle the final block with padding
    uint8_t final_block[128] = {0};     // room for up to two padding blocks
    uint64_t remaining = len % 64;

    // Copy remaining bytes
    if (remaining > 0) {
        memcpy(final_block, input + full_blocks * 64, remaining);
    }

    // Append the padding bit
    final_block[remaining] = 0x80;

    uint64_t bit_len = len * 8;
    if (remaining >= 56) {
        // No room left for the 8-byte length: it goes into a second block
        sha256_process_block_shani(s, final_block);
        memset(final_block, 0, 64);
    }

    // Append the message length in bits (big-endian)
    final_block[56] = (bit_len >> 56) & 0xFF;
    final_block[57] = (bit_len >> 48) & 0xFF;
    final_block[58] = (bit_len >> 40) & 0xFF;
    final_block[59] = (bit_len >> 32) & 0xFF;
    final_block[60] = (bit_len >> 24) & 0xFF;
    final_block[61] = (bit_len >> 16) & 0xFF;
    final_block[62] = (bit_len >> 8) & 0xFF;
    final_block[63] = bit_len & 0xFF;
    sha256_process_block_shani(s, final_block);

    // Create the result bigarray (32 bytes)
    intnat dims2[1] = {32};
    result = caml_ba_alloc(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims2);
    uint8_t* res = (uint8_t*)Caml_ba_data_val(result);

    // Convert the state words to big-endian bytes
    for (int i = 0; i < 8; i++) {
        res[i*4 + 0] = (s[i] >> 24) & 0xFF;
        res[i*4 + 1] = (s[i] >> 16) & 0xFF;
        res[i*4 + 2] = (s[i] >> 8) & 0xFF;
        res[i*4 + 3] = s[i] & 0xFF;
    }

    CAMLreturn(result);
}
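/*
 * These stubs execute SHA-NI instructions unconditionally and will fault
 * (SIGILL) on CPUs without the SHA extensions. Below is a minimal sketch of a
 * runtime feature check, assuming GCC/Clang's <cpuid.h>; the stub name
 * oxcaml_sha256_available is hypothetical and not part of the original
 * interface, and the #include would normally live at the top of the file.
 * SHA support is reported in CPUID.(EAX=07H, ECX=0):EBX bit 29.
 */
#include <cpuid.h>

CAMLprim value oxcaml_sha256_available(value unit)
{
    CAMLparam1(unit);
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    // Leaf 7, subleaf 0; bit 29 of EBX indicates the SHA extensions
    int ok = __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 29));
    CAMLreturn(Val_bool(ok));
}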
// Fast one-shot SHA256
CAMLprim value oxcaml_sha256_oneshot(value data, value len_v)
{
    CAMLparam2(data, len_v);
    CAMLlocal1(result);

    uint8_t* input = (uint8_t*)Caml_ba_data_val(data);
    uint64_t len = Int64_val(len_v);

    // Local state
    alignas(16) uint32_t state[8];
    memcpy(state, H256_INIT, sizeof(H256_INIT));

    // Process full blocks
    uint64_t full_blocks = len / 64;
    for (uint64_t i = 0; i < full_blocks; i++) {
        sha256_process_block_shani(state, input + i * 64);
    }

    // Handle the final block with padding
    alignas(64) uint8_t final_block[128] = {0};
    uint64_t remaining = len % 64;
    if (remaining > 0) {
        memcpy(final_block, input + full_blocks * 64, remaining);
    }
    final_block[remaining] = 0x80;

    uint64_t bit_len = len * 8;
    if (remaining >= 56) {
        // No room left for the 8-byte length: it goes into a second block
        sha256_process_block_shani(state, final_block);
        memset(final_block, 0, 64);
    }

    // Append the message length in bits (big-endian)
    final_block[56] = (bit_len >> 56) & 0xFF;
    final_block[57] = (bit_len >> 48) & 0xFF;
    final_block[58] = (bit_len >> 40) & 0xFF;
    final_block[59] = (bit_len >> 32) & 0xFF;
    final_block[60] = (bit_len >> 24) & 0xFF;
    final_block[61] = (bit_len >> 16) & 0xFF;
    final_block[62] = (bit_len >> 8) & 0xFF;
    final_block[63] = bit_len & 0xFF;
    sha256_process_block_shani(state, final_block);

    // Create the result bigarray (32 bytes)
    intnat dims[1] = {32};
    result = caml_ba_alloc(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims);
    uint8_t* res = (uint8_t*)Caml_ba_data_val(result);

    // Convert the state words to big-endian bytes
    for (int i = 0; i < 8; i++) {
        res[i*4 + 0] = (state[i] >> 24) & 0xFF;
        res[i*4 + 1] = (state[i] >> 16) & 0xFF;
        res[i*4 + 2] = (state[i] >> 8) & 0xFF;
        res[i*4 + 3] = state[i] & 0xFF;
    }

    CAMLreturn(result);
}
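/*
 * Optional known-answer self-test, a sketch for building this file standalone:
 * the SHA256_SELFTEST guard and the test driver below are not part of the
 * original interface. It hashes the one-block message "abc" by padding it by
 * hand and checks the resulting state words against the FIPS 180-4 test
 * vector. With GCC/Clang, compiling typically needs something like
 * -msse4.1 -msha (or -march=native) plus the OCaml include path.
 */
#ifdef SHA256_SELFTEST
#include <stdio.h>

int main(void)
{
    alignas(16) uint32_t st[8];
    uint8_t blk[64] = {0};

    memcpy(st, H256_INIT, sizeof(st));
    blk[0] = 'a'; blk[1] = 'b'; blk[2] = 'c';
    blk[3] = 0x80;   // padding bit
    blk[63] = 24;    // message length in bits, big-endian
    sha256_process_block_shani(st, blk);

    // Expected digest words for SHA-256("abc")
    static const uint32_t expect[8] = {
        0xba7816bf, 0x8f01cfa4, 0x414140de, 0x5dae2223,
        0xb00361a3, 0x96177a9c, 0xb410ff61, 0xf20015ad
    };
    int ok = memcmp(st, expect, sizeof(expect)) == 0;
    printf("SHA-256(\"abc\") self-test: %s\n", ok ? "PASS" : "FAIL");
    return ok ? 0 : 1;
}
#endif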