/*
 * Adapted from D. J. Bernstein's implementation (cr.yp.to)
 * Public Domain
 *
 * Copyright (c) Paul Stankovski
 * Free for all non-commercial use unless this directive conflicts with
 * other applicable copyright statement(s), patent holders, laws or such.
 */

#include "black_box_salsa.h"
#include <stdlib.h>

/******************************************************************************
 * Salsa20/12
 ******************************************************************************/
/* Keep only the low 32 bits of v. UINT32 and U32C are not defined in this
 * file; presumably they come from black_box_salsa.h. */
#define U32V(v) ((UINT32)(v) & U32C(0xFFFFFFFF))
/* Identity mapping: no byte swapping is performed, so the "LITTLE" load and
 * store helpers below use the host's native byte order (which is
 * little-endian only on a little-endian host). */
#define U32TO32_LITTLE(v) (v)
/* Read one UINT32 from p through a cast pointer.
 * NOTE(review): this presumes p is suitably aligned for UINT32 access --
 * confirm for every caller-supplied buffer. */
#define U8TO32_LITTLE(p) U32TO32_LITTLE(((UINT32*)(p))[0])
/* Store v as one UINT32 at p through a cast pointer (same alignment caveat
 * as U8TO32_LITTLE). */
#define U32TO8_LITTLE(p, v) (((UINT32*)(p))[0] = U32TO32_LITTLE(v))
/* Rotate v left by n bits. The right-shift half relies on v having no bits
 * set above bit 31 and on 0 < n < 32. Note that v is evaluated twice. */
#define ROTL32(v, n) (U32V((v) << (n)) | ((v) >> (32 - (n))))
/* Spelling used by the round code below; simply forwards to ROTL32. */
#define ROTATE(v,c) (ROTL32(v,c))
/* Bitwise exclusive OR. */
#define XOR(v,w) ((v) ^ (w))
/* Addition with the result truncated to 32 bits. */
#define PLUS(v,w) (U32V((v) + (w)))
/* Increment with the result truncated to 32 bits. */
#define PLUSONE(v) (PLUS((v),1))

/* Number of rounds run by the core functions; the round loops step by two. */
static const int numRounds = 12; /* 8 for Salsa20/8, 12 for Salsa20/12, 20 for Salsa20/20 */
/* Key length in bytes; the black box wrappers pass numKeyBytes * 8 as kbits. */
static const int numKeyBytes = 16; /* 32 also supported */
/* IV length in bytes as reported by getBlackBoxSalsa2012Info;
 * salsa20_ivsetup reads two 32-bit words from the IV. */
static const int numIvBytes = 8;
/* Size of the intermediate-round output that precedes the final round's
 * 64-byte block in the "withInitOutput" variants. Written as a literal, so
 * it has to be kept in step with numRounds by hand. */
static const int numSuppressedBytes = 11 * 64; /* (numRounds - 1) * 64 */
/* Reported by getBlackBoxSalsa2012Info as the implicit block size.
 * NOTE(review): its meaning is defined by the consumer of that API -- confirm. */
static const int implicitBlockBytes = 1;

/*
 * Cipher state: the 16-word block that is fed to the core function.
 * As filled in by the setup and encryption routines in this file:
 *   input[0], [5], [10], [15]   constant words (sigma or tau)
 *   input[1..4], input[11..14]  key words
 *   input[6], input[7]          IV words
 *   input[8], input[9]          block counter, low word then high word
 */
typedef struct {
  UINT32 input[16];
} Salsa20_ctx;

/*
 * Salsa20 core: run numRounds rounds over a copy of input, add the original
 * input words back in and write the resulting 64 bytes to output.
 *
 * output  receives 64 bytes; no particular alignment is required.
 * input   16-word cipher state; it is not modified.
 *
 * The words are written in host byte order, which is exactly what the
 * identity U32TO32_LITTLE mapping used elsewhere in this file produces.
 * Like the rest of the file, this assumes UINT32 occupies four bytes.
 */
static void salsa20_wordtobyte(BYTE output[64], const UINT32 input[16])
{
  UINT32 x[16];
  const unsigned char *xBytes = (const unsigned char *)x;
  int i;

  for (i = 0;i < 16;++i) x[i] = input[i];
  /* Each pass performs two rounds: the first 16 statements work down the
   * columns of the 4x4 word matrix, the last 16 work along its rows. */
  for (i = numRounds;i > 0;i -= 2) {
    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
    x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
    x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
    x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
    x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
    x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
    x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));
    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
    x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
    x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
    x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
    x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
    x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
    x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));
  }
  /* Feed-forward: add the untouched input words to the mixed state. */
  for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]);
  /* Copy byte by byte rather than storing through a UINT32 pointer: output
   * is a plain byte buffer and need not be aligned for word access. */
  for (i = 0;i < 64;++i) output[i] = (BYTE)xBytes[i];
}

/*
 * XOR one 64-byte snapshot, x[i] + input[i] for i = 0..15, into the caller's
 * buffer, stopping as soon as the remaining byte budget is used up.
 *
 * op        in/out cursor into the output buffer; advanced past the bytes
 *           that were touched.
 * numBytes  in/out number of bytes still to be produced.
 * x         current working state.
 * input     original cipher state (added to x word by word).
 *
 * Returns 1 when the budget is exhausted and the caller should stop,
 * 0 when more output is still wanted.
 *
 * Each word is applied in host byte order, one byte at a time, so the
 * output buffer needs no particular alignment.
 */
static int salsa20_xorInitBlock(BYTE **op, int *numBytes, const UINT32 x[16], const UINT32 input[16])
{
  int w;

  for (w = 0;w < 16;++w) {
    UINT32 word;
    const unsigned char *wb;
    int take;
    int b;

    if (*numBytes <= 0) return 1;
    word = x[w] + input[w];
    wb = (const unsigned char *)&word;
    /* A final partial word contributes only its first numBytes bytes, i.e.
     * the same bytes a longer request would have put at these positions. */
    take = (*numBytes < 4) ? *numBytes : 4;
    for (b = 0;b < take;++b) (*op)[b] = (BYTE)((*op)[b] ^ wb[b]);
    *op += take;
    *numBytes -= take;
  }
  return *numBytes <= 0;
}

/*
 * Variant of the Salsa20 core that exposes the intermediate state: after
 * every single round (each half of a double round) the current state plus
 * the input words is XORed into out, 64 bytes at a time.
 *
 * out       buffer that is XORed in place, so its prior contents matter;
 *           no particular alignment is required.
 * numBytes  number of bytes of out to process. At most numRounds * 64 bytes
 *           are ever touched; values <= 0 leave out untouched.
 * input     16-word cipher state; it is not modified.
 *
 * The function returns as soon as numBytes bytes have been produced, which
 * may be in the middle of the round sequence. A byte count that is not a
 * multiple of four is honoured exactly: the trailing 1-3 bytes are taken
 * from the start of the next word instead of being skipped.
 */
static void salsa20_wordtobyte_withInitOutput(BYTE *out, int numBytes, const UINT32 input[16])
{
  UINT32 x[16];
  int i;

  for (i = 0;i < 16;++i) x[i] = input[i];
  for (i = numRounds;i > 0;i -= 2) {
    /* First round of the pair: works down the columns of the 4x4 matrix. */
    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
    x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
    x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
    x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
    x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
    x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
    x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));

    if (salsa20_xorInitBlock(&out, &numBytes, x, input)) return;

    /* Second round of the pair: works along the rows of the 4x4 matrix. */
    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
    x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
    x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
    x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
    x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
    x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
    x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));

    if (salsa20_xorInitBlock(&out, &numBytes, x, input)) return;
  }
}

/* Constant words used with 256-bit keys. The literal is exactly 16
 * characters long, so the array holds no terminating NUL. */
static const char sigma[16] = "expand 32-byte k";
/* Constant words used with 128-bit keys; likewise 16 characters, no NUL. */
static const char tau[16] = "expand 16-byte k";

/*
 * Load a key and the matching constant words into the cipher state.
 *
 * x      state to initialise; words 0-5 and 10-15 are written, the IV and
 *        counter words (6-9) are left alone.
 * k      key bytes: 32 bytes are read when kbits == 256, otherwise 16.
 * kbits  key size in bits. 256 selects the 32-byte layout with the sigma
 *        constants; every other value is treated as a 128-bit key, in which
 *        case the same 16 key bytes fill both key halves and tau is used.
 */
void salsa20_keysetup(Salsa20_ctx *x, const BYTE *k, UINT32 kbits)
{
  /* Ordinary automatic variable: it is assigned on every call before use,
   * so static storage would only add shared state between callers. */
  const char *constants;

  x->input[1] = U8TO32_LITTLE(k + 0);
  x->input[2] = U8TO32_LITTLE(k + 4);
  x->input[3] = U8TO32_LITTLE(k + 8);
  x->input[4] = U8TO32_LITTLE(k + 12);
  if (kbits == 256) { /* recommended */
    k += 16; /* second key half comes from the upper 16 key bytes */
    constants = sigma;
  } else { /* kbits == 128 */
    constants = tau;
  }
  x->input[11] = U8TO32_LITTLE(k + 0);
  x->input[12] = U8TO32_LITTLE(k + 4);
  x->input[13] = U8TO32_LITTLE(k + 8);
  x->input[14] = U8TO32_LITTLE(k + 12);
  x->input[0] = U8TO32_LITTLE(constants + 0);
  x->input[5] = U8TO32_LITTLE(constants + 4);
  x->input[10] = U8TO32_LITTLE(constants + 8);
  x->input[15] = U8TO32_LITTLE(constants + 12);
}

/*
 * Install an 8-byte IV and restart the block counter.
 *
 * x   state to update; words 6-9 are written.
 * iv  IV bytes; two 32-bit words are read from it.
 */
void salsa20_ivsetup(Salsa20_ctx *x, const BYTE *iv)
{
  UINT32 *state = x->input;

  /* IV words */
  state[6] = U8TO32_LITTLE(iv + 0);
  state[7] = U8TO32_LITTLE(iv + 4);
  /* block counter, low and high word */
  state[8] = 0;
  state[9] = 0;
}

/*
 * XOR `bytes` bytes of m with keystream and store the result in c.
 *
 * x      cipher state; its block counter (words 8 and 9) advances by one
 *        for every 64-byte keystream block generated, including a final
 *        partial one.
 * m      input bytes.
 * c      output bytes.
 * bytes  number of bytes to process; 0 is a no-op.
 */
void salsa20_encrypt_bytes(Salsa20_ctx *x, const BYTE *m, BYTE *c, UINT32 bytes)
{
  BYTE keystream[64];
  UINT32 chunk;
  UINT32 j;

  while (bytes != 0) {
    salsa20_wordtobyte(keystream, x->input);

    /* Advance the counter, carrying from the low word into the high word.
     * Stopping at 2^70 bytes per nonce is the user's responsibility. */
    x->input[8] = PLUSONE(x->input[8]);
    if (x->input[8] == 0)
      x->input[9] = PLUSONE(x->input[9]);

    chunk = (bytes > 64) ? 64 : bytes;
    for (j = 0; j < chunk; ++j)
      c[j] = m[j] ^ keystream[j];

    bytes -= chunk;
    c += chunk;
    m += chunk;
  }
}

/*
 * Install the IV, then XOR the per-round intermediate output for the
 * freshly initialised state into out.
 *
 * ctx       cipher state; receives the IV and a zeroed block counter.
 * iv        IV bytes.
 * out       buffer that is XORed in place.
 * numBytes  number of bytes of out to process.
 */
void salsa20_ivsetup_withInitOutput(Salsa20_ctx* ctx, const BYTE* iv, BYTE *out, int numBytes)
{
  salsa20_ivsetup(ctx, iv);
  salsa20_wordtobyte_withInitOutput(out, numBytes, ctx->input);
}

/******************************************************************************
 * Black box variants
 ******************************************************************************/
/*
 * One-shot encryption: set up a fresh state from key and iv, then XOR
 * numOutputBytes bytes of inBuf with keystream into outBuf.
 *
 * Returns -1 when inBuf holds fewer bytes than were requested as output,
 * 0 on success.
 */
int salsa20_xor(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes) {
  Salsa20_ctx state;

  /* Not enough input to cover the requested output. */
  if (numOutputBytes > numInputBytes)
    return -1;

  salsa20_keysetup(&state, key, numKeyBytes * 8);
  salsa20_ivsetup(&state, iv);
  salsa20_encrypt_bytes(&state, inBuf, outBuf, numOutputBytes);
  return 0;
}

/*
 * One-shot variant that exposes the initialisation rounds.
 *
 * The first numSuppressedBytes + 64 bytes of outBuf are XORed in place with
 * the per-round intermediate output (so the prior contents of outBuf
 * matter there); inBuf is not consumed for that region. Any output beyond
 * it is inBuf, read from its start, XORed with regular keystream.
 *
 * Returns -1 when numOutputBytes exceeds numInputBytes by more than
 * numSuppressedBytes, 0 otherwise.
 *
 * NOTE(review): the IV setup leaves the block counter at zero, and the last
 * 64 bytes of the init region already use the fully mixed block for that
 * counter value. salsa20_encrypt_bytes then starts from counter zero again,
 * so the same keystream block appears to be used twice -- confirm that this
 * is intended.
 */
int salsa20_xor_withInitOutput(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes) {
  /* Total size of the init region: one 64-byte block per round. */
  const unsigned int initBytes = (unsigned int)numSuppressedBytes + 64u;
  Salsa20_ctx ctx;

  /* Same rule as "numInputBytes + numSuppressedBytes < numOutputBytes", but
   * written as a difference so it cannot wrap around for large inputs. */
  if (numOutputBytes > numInputBytes &&
      numOutputBytes - numInputBytes > (unsigned int)numSuppressedBytes)
    return -1;

  salsa20_keysetup(&ctx, key, numKeyBytes * 8);
  /* The init stage never touches more than initBytes bytes, so clamp the
   * budget; this keeps the value within int range for any request size. */
  salsa20_ivsetup_withInitOutput(&ctx, iv, outBuf,
      (int)(numOutputBytes < initBytes ? numOutputBytes : initBytes));

  /* Compare as unsigned: a cast to int would go negative above INT_MAX. */
  if (numOutputBytes <= initBytes)
    return 0;

  salsa20_encrypt_bytes(&ctx, inBuf, outBuf + initBytes, numOutputBytes - initBytes);
  return 0;
}

/******************************************************************************
 * Black box API
 ******************************************************************************/
/*
 * Black box entry point: dispatches to the plain or the init-round-output
 * variant and hands back that variant's return value.
 *
 * withInitRoundOutput  zero selects salsa20_xor, any other value selects
 *                      salsa20_xor_withInitOutput.
 */
int blackBoxSalsa2012Encryption(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes, int withInitRoundOutput) {
  if (!withInitRoundOutput) {
    return salsa20_xor(key, iv, inBuf, numInputBytes, outBuf, numOutputBytes);
  }
  return salsa20_xor_withInitOutput(key, iv, inBuf, numInputBytes, outBuf, numOutputBytes);
}

/******************************************************************************
 * Basic cipher information
 ******************************************************************************/
/*
 * Report the cipher parameters configured at the top of this file.
 * Every argument is optional: a null pointer skips that value.
 */
void getBlackBoxSalsa2012Info(int *keySizeInBytes, int *ivSizeInBytes, int *suppressedBytes, int *implicitBlockSizeInBytes) {
  if (keySizeInBytes != NULL) {
    *keySizeInBytes = numKeyBytes;
  }
  if (ivSizeInBytes != NULL) {
    *ivSizeInBytes = numIvBytes;
  }
  if (suppressedBytes != NULL) {
    *suppressedBytes = numSuppressedBytes;
  }
  if (implicitBlockSizeInBytes != NULL) {
    *implicitBlockSizeInBytes = implicitBlockBytes;
  }
}

