/*
 * Adapted HC-128 reference code.
 *
 * Copyright (c) Paul Stankovski
 * Free for all non-commercial use unless this directive conflicts with
 * other applicable copyright statement(s), patent holders, laws or such.
 */
#include "black_box_hc128.h"
#include "tmalloc.h"

/* Implicit block size in bytes, as reported by getBlackBoxHC128Info(). */
#define BLOCK_LEN 1
/* Number of 16-step update rounds run during IV setup (64 * 16 = 1024 steps). */
#define NUM_INIT_ROUNDS 64
/* Bytes emitted per init round when init output is requested: 16 words. */
#define NUM_HIDDEN_BYTES_PER_INIT_ROUND (16 * sizeof(UINT32))
/* Total init-round output in bytes; has type size_t because of sizeof. */
#define NUM_HIDDEN_BYTES (NUM_INIT_ROUNDS * NUM_HIDDEN_BYTES_PER_INIT_ROUND)

/* Complete working state of one HC-128 instance. */
typedef struct {
  UINT32 T[1024]; /* both tables in one array: P[i] = T[i]; Q[i] = T[512+i]; (i = 0..511) */
  UINT32 X[16]; /* copies of the 16 most recently written P words, indexed by step slot */
  UINT32 Y[16]; /* copies of the 16 most recently written Q words, indexed by step slot */
  UINT32 counter1024; /* counter1024 = i mod 1024 at the i-th step; advanced 16 at a time */
  UINT32 key[8]; /* key words; words past the supplied key mirror key[i-4] */
  UINT32 iv[8]; /* IV words; words past the supplied IV mirror iv[i-4] */
  UINT32 keysize; /* key size in bits */
  UINT32 ivsize;  /* iv size in bits */
} HC128_info;


/*
 * Rotate the 32-bit unsigned value x right by n bits.
 *
 * Both shift counts are reduced modulo 32, so n == 0 (and n == 32) yield x
 * instead of an undefined shift by the full type width.  For n in 1..31 the
 * result is the plain rotate.  x must be an exactly-32-bit unsigned type;
 * n is evaluated twice, so it must not have side effects.
 */
#define ROTR32(x, n) (((x) >> ((n) & 31)) | ((x) << ((32 - (n)) & 31)))

/*
 * Conversion between a little-endian 32-bit word and host order.  It is
 * defined as the identity here, which is only a true conversion on a
 * little-endian host.
 */
#define UINT32TO32_LITTLE(x) (x)



/* =====================================================================

 *     The following defines the keystream generation function
 *======================================================================*/

/*
 * h1 function: y = Q[byte 0 of x] + Q[256 + byte 2 of x], where the Q table
 * lives at T[512..1023].
 *
 * ctx is any expression yielding a pointer to the state, x is evaluated
 * exactly once, and y is the lvalue that receives the result.  The body is a
 * single statement (do/while(0)) and its temporaries carry a private prefix,
 * so caller variables (for example ones named a or c) are never captured.
 */
#define HC128_h1(ctx, x, y) do {                                      \
     const UINT32 hc128_h1_in_ = (x);                                 \
     const BYTE hc128_h1_b0_ = (BYTE) (hc128_h1_in_);                 \
     const BYTE hc128_h1_b2_ = (BYTE) (hc128_h1_in_ >> 16);           \
     (y) = ((ctx)->T[512 + hc128_h1_b0_])                             \
         + ((ctx)->T[512 + 256 + hc128_h1_b2_]);                      \
} while (0)

/*
 * h2 function: y = P[byte 0 of x] + P[256 + byte 2 of x], where the P table
 * lives at T[0..511].
 *
 * ctx is any expression yielding a pointer to the state, x is evaluated
 * exactly once, and y is the lvalue that receives the result.  The body is a
 * single statement (do/while(0)) and its temporaries carry a private prefix,
 * so caller variables (for example ones named a or c) are never captured.
 */
#define HC128_h2(ctx, x, y) do {                                      \
     const UINT32 hc128_h2_in_ = (x);                                 \
     const BYTE hc128_h2_b0_ = (BYTE) (hc128_h2_in_);                 \
     const BYTE hc128_h2_b2_ = (BYTE) (hc128_h2_in_ >> 16);           \
     (y) = ((ctx)->T[hc128_h2_b0_])                                   \
         + ((ctx)->T[256 + hc128_h2_b2_]);                            \
} while (0)

/*
 * One keystream step on table P: update T[u], mirror it into X[a], and store
 * the 32-bit keystream word in n.
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *   a    X slot that receives the updated word
 *   b,c  X slots feeding the rotations by 8 and 10
 *   d    X slot fed to h1
 *   n    lvalue receiving the keystream word (evaluated once)
 *
 * h1 is evaluated before T[u] is modified, as in the reference code.  The
 * body is a single statement and uses prefixed temporaries so that index
 * expressions passed by the caller cannot be shadowed.
 */
#define HC128_step_P(ctx,u,v,a,b,c,d,n) do {                          \
     UINT32 hc128_sp_v_, hc128_sp_c_, hc128_sp_b_, hc128_sp_h_;       \
     HC128_h1((ctx), ((ctx)->X[(d)]), hc128_sp_h_);                   \
     hc128_sp_v_ = ROTR32(((ctx)->T[(v)]), 23);                       \
     hc128_sp_c_ = ROTR32(((ctx)->X[(c)]), 10);                       \
     hc128_sp_b_ = ROTR32(((ctx)->X[(b)]), 8);                        \
     ((ctx)->T[(u)]) += hc128_sp_b_ + (hc128_sp_v_ ^ hc128_sp_c_);    \
     ((ctx)->X[(a)]) = ((ctx)->T[(u)]);                               \
     (n) = hc128_sp_h_ ^ ((ctx)->T[(u)]);                             \
} while (0)

/*
 * One keystream step on table Q: update T[u], mirror it into Y[a], and store
 * the 32-bit keystream word in n.
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *        (callers pass values already offset by 512)
 *   a    Y slot that receives the updated word
 *   b,c  Y slots feeding the rotations
 *   d    Y slot fed to h2
 *   n    lvalue receiving the keystream word (evaluated once)
 *
 * The rotations are written as right-rotations by 32-k, i.e. left-rotations
 * by 23, 10 and 8.  The body is a single statement and uses prefixed
 * temporaries so that caller index expressions cannot be shadowed.
 */
#define HC128_step_Q(ctx,u,v,a,b,c,d,n) do {                          \
     UINT32 hc128_sq_v_, hc128_sq_c_, hc128_sq_b_, hc128_sq_h_;       \
     HC128_h2((ctx), ((ctx)->Y[(d)]), hc128_sq_h_);                   \
     hc128_sq_v_ = ROTR32(((ctx)->T[(v)]), (32-23));                  \
     hc128_sq_c_ = ROTR32(((ctx)->Y[(c)]), (32-10));                  \
     hc128_sq_b_ = ROTR32(((ctx)->Y[(b)]), (32-8));                   \
     ((ctx)->T[(u)]) += hc128_sq_b_ + (hc128_sq_v_ ^ hc128_sq_c_);    \
     ((ctx)->Y[(a)]) = ((ctx)->T[(u)]);                               \
     (n) = hc128_sq_h_ ^ ((ctx)->T[(u)]);                             \
} while (0)

/*
 * Run 16 steps of HC-128 and write the resulting 16 keystream words
 * (512 bits) to keystream[0..15].
 *
 * While counter1024 is below 512 the steps update table P (T[0..511]);
 * otherwise they update table Q (T[512..1023]).  counter1024 is advanced by
 * 16 modulo 1024.  For step s (0..15) the slot indices into X/Y are
 * a = s, b = (s+6) mod 16, c = (s+13) mod 16, d = (s+4) mod 16, and the
 * successor index of the last step wraps around within the 512-word table.
 */
void generate_keystream(HC128_info* ctx, UINT32* keystream)
{
   const UINT32 base = ctx->counter1024 & 0x1ff;
   const UINT32 wrapped = (base + 16) & 0x1ff;
   const int inTableP = (ctx->counter1024 < 512);
   UINT32 step;

   ctx->counter1024 = (ctx->counter1024 + 16) & 0x3ff;

   if (inTableP)
   {
      for (step = 0; step < 16; step++)
      {
         const UINT32 next = (step == 15) ? wrapped : base + step + 1;
         HC128_step_P(ctx, base + step, next,
                      step, (step + 6) & 15, (step + 13) & 15, (step + 4) & 15,
                      keystream[step]);
      }
   }
   else
   {
      for (step = 0; step < 16; step++)
      {
         const UINT32 next = (step == 15) ? wrapped : base + step + 1;
         HC128_step_Q(ctx, 512 + base + step, 512 + next,
                      step, (step + 6) & 15, (step + 13) & 15, (step + 4) & 15,
                      keystream[step]);
      }
   }
}


/*======================================================*/
/*   The following defines the initialization functions */
/*======================================================*/

/*
 * Mixing functions used by the key/IV table expansion in the IV setup.
 * Each XORs two right-rotations of x with a plain right shift of x.
 * x is evaluated three times, so it must be free of side effects.
 */
#define HC128_f1(x)  (ROTR32((x),7) ^ ROTR32((x),18) ^ ((x) >> 3))
#define HC128_f2(x)  (ROTR32((x),17) ^ ROTR32((x),19) ^ ((x) >> 10))

/*
 * One initialization step on table P.  Unlike HC128_step_P, the h1 output is
 * XORed back into T[u] instead of being emitted as keystream; the new word is
 * mirrored into X[a].
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *   a    X slot that receives the updated word
 *   b,c  X slots feeding the rotations by 8 and 10
 *   d    X slot fed to h1
 *
 * The body is a single statement and uses prefixed temporaries so that index
 * expressions passed by the caller cannot be shadowed.
 */
#define HC128_update_P(ctx,u,v,a,b,c,d) do {                          \
     UINT32 hc128_up_v_, hc128_up_c_, hc128_up_b_, hc128_up_h_;       \
     hc128_up_v_ = ROTR32(((ctx)->T[(v)]), 23);                       \
     hc128_up_c_ = ROTR32(((ctx)->X[(c)]), 10);                       \
     hc128_up_b_ = ROTR32(((ctx)->X[(b)]), 8);                        \
     HC128_h1((ctx), ((ctx)->X[(d)]), hc128_up_h_);                   \
     ((ctx)->T[(u)]) = (((ctx)->T[(u)]) + hc128_up_b_                 \
                        + (hc128_up_v_ ^ hc128_up_c_)) ^ hc128_up_h_; \
     ((ctx)->X[(a)]) = ((ctx)->T[(u)]);                               \
} while (0)

/*
 * One initialization step on table Q.  Unlike HC128_step_Q, the h2 output is
 * XORed back into T[u] instead of being emitted as keystream; the new word is
 * mirrored into Y[a].
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *        (callers pass values already offset by 512)
 *   a    Y slot that receives the updated word
 *   b,c  Y slots feeding the rotations
 *   d    Y slot fed to h2
 *
 * The body is a single statement and uses prefixed temporaries so that index
 * expressions passed by the caller cannot be shadowed.
 */
#define HC128_update_Q(ctx,u,v,a,b,c,d) do {                          \
     UINT32 hc128_uq_v_, hc128_uq_c_, hc128_uq_b_, hc128_uq_h_;       \
     hc128_uq_v_ = ROTR32(((ctx)->T[(v)]), (32-23));                  \
     hc128_uq_c_ = ROTR32(((ctx)->Y[(c)]), (32-10));                  \
     hc128_uq_b_ = ROTR32(((ctx)->Y[(b)]), (32-8));                   \
     HC128_h2((ctx), ((ctx)->Y[(d)]), hc128_uq_h_);                   \
     ((ctx)->T[(u)]) = (((ctx)->T[(u)]) + hc128_uq_b_                 \
                        + (hc128_uq_v_ ^ hc128_uq_c_)) ^ hc128_uq_h_; \
     ((ctx)->Y[(a)]) = ((ctx)->T[(u)]);                               \
} while (0)

/*
 * Run 16 initialization steps of HC-128.  No keystream is produced; each
 * step's output is folded back into the table being updated.
 *
 * While counter1024 is below 512 the steps update table P (T[0..511]);
 * otherwise they update table Q (T[512..1023]).  counter1024 is advanced by
 * 16 modulo 1024.  For step s (0..15) the slot indices into X/Y are
 * a = s, b = (s+6) mod 16, c = (s+13) mod 16, d = (s+4) mod 16, and the
 * successor index of the last step wraps around within the 512-word table.
 */
void HC128_setup_update(HC128_info* ctx)
{
   const UINT32 base = ctx->counter1024 & 0x1ff;
   const UINT32 wrapped = (base + 16) & 0x1ff;
   const int inTableP = (ctx->counter1024 < 512);
   UINT32 step;

   ctx->counter1024 = (ctx->counter1024 + 16) & 0x3ff;

   if (inTableP)
   {
      for (step = 0; step < 16; step++)
      {
         const UINT32 next = (step == 15) ? wrapped : base + step + 1;
         HC128_update_P(ctx, base + step, next,
                        step, (step + 6) & 15, (step + 13) & 15, (step + 4) & 15);
      }
   }
   else
   {
      for (step = 0; step < 16; step++)
      {
         const UINT32 next = (step == 15) ? wrapped : base + step + 1;
         HC128_update_Q(ctx, 512 + base + step, 512 + next,
                        step, (step + 6) & 15, (step + 13) & 15, (step + 4) & 15);
      }
   }
}

/*
 * One initialization step on table P that also reports the updated word.
 * Same computation as HC128_update_P; additionally the new value of T[u] is
 * stored in n.
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *   a    X slot that receives the updated word
 *   b,c  X slots feeding the rotations by 8 and 10
 *   d    X slot fed to h1
 *   n    lvalue receiving the updated word (evaluated exactly once, so an
 *        expression such as *p++ is safe)
 *
 * The body is a single statement and uses prefixed temporaries so that index
 * expressions passed by the caller cannot be shadowed.
 */
#define HC128_update_P_withInitOutput(ctx,u,v,a,b,c,d,n) do {         \
     UINT32 hc128_po_v_, hc128_po_c_, hc128_po_b_, hc128_po_h_;       \
     hc128_po_v_ = ROTR32(((ctx)->T[(v)]), 23);                       \
     hc128_po_c_ = ROTR32(((ctx)->X[(c)]), 10);                       \
     hc128_po_b_ = ROTR32(((ctx)->X[(b)]), 8);                        \
     HC128_h1((ctx), ((ctx)->X[(d)]), hc128_po_h_);                   \
     (n) = ((ctx)->T[(u)]) = (((ctx)->T[(u)]) + hc128_po_b_           \
                        + (hc128_po_v_ ^ hc128_po_c_)) ^ hc128_po_h_; \
     ((ctx)->X[(a)]) = ((ctx)->T[(u)]);                               \
} while (0)

/*
 * One initialization step on table Q that also reports the updated word.
 * Same computation as HC128_update_Q; additionally the new value of T[u] is
 * stored in n.
 *
 *   ctx  pointer expression for the cipher state
 *   u,v  indices into T of the word being updated and of its successor
 *        (callers pass values already offset by 512)
 *   a    Y slot that receives the updated word
 *   b,c  Y slots feeding the rotations
 *   d    Y slot fed to h2
 *   n    lvalue receiving the updated word (evaluated exactly once, so an
 *        expression such as *p++ is safe)
 *
 * The body is a single statement and uses prefixed temporaries so that index
 * expressions passed by the caller cannot be shadowed.
 */
#define HC128_update_Q_withInitOutput(ctx,u,v,a,b,c,d,n) do {         \
     UINT32 hc128_qo_v_, hc128_qo_c_, hc128_qo_b_, hc128_qo_h_;       \
     hc128_qo_v_ = ROTR32(((ctx)->T[(v)]), (32-23));                  \
     hc128_qo_c_ = ROTR32(((ctx)->Y[(c)]), (32-10));                  \
     hc128_qo_b_ = ROTR32(((ctx)->Y[(b)]), (32-8));                   \
     HC128_h2((ctx), ((ctx)->Y[(d)]), hc128_qo_h_);                   \
     (n) = ((ctx)->T[(u)]) = (((ctx)->T[(u)]) + hc128_qo_b_           \
                        + (hc128_qo_v_ ^ hc128_qo_c_)) ^ hc128_qo_h_; \
     ((ctx)->Y[(a)]) = ((ctx)->T[(u)]);                               \
} while (0)

/*
 * Run 16 initialization steps of HC-128 and report the 16 updated table
 * words through out.
 *
 *   ctx  cipher state; counter1024 is advanced by 16 modulo 1024
 *   out  receives 64 bytes: the 16 updated words in step order, each stored
 *        least-significant byte first
 *
 * The words are collected in a local array and stored byte by byte, so out
 * needs no particular alignment and is never accessed through a UINT32
 * lvalue.  On a little-endian host the bytes written are identical to
 * storing the words directly.
 *
 * While counter1024 is below 512 the steps update table P, otherwise table
 * Q.  For step s (0..15) the slot indices into X/Y are a = s,
 * b = (s+6) mod 16, c = (s+13) mod 16, d = (s+4) mod 16, and the successor
 * index of the last step wraps around within the 512-word table.
 */
void HC128_setup_update_withInitOutput(HC128_info* ctx, BYTE *out)  /*each time 16 steps*/
{
   UINT32 words[16];
   UINT32 cc, dd, step;

   cc = ctx->counter1024 & 0x1ff;
   dd = (cc+16)&0x1ff;

   if (ctx->counter1024 < 512)
   {
      ctx->counter1024 = (ctx->counter1024 + 16) & 0x3ff;
      for (step = 0; step < 16; step++)
      {
         HC128_update_P_withInitOutput(ctx, cc + step,
                                       (step == 15) ? dd : cc + step + 1,
                                       step, (step + 6) & 15, (step + 13) & 15,
                                       (step + 4) & 15, words[step]);
      }
   }
   else
   {
      ctx->counter1024 = (ctx->counter1024 + 16) & 0x3ff;
      for (step = 0; step < 16; step++)
      {
         HC128_update_Q_withInitOutput(ctx, 512 + cc + step,
                                       512 + ((step == 15) ? dd : cc + step + 1),
                                       step, (step + 6) & 15, (step + 13) & 15,
                                       (step + 4) & 15, words[step]);
      }
   }

   /* Serialize the words least-significant byte first. */
   for (step = 0; step < 16; step++)
   {
      out[4 * step + 0] = (BYTE)(words[step]);
      out[4 * step + 1] = (BYTE)(words[step] >> 8);
      out[4 * step + 2] = (BYTE)(words[step] >> 16);
      out[4 * step + 3] = (BYTE)(words[step] >> 24);
   }
}

/* Global initialization hook; this implementation needs none. */
void HC128_init(void)
{
  /* No operation performed */
}

/* for the 128-bit key:  key[0]...key[15]
*  key[0] is the least significant byte of ctx->key[0] (K_0);
*  key[3] is the most significant byte of ctx->key[0]  (K_0);
*  ...
*  key[12] is the least significant byte of ctx->key[3] (K_3)
*  key[15] is the most significant byte of ctx->key[3]  (K_3)
*
*  for the 128-bit iv:  iv[0]...iv[15]
*  iv[0] is the least significant byte of ctx->iv[0] (IV_0);
*  iv[3] is the most significant byte of ctx->iv[0]  (IV_0);
*  ...
*  iv[12] is the least significant byte of ctx->iv[3] (IV_3)
*  iv[15] is the most significant byte of ctx->iv[3]  (IV_3)
*/

/*
 * Load the key into the context and record the key and IV sizes.
 *
 *   ctx      cipher state to initialize
 *   key      key bytes; keysize/8 bytes are read (at most 32)
 *   keysize  key size in bits; callers in this file always pass 128
 *   ivsize   IV size in bits, stored for the later IV setup
 *
 * Each key word is assembled from four bytes, least-significant byte first,
 * so key needs no alignment and the byte order matches the layout documented
 * above on any host.  On a little-endian host this equals reading the key
 * through a UINT32 pointer.
 *
 * Words beyond the supplied key are filled with ctx->key[i-4], so a 128-bit
 * key is repeated twice.  Defensive bounds: at most 8 words are loaded, so
 * ctx->key is never overrun, and if fewer than 4 words are supplied the
 * missing words up to index 3 are set to zero before mirroring, so key[i-4]
 * never indexes below the array.
 */
void HC128_keysetup(
  HC128_info* ctx,
  const BYTE* key,
  UINT32 keysize,                /* Key size in bits */
  UINT32 ivsize)                 /* IV size in bits */
{
  UINT32 nwords = keysize >> 5;
  UINT32 i;

  ctx->keysize = keysize;
  ctx->ivsize = ivsize;

  if (nwords > 8) nwords = 8;

  for (i = 0; i < nwords; i++) {
    const BYTE *p = key + 4 * i;
    ctx->key[i] = ((UINT32)p[0] & 0xffu)
                | (((UINT32)p[1] & 0xffu) << 8)
                | (((UINT32)p[2] & 0xffu) << 16)
                | (((UINT32)p[3] & 0xffu) << 24);
  }

  for ( ; i < 4; i++) ctx->key[i] = 0;

  for ( ; i < 8; i++) ctx->key[i] = ctx->key[i-4];
} /* initialize the key, save the iv size*/


/*
 * Shared first half of the IV setup: load the IV into ctx->iv, expand key
 * and IV into the table T (P = T[0..511], Q = T[512..1023]), and reset
 * counter1024, X and Y.  The caller then runs the initialization rounds.
 *
 * Each IV word is assembled from four bytes, least-significant byte first,
 * so iv needs no alignment; on a little-endian host this equals reading the
 * IV through a UINT32 pointer.  ctx->ivsize (in bits, set by the key setup)
 * selects how many words are read.  Defensive bounds: at most 8 words are
 * loaded, and if fewer than 4 are supplied the missing words up to index 3
 * are zeroed before the remaining words are mirrored from iv[i-4].
 */
static void HC128_load_iv_and_expand(HC128_info* ctx, const BYTE* iv)
{
    UINT32 nwords = ctx->ivsize >> 5;
    UINT32 i;

    if (nwords > 8) nwords = 8;

    /* initialize the iv */
    for (i = 0; i < nwords; i++) {
        const BYTE *p = iv + 4 * i;
        ctx->iv[i] = ((UINT32)p[0] & 0xffu)
                   | (((UINT32)p[1] & 0xffu) << 8)
                   | (((UINT32)p[2] & 0xffu) << 16)
                   | (((UINT32)p[3] & 0xffu) << 24);
    }
    for (; i < 4; i++) ctx->iv[i] = 0;
    for (; i < 8; i++) ctx->iv[i] = ctx->iv[i-4];

    /* expand the key and IV into the table T */
    /* (expand the key and IV into the table P and Q) */
    for (i = 0; i < 8;  i++)   ctx->T[i] = ctx->key[i];
    for (i = 8; i < 16; i++)   ctx->T[i] = ctx->iv[i-8];

    /* first 256 expansion words are scratch; only the last 16 are kept */
    for (i = 16; i < (256+16); i++)
        ctx->T[i] = HC128_f2(ctx->T[i-2]) + ctx->T[i-7] + HC128_f1(ctx->T[i-15]) + ctx->T[i-16]+i;

    for (i = 0; i < 16;  i++)  ctx->T[i] = ctx->T[256+i];

    for (i = 16; i < 1024; i++)
        ctx->T[i] = HC128_f2(ctx->T[i-2]) + ctx->T[i-7] + HC128_f1(ctx->T[i-15]) + ctx->T[i-16]+256+i;

    /* initialize counter1024, X and Y */
    ctx->counter1024 = 0;
    for (i = 0; i < 16; i++) ctx->X[i] = ctx->T[512-16+i];
    for (i = 0; i < 16; i++) ctx->Y[i] = ctx->T[512+512-16+i];
}

/*
 * Initialize the cipher state from the key already stored in ctx and the
 * given IV, then run NUM_INIT_ROUNDS rounds of 16 steps (1024 steps) so the
 * state is ready for keystream generation.
 *
 *   ctx  cipher state; the key setup must have been run on it first
 *   iv   IV bytes; ctx->ivsize/8 bytes are read
 */
void HC128_ivsetup(HC128_info* ctx, const BYTE* iv)
{
    UINT32 i;

    HC128_load_iv_and_expand(ctx, iv);

    /* run the cipher 1024 steps before generating the output */
    for (i = 0; i < NUM_INIT_ROUNDS; i++)  HC128_setup_update(ctx);
}

/*
 * Same as HC128_ivsetup, but additionally records the table words produced
 * by every initialization step.
 *
 *   ctx  cipher state; the key setup must have been run on it first
 *   iv   IV bytes; ctx->ivsize/8 bytes are read
 *   out  receives NUM_HIDDEN_BYTES bytes: NUM_HIDDEN_BYTES_PER_INIT_ROUND
 *        bytes for each of the NUM_INIT_ROUNDS rounds, in round order
 */
void HC128_ivsetup_withInitOutput(HC128_info* ctx, const BYTE* iv, BYTE *out)
{
    UINT32 i;

    HC128_load_iv_and_expand(ctx, iv);

    /* run the cipher 1024 steps before generating the output */
    for (i = 0; i < NUM_INIT_ROUNDS; i++)
        HC128_setup_update_withInitOutput(ctx, out + i * NUM_HIDDEN_BYTES_PER_INIT_ROUND);
}

/*========================================================
 *  The following defines the encryption of data stream
 *========================================================
 */

/*
 * XOR msglen bytes of input with keystream and store the result in output.
 *
 *   ctx     initialized cipher state; advanced by 16 steps per 64-byte block
 *   input   source bytes
 *   output  destination for msglen bytes
 *   msglen  message length in bytes
 *
 * A trailing partial block still consumes a full 16-step keystream block;
 * the unused keystream bytes are discarded.
 */
void HC128_process_bytes(
  HC128_info* ctx,
  const BYTE* input,
  BYTE* output,
  UINT32 msglen)                /* Message length in bytes. */
{
  UINT32 keystream[16];

  /* whole 64-byte blocks */
  while (msglen >= 64) {
    generate_keystream(ctx, keystream);
    tmemcpy(output, (BYTE*)input, 64);
    tmemxor(output, keystream, 64);

    input += 64;
    output += 64;
    msglen -= 64;
  }

  if (msglen == 0) {
    return;
  }

  /* trailing partial block */
  generate_keystream(ctx, keystream);
  tmemcpy(output, (BYTE*)input, msglen);
  tmemxor(output, keystream, msglen);
}


/******************************************************************************

 * Black box variants
 ******************************************************************************/
/*
 * One-shot HC-128: set up a fresh state from a 128-bit key and 128-bit IV
 * and XOR numOutputBytes bytes of inBuf with keystream into outBuf.
 *
 * Returns 0 on success (including when numOutputBytes is 0, in which case
 * nothing is done) and -1 when inBuf holds fewer than numOutputBytes bytes.
 */
int HC128_xor(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes) {
  HC128_info state;

  if (numOutputBytes == 0) {
    return 0;
  }
  if (numOutputBytes > numInputBytes) {
    return -1;
  }

  HC128_keysetup(&state, key, 128, 128);
  HC128_ivsetup(&state, iv);
  HC128_process_bytes(&state, inBuf, outBuf, numOutputBytes);

  return 0;
}

/*
 * One-shot HC-128 that also exposes the initialization-round output.
 *
 * The first NUM_HIDDEN_BYTES bytes of outBuf are XORed with the words
 * produced during the initialization rounds; the remaining
 * numOutputBytes - NUM_HIDDEN_BYTES bytes are inBuf XORed with keystream.
 *
 * Returns 0 on success (including when numOutputBytes is 0, in which case
 * nothing is done) and -1 when numOutputBytes is smaller than
 * NUM_HIDDEN_BYTES or when inBuf holds fewer bytes than the keystream part
 * requires.  The input-length check is done on the difference, so it cannot
 * wrap around for large numInputBytes.
 *
 * NOTE(review): the init bytes are XORed into whatever outBuf already holds
 * rather than copied; this presumably relies on the caller pre-filling
 * (e.g. zeroing) that region -- confirm against callers.
 */
int HC128_xor_withInitOutput(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes) {
  HC128_info B;
  BYTE initBytes[NUM_HIDDEN_BYTES];
  unsigned int numStreamBytes;

  if (numOutputBytes == 0) return 0;
  if (numOutputBytes < NUM_HIDDEN_BYTES) return -1;

  /* bytes still to be produced from keystream after the init output */
  numStreamBytes = numOutputBytes - (unsigned int)NUM_HIDDEN_BYTES;
  if (numStreamBytes > numInputBytes) return -1;

  HC128_keysetup(&B, key, 128, 128);
  tmemset(initBytes, 0, NUM_HIDDEN_BYTES);
  HC128_ivsetup_withInitOutput(&B, iv, initBytes);
  tmemxor(outBuf, initBytes, NUM_HIDDEN_BYTES);
  if (numStreamBytes == 0) return 0;
  HC128_process_bytes(&B, inBuf, outBuf + NUM_HIDDEN_BYTES, numStreamBytes);
  return 0;
}

/******************************************************************************
 * Black box API
 ******************************************************************************/
/*
 * Black-box entry point: dispatch to the variant that exposes the
 * initialization-round output when withInitRoundOutput is non-zero, and to
 * the plain variant otherwise.  The return value of the selected variant is
 * passed through unchanged.
 */
int blackBoxHC128Encryption(const BYTE *key, const BYTE *iv, const BYTE *inBuf, unsigned int numInputBytes, BYTE *outBuf, unsigned int numOutputBytes, int withInitRoundOutput) {
  return withInitRoundOutput
      ? HC128_xor_withInitOutput(key, iv, inBuf, numInputBytes, outBuf, numOutputBytes)
      : HC128_xor(key, iv, inBuf, numInputBytes, outBuf, numOutputBytes);
}

/******************************************************************************
 * Basic cipher information
 ******************************************************************************/
/*
 * Report the fixed parameters of this black box.  Every output pointer is
 * optional; a null pointer means the caller does not want that value.
 *
 *   keySizeInBytes            receives 16
 *   ivSizeInBytes             receives 16
 *   suppressedBytes           receives NUM_HIDDEN_BYTES
 *   implicitBlockSizeInBytes  receives BLOCK_LEN
 */
void getBlackBoxHC128Info(int *keySizeInBytes, int *ivSizeInBytes, int *suppressedBytes, int *implicitBlockSizeInBytes) {
  const int keyBytes = 16;
  const int ivBytes = 16;

  if (keySizeInBytes) {
    *keySizeInBytes = keyBytes;
  }
  if (ivSizeInBytes) {
    *ivSizeInBytes = ivBytes;
  }
  if (suppressedBytes) {
    *suppressedBytes = (int)NUM_HIDDEN_BYTES;
  }
  if (implicitBlockSizeInBytes) {
    *implicitBlockSizeInBytes = BLOCK_LEN;
  }
}

