


#define INC_FLEN
#define COMMON_SKIP_BSF
//--- #include "common.inc"

#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <memory.h>
#undef EOF

#pragma pack(1)

typedef unsigned short word;
typedef unsigned int   uint;
typedef unsigned char  byte;
typedef unsigned long long qword;
typedef signed long long sqword;


// bzero overload set: clear an object, a 1-D array, a pointer range or a
// 2-D array. The object and 1-D array forms clear raw bytes; the pointer and
// 2-D forms assign 0 to each element (so T only needs `operator=(int)`).

// Zero every byte of a single object.
template <class T> void bzero( T &_p ) {
  memset( (void*)&_p, 0, sizeof(_p) );
}

// Zero every byte of a fixed-size array. The byte count is kept in size_t so
// it is not truncated for very large arrays.
template <class T, int N> void bzero( T (&p)[N] ) {
  memset( (void*)&p, 0, sizeof(T)*(size_t)N );
}

// Assign 0 to the first N elements starting at p (no-op when N<=0).
template <class T> void bzero( T* p, int N ) {
  for( int i=0; i<N; i++ ) p[i]=0;
}

// Assign 0 to every element of an N x M array, row by row, so that no row is
// indexed past its own bound.
template <class T, int N, int M> void bzero( T (&p)[N][M] ) {
  for( int r=0; r<N; r++ )
    for( int c=0; c<M; c++ ) p[r][c]=0;
}

// Smaller of two values of the same type; returns y when neither is smaller.
template <class T> T Min( T x, T y ) {
  if( x<y ) return x;
  return y;
}
// Larger of two values of the same type; returns y when neither is larger.
template <class T> T Max( T x, T y ) {
  if( x>y ) return x;
  return y;
}

#define macro_Min( x, y ) (((x)<(y)) ? (x) : (y))
#define macro_Max( x, y ) (((x)>(y)) ? (x) : (y))

// Number of elements in a fixed-size array (the array bound N).
template <class T,int N> int DIM( T (&wr)[N] ) {
  (void)wr;  // only the deduced bound is needed
  return N;
}
// Round x up to the next multiple of r (r > 0). The whole expansion is
// parenthesised so the macro can be used inside larger expressions
// (e.g. `n % AlignUp(x,r)`). Note that r is evaluated three times.
#define AlignUp(x,r) ((((x)+((r)-1))/(r))*(r))
// Compile-time packing of four bytes into a 32-bit word.
//   n: a is the least significant byte, d the most significant.
//   x: a is the most significant byte, d the least significant.
// Each byte is widened to unsigned int before shifting so that a top byte
// >= 0x80 does not overflow a signed int inside the constant expression.
template<byte a,byte b,byte c,byte d> struct wc {
  static const unsigned int n=((unsigned int)d<<24)+((unsigned int)c<<16)+((unsigned int)b<<8)+(unsigned int)a;
  static const unsigned int x=((unsigned int)a<<24)+((unsigned int)b<<16)+((unsigned int)c<<8)+(unsigned int)d;
};

#ifdef __GNUC__
 #define INLINE   __attribute__((always_inline)) 
 #define NOINLINE __attribute__((noinline))
 #define ALIGN(n) __attribute__((aligned(n)))
// #define __assume_aligned(x,y) x=(byte*)__builtin_assume_aligned((void*)x,y)
 #define __assume_aligned(x,y) (x=decltype(x)(__builtin_assume_aligned((void*)x,y)))
 #define restrict __restrict
#else
 #define INLINE   __forceinline
 #define NOINLINE __declspec(noinline)
 #define ALIGN(n) __declspec(align(n))
#endif

#define if_e0(x) if(__builtin_expect((x),0))
#define if_e1(x) if(__builtin_expect((x),1))
#define for_e0(x,y,z) for( (x); __builtin_expect((y),0); (z) )
#define for_e1(x,y,z) for( (x); __builtin_expect((y),1); (z) )

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 #define __builtin_expect(x,y) (x)
// #define __assume_aligned(x,y) 
 #define __assume_aligned(x,y) __assume( (((byte*)x)-((byte*)0))%(y)==0 )
 #define restrict __restrict
 //#include "intrin.h"
 #ifndef COMMON_SKIP_BSF
 extern "C" {
 byte __cdecl _BitScanForward( uint* _Index, uint _Mask);
 byte __cdecl _BitScanForward64( uint* _Index, qword _Mask );
 }
 #endif
#endif

#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 #define __assume(x) (x)
#endif

#ifdef INC_FLEN
// Return the length of an open file in bytes and rewind it to the start.
//
// The stream position is always reset to offset 0, whether or not the length
// could be determined. Returns 0 when seeking to the end or ftell() fails
// (e.g. on a non-seekable stream) instead of converting ftell()'s -1 into
// 0xFFFFFFFF. The result is limited to 32 bits by the return type.
static uint flen( FILE* f ) {
  long len = -1;
  if( fseek( f, 0, SEEK_END )==0 ) len = ftell(f);
  fseek( f, 0, SEEK_SET );
  return (len<0) ? 0 : (uint)len;
}
#endif

#ifdef INC_LOG2I
// Index of the highest set bit of x (floor(log2(x)) for x != 0).
// GCC uses __builtin_clz, the Intel compiler _bit_scan_reverse, anything else
// a portable shift loop. The portable path returns (uint)-1 for x == 0; the
// intrinsic paths make no such promise for a zero argument.
static uint log2i( uint x ) {
#if defined(__GNUC__)
  return 31-__builtin_clz(x);
#elif defined(__INTEL_COMPILER)
  return _bit_scan_reverse(x);
#else
  // Count how many right shifts are needed to clear x (at most 32).
  uint n = 0;
  while( n<32 && x!=0 ) {
    x >>= 1;
    n++;
  }
  return n-1;
#endif
}
#endif

#if defined(__x86_64) || defined(_M_X64)
 #define X64
 #define X64flag 1
#else
 #undef X64
 #define X64flag 0
#endif

//--- #include "tables.inc"

#include <intrin.h>

typedef signed char int8;

// Bit-scan-reverse: index (0..31) of the most significant set bit of value.
// _BitScanReverse reports "no bit set" through its return value and leaves
// the index unspecified in that case, so a zero input yields 0 here rather
// than an uninitialised value.
static inline uint BSR(uint value) {
  unsigned long index = 0;
  if( !_BitScanReverse(&index, value) ) return 0;
  return index;
}

// Bit-scan-forward: index (0..31) of the least significant set bit of value.
// _BitScanForward reports "no bit set" through its return value and leaves
// the index unspecified in that case, so a zero input yields 0 here rather
// than an uninitialised value.
static inline uint BSF(uint value) {
  unsigned long index = 0;
  if( !_BitScanForward(&index, value) ) return 0;
  return index;
}

static void memset32( void* _dst, uint value, size_t n ) {
  size_t i;
  uint* dst = (uint*)_dst;
  for( i=0; i<n; i++ ) dst[i]=value; //*(uint*)((char*)dst + i * 4) = value;
}

// Saturate v to the signed 16-bit range [-32768, 32767].
static inline int Clamp16 (int v ) {
  if( v > 32767 ) return 32767;
  if( v < -32768 ) return -32768;
  return v;
}

// State-transition table for the byte-sized state counters.
// Laid out as 256 pairs: entry [state*2 + bit] is the successor of `state`
// after observing `bit` (this is how CM_Update indexes it).
const byte kLzModelLNext[256*2] = {
  1,  2,  3,  5,  4,  6,  7, 10,  8, 12,  9, 13, 11, 14, 15, 19,
 16, 23, 17, 24, 18, 25, 20, 27, 21, 28, 22, 29, 26, 30, 31, 33,
 32, 35, 32, 35, 32, 35, 32, 35, 34, 37, 34, 37, 34, 37, 34, 37,
 34, 37, 34, 37, 36, 39, 36, 39, 36, 39, 36, 39, 38, 40, 41, 43,
 42, 45, 42, 45, 44, 47, 44, 47, 46, 49, 46, 49, 48, 51, 48, 51,
 50, 52, 53, 43, 54, 57, 54, 57, 56, 59, 56, 59, 58, 61, 58, 61,
 60, 63, 60, 63, 62, 65, 62, 65, 50, 66, 67, 55, 68, 57, 68, 57,
 70, 73, 70, 73, 72, 75, 72, 75, 74, 77, 74, 77, 76, 79, 76, 79,
 62, 81, 62, 81, 64, 82, 83, 69, 84, 71, 84, 71, 86, 73, 86, 73,
 44, 89, 44, 89, 88, 91, 88, 91, 90, 49, 90, 49, 76, 93, 76, 93,
 78, 95, 78, 95, 80, 96, 97, 69, 98, 87, 98, 87,100, 45,100, 45,
 72, 75, 72, 75, 74, 77, 74, 77, 48,103, 48,103, 92,105, 92,105,
 80,106,107, 69,108, 87,108, 87,110, 57,110, 57, 62,113, 62,113,
 92,115, 92,115, 80,116,117, 85,118, 87,118, 87,120, 57,120, 57,
 62,123, 62,123, 92,125, 92,125, 94,126,127, 85,128,101,128,101,
130, 57,130, 57, 62,133, 62,133,102,135,102,135, 94,136,137, 85,
138,101,138,101,140, 57,140, 57, 62,143, 62,143,102,145,102,145,
 94,146,147, 99,148,101,148,101, 68, 57, 68, 57, 62, 81, 62, 81,
102,149,102,149,104,150,151, 99,152,111,112,153,104,154,155, 99,
156,111,112,157,104,158,159, 99,160,111,112,161,104,162,163,109,
164,111,112,165,114,166,167,109,168,121,122,169,114,170,171,109,
172,121,122,173,114,174,175,109,176,121,122,177,114,178,179,109,
180,121,122,181,114,182,183,119,184,121,122,185,124,186,187,119,
188,131,132,189,124,190,191,119,192,131,132,193,124,194,195,119,
196,131,132,197,124,198,199,119,200,131,132,201,124,202,203,119,
204,131,132,205,124,206,207,119,208,131,132,209,124,210,211,129,
212,131,132,213,134,214,215,129,216,141,142,217,134,218,219,129,
220,141,142,221,134,222,223,129,224,141,142,225,134,226,227,129,
228,141,142,229,134,230,231,129,232,141,142,233,134,234,235,129,
236,141,142,237,134,238,239,129,240,141,142,241,134,242,243,129,
244,141,142,245,134,246,247,139,248,141,142,249,144,250,251,139,
252, 69, 80,253,144,254,251,139,255, 69, 80,253,144,254,255, 69,
};


// Runtime-generated lookup tables (zero until their builder has run).

// Both filled by Build_kModelInterpolation(): a 12-bit -> 12-bit mapping and
// the table derived from it in the opposite direction.
word kModelInterpolation[4096];
word kModelLutLookup[4096];

// Both filled by LzCreateTables().
word kLzPredSumLookup[256];
uint kDivideLookup[256];

// Filled by CM_InitTables(): per-byte multiples of fixed hash constants, used
// to remove the oldest byte from the rolling history hashes, plus a flag byte
// per 4-bit/4-bit pair.
uint kHashHist8Mult[256];
uint kHashHist4Mult[256];
uint kHashHist10Mult[256];
uint kHashHist24Mult[256];
uint kHashHist6Mult[256];
byte kCmFlagCrap[256];

void CM_InitTables() {
  uint i,a,b;

  for( i=0; i<256; i++ ) {
    kHashHist6Mult[i] =  i * 0x699c6d11;  // 0x2f208239**10
    kHashHist4Mult[i] =  i * 0x502c3f11;  // 0x1000193**4
    kHashHist8Mult[i] =  i * 0x5d615f21;  // 0x1000193**8
    kHashHist10Mult[i] = i * 0xb5199319;  // 0xAE8E8215**10
    kHashHist24Mult[i] = i * 0xd1a55a41;  // 0x99ABC1C7**24
  }

  for( i = 0; i<256; i++) {
    a = 8 * (i >> 4);
    b = 8 * (i & 0xF);
    kCmFlagCrap[i] = 
      2 * (a > 0x32 && b > 0x50 || a > 0x46) | 
      4 * (a <= 0x59 || b <= 0x4F) |
      8 * (a > 0x64 || b > 0x5A) |
     16 * (a <= 0x3F || b <= 0x37) |
     32 * (a <= 0x37 || b <= 0x2F) | 
     64 * (a <= 0x1F && b <= 0x11) |
    128 * (a <= 0x17 && b <= 0xD);
  }

}

void LzCreateTables() {
  uint i;

  kDivideLookup[0] = 255;
  for( i=1; i<255; i++ ) kDivideLookup[i] = 1024 / (3 + i * 2);
  kDivideLookup[255] = 1;

  kLzPredSumLookup[0] = 0;
  for( i=0; i<256; i++ ) kLzPredSumLookup[i] = kModelLutLookup[i * 16 + 8];
  kLzPredSumLookup[255] = 4095;
}

// Fixed-point base-2 logarithm of v with `shift` fractional bits.
//
// The integer part is the index of the highest set bit (taken from the low
// 32 bits of v only); the fractional bits are produced one at a time by
// repeatedly squaring the mantissa and testing whether it reached 2.0.
// Returns 0 for v <= 1. Build_kModelInterpolation() calls this with
// shift == 31, so the initial bit weight is computed as an unsigned shift
// (`1 << 31` on a signed int shifts into the sign bit).
qword SomeMathCrap( qword v, byte shift ) {
  if( v<=1 ) return 0;
  uint bits = BSR((uint)v);
  qword result = qword(bits) << shift;   // integer part of the logarithm
  uint q = 1u << shift;                  // weight of the next fractional bit (after halving)
  qword m = v << shift >> bits;          // mantissa in [1,2) scaled by 2^shift
  while( q>>=1 ) {
    m = (m * m) >> shift;
    // Squared mantissa >= 2.0: this fractional bit is set; renormalise.
    if( m >= (qword(2)<<shift) ) { result+=q; m>>=1; }
  }
  return result;
}

// Fill kModelInterpolation[4096] and kModelLutLookup[4096].
//
// kModelInterpolation: the lower half is computed from the fixed-point log2
// of i/(4096-i) (scaled by 4096), then scaled and offset into 12-bit range;
// the upper half is the point mirror of the lower half (4095 - value).
// NOTE(review): this has the shape of a logistic "stretch" table - confirm.
//
// kModelLutLookup: built from kModelInterpolation as a lookup in the opposite
// direction (output value -> input index), again computed for the lower half
// and mirrored into the upper half.
void Build_kModelInterpolation() {
  uint i,j,n,x;

  word* dst = kModelInterpolation;
  dst[0] = 0;
  dst[1] = 0x40;
  for( i=2; i<2048; i++ ) {
    qword v = SomeMathCrap( (4096*i)/(4096-i), 31 );
    dst[i] = (v*170 + 0x43C000000) >> 31;
  }
  for( i=0; i<2048; i++ ) dst[2048+i] = 4096-1 - dst[2048-1-i];

  dst = kModelLutLookup;
  // i walks the output table, j the input index. For each j the gap up to
  // x = kModelInterpolation[j] is filled: first half with j-1, rest with j.
  for( i=0,j=0; j<2048; j++ ) {
    x = kModelInterpolation[j];
    for( n=(x-i)>>1; n; n-- ) dst[i++] = j-1;
    while( i<x ) dst[i++] = j;
  }

  // Fill whatever remains of the lower half the same way.
  for( n=(2048-i)>>1; n; n-- ) dst[i++] = 2048-2;
  while( i<2048 ) dst[i++] = 2048 - 1;

  for( i=0; i<2048; i++ ) dst[2048+i] = 4096-1 - dst[2048-i - 1];
}
//--- #include "CMB.inc"

// Position-history hash tables used by CM_B. Allocated by CMB_Init with
// extra space after the struct for the variable-length `bb` table.
struct CM_T {
  uint aa[0x2000];   // indexed by a 13-bit hash of the last two bytes
  uint cc[0x40000];  // indexed by the low 18 bits of the 4-byte history hash
  uint bb[0];        // trailing table of cmt_mask+1 entries, indexed by the 8-byte history hash
};

struct CM_B {
  // --- hash tables and window -------------------------------------------
  uint cmt_mask;         // index mask for cmt->bb (b_size-1)
  uint mask;             // low bits of a hash-table entry that hold a window position
  uint inverse_mask;     // ~mask: the bits of an entry that hold the hash check
  CM_T* cmt;             // allocated in CMB_Init
  uint* histhash8_ptr;   // current slots in cmt->bb / cc / aa, selected in CM_B_Finished
  uint* histhash4_ptr;
  uint* histhash2_ptr;
  byte* window;          // points 64 bytes past the start of the allocation
  uint window_size;
  uint write_pos;        // next write position in window; wraps to 0 at window_size

  // --- predicted positions (each advances by one per byte) ---------------
  uint hash2pos;
  uint delta1pos;
  uint delta2pos;
  uint hash4pos;
  uint hash8pos;

  // --- per-predictor state words; CM_B_Proc2 derives model indices from them
  short hash2attr;       // negative values take a separate path in NewByte/Proc2
  short hash4attr;       // negative disables the modelx update in CM_B_Proc2
  word hash8attr;
  word last_bytes;       // the two most recent bytes
  word deltaattr;
  byte hash4len;         // current match lengths (0 = no match, saturate at 0x3F)
  byte hash8len;
  byte bestlen;          // Max(hash4len, hash8len); returned by CM_B_Proc2

  // --- predicted next byte of each predictor, read from window in NewByte
  byte hash4byte;
  byte hash8byte;
  byte hash2byte;
  byte delta1byte;
  byte delta2byte;
  byte byte4E;           // running average updated in NewByte when the last byte is 32

  // --- rolling history hashes (updated in CM_B_Finished) -----------------
  uint histhash4;
  uint histbyte4;        // last four bytes packed into one word
  byte byte58;
  uint histhash8;
  byte histbyte8[8];     // ring buffer of the last eight bytes
  byte histpos8;         // ring index into histbyte8

  // --- currently selected model cells (pointer + cached value) -----------
  uint* modelx_ptr;
  uint modelx_cur;

  uint* modely_ptr;
  uint modely_cur;

  word* modelz_ptr;

  byte* modelq_ptr;
  byte modelq_cur;

  // --- delta statistics (see CM_B_NewByte) -------------------------------
  word delta_hist_index; // ring index into delta_hist (0..511)
  byte best_delta;

  static const uint modelx_size = 1024;
  uint modelx[modelx_size];

  static const uint modely_size = 1024;
  uint modely[modely_size];

  static const uint modelz_size = 8194;
  word modelz[8194+2];

  static const uint modelq_size = 0x2000;
  byte modelq[modelq_size+3+1];

  word delta_score[255+1];     // occurrence count of each delta among the entries of delta_hist

  byte delta_hist[512];        // ring buffer of the most recent deltas

  word last_pos_for_byte[256]; // low 16 bits of the position at which each byte value was last seen

//---   #include "CMB_newbyte.inc"

// Per-byte state update of the match/delta predictors.
//
// Uses last_bytes, write_pos and the hash slots/hashes set up by
// CM_B_Finished. In order, it:
//   1. updates the delta statistics (distance between repeats of a byte),
//   2. stores the current position into the 2/4/8-byte history hash slots,
//   3. advances every predicted position by one byte,
//   4. tries to start a new 2-, 4- or 8-byte match from the old slot contents,
//   5. caches the byte each predictor expects next.
void CM_B_NewByte( void ) {
  byte last_byte = (byte)last_bytes;

  // Distance (minus 2, in 16-bit position arithmetic) back to the previous
  // occurrence of the most recent byte; then record its new position.
  uint last_delta = (word)write_pos - 2 - last_pos_for_byte[last_byte];
  last_pos_for_byte[(byte)last_bytes] = (word)write_pos;

  if (last_delta <= 252) {
    // Sliding histogram over the last 512 accepted deltas: drop the entry
    // being overwritten, count the new one.
    word *dp = &delta_score[delta_hist[delta_hist_index]];
    (*dp)--;
    delta_hist[delta_hist_index] = last_delta;

    dp = &delta_score[last_delta];
    // The comparison uses the new delta's count before it is incremented.
    if (*dp >= delta_score[best_delta])
      best_delta = last_delta;
    (*dp)++;
    delta_hist_index = (delta_hist_index + 1) & 0x1FF;
    if (last_delta <= 127 && last_byte == 32) {
      // Exponential moving average (weight 31/32) of 2*delta, rounded.
      byte4E = (31 * byte4E + 2 * last_delta + 16) >> 5;
    }
  }
  // Remember the old slot contents before they are overwritten below.
  uint histhash2_cur = *histhash2_ptr;
  uint histhash4_cur = *histhash4_ptr;
  uint histhash8_cur = *histhash8_ptr;

  // Low three bits of each attr are a counter that saturates at 7.
  hash2attr += (hash2attr & 7u) < 7;
  hash4attr += (hash4attr & 7u) < 7;
  hash8attr += (hash8attr & 7u) < 7;

  // Advance every predicted position, wrapping at the end of the window.
  hash4pos = hash4pos + 1 < window_size ? hash4pos + 1 : 0;
  hash8pos = hash8pos + 1 < window_size ? hash8pos + 1 : 0;
  hash2pos = hash2pos + 1 < window_size ? hash2pos + 1 : 0;
  delta1pos = delta1pos + 1 < window_size ? delta1pos + 1 : 0;
  delta2pos = delta2pos + 1 < window_size ? delta2pos + 1 : 0;

  // Each slot stores the position in the low (mask) bits and the upper bits
  // of the hash as a check value.
  *histhash8_ptr = write_pos | histhash8 & inverse_mask;
  *histhash4_ptr = write_pos | histhash4 & inverse_mask;
  *histhash2_ptr = write_pos | (last_bytes << 16) & inverse_mask;

  if (hash2attr < 0) {
    // Candidate is accepted only if it is at most 0xFF+2 bytes back, its
    // check bits match, the two bytes differ from each other, and the window
    // content really matches.
    uint pos = histhash2_cur & mask;
    if ( !(write_pos - 2 - pos > 0xFF ||
         (histhash2_cur ^ (last_bytes << 16)) & inverse_mask ||
         last_bytes >> 8 == (byte)last_bytes)) {
      if ( *(word *)&window[pos - size_t(2)] == *(word *)&window[write_pos - size_t(2)]) {
        hash2pos = pos;
        // Distance class (0, 1 or 2) stored from bit 10 upwards.
        hash2attr = ((write_pos - pos <= 0x5F) + (write_pos - pos <= 0xF)) << 10;
      }
    }
  }
  // deltaattr counts in steps of 0x400 while it is below 0x1C00, and is
  // reset when the delta predictor missed the last byte.
  word new_deltaattr = deltaattr + (deltaattr < 0x1C00u ? 0x400 : 0);
  if (delta1byte != last_byte) {
    // Re-aim both delta predictors using the currently most frequent delta.
    delta1pos = write_pos - (best_delta + 2) < window_size ? write_pos - (best_delta + 2) : 0;
    delta2pos = write_pos - 2 * (best_delta + 2) < window_size ? write_pos - 2 * (best_delta + 2) : 0;
    new_deltaattr = 0;
  } else {
    new_deltaattr = new_deltaattr & 0xFFFFFE7F;
  }
  // For even best_delta, bits 7..8 carry the position modulo 4.
  if (!(best_delta & 1))
    new_deltaattr |= ((write_pos & 3) << 7);
  deltaattr = new_deltaattr;

  if (hash8len) {
    // Ongoing 8-byte match: extend (saturating) and refresh the length field.
    hash8len += (hash8len < 0x3F);
    hash8attr = hash8attr & ~(15<<5) | 32 * Min<uint>(hash8len - 8, 14);
  } else {
    if ( !((histhash8 ^ histhash8_cur) & inverse_mask) ) {
      uint pos = (histhash8_cur & mask);
      if ( *(qword *)&window[pos - size_t(8)] == *(qword *)&window[write_pos - size_t(8)]) {
        // The 4-byte predictor follows the same position: switch it off.
        if (hash4pos == pos) {
          hash4len = 0;
          hash4attr = -1;
        }
        hash8len = 8;
        hash8pos = pos;
        hash8attr = ((write_pos - pos) < 0x1000) << 9;
      }
    }
  }

  if (hash4len) {
    // Ongoing 4-byte match: extend (saturating) and refresh the length field.
    hash4len += (hash4len < 0x3F);
    hash4attr = hash4attr & ~(15<<5) | 32 * Min<uint>(hash4len - 4, 14);
  } else {
    if ( !((histhash4 ^ histhash4_cur) & inverse_mask) &&
         (*(uint *)&window[(histhash4_cur & mask) - size_t(4)] == *(uint *)&window[write_pos - size_t(4)]) ) {
      hash4len = 4;
      hash4pos = (histhash4_cur & mask);
      hash4attr = ((write_pos - (histhash4_cur & mask)) < 0x1000) << 9;
    }
  }
  bestlen = Max<byte>(hash4len, hash8len);
  // Cache the byte each predictor expects next. delta2byte linearly
  // extrapolates from the two delta positions.
  hash2byte = window[hash2pos];
  delta1byte = window[delta1pos];
  delta2byte = 2 * window[delta1pos] - window[delta2pos];
  hash4byte = window[hash4pos];
  hash8byte = window[hash8pos];
}

//---   #include "CMB_proc2.inc"

// Per-bit step of the match/delta predictors.
//
// For each of the four sub-models (modelx: 4-byte match, modely: 8-byte
// match, modelz: 2-byte match, modelq: delta) this first trains the cell
// selected by the previous call with `bit`, then selects the cell for the
// next bit and writes its contribution to factors[0..3].
//
//   bit_history  compared against (predicted_byte+256) >> bitindex, i.e. a
//                leading 1 followed by the already-known high bits
//   bit          the bit the previously selected cells are trained with
//   bitindex     shift that removes the still-unknown bits of the predicted
//                byte; bit (bitindex-1) of the predicted byte is the
//                predictor's guess for the next bit
//   factors      output, at least 4 entries; values are table outputs
//                centred on 0 (table value - 2048)
// Returns bestlen.
int CM_B_Proc2( int bit_history, int bit, int bitindex, short *factors ) {
  int bitindex_minus1 = bitindex - 1;

  factors[0] = 0;
  // ---- 4-byte match model (skipped entirely while hash4attr is negative)
  if (hash4attr >= 0) {
    uint hash4byte = this->hash4byte + 256, v8;
    if (!hash4len || (hash4byte >> bitindex != bit_history)) {
      if (hash4len) {
        // The predicted byte just diverged from the coded bits: end the match.
        hash4attr = hash4attr & 0xFFFFFE00 | 1;
        hash4len = 0;
        bestlen = hash8len;
      }
      if ((hash4attr & 0xF) == 1 || (hash4byte >> bitindex == bit_history)) {
        v8 = (hash4attr & 0x7FE0) + 2 * (hash4attr & 0xF) + ((hash4byte >> (bitindex - 1)) & 1);
      } else {
        v8 = 480 + ((hash4byte >> (bitindex - 1)) & 1) + 2 * bitindex;
        hash4attr |= 8;
      }
    } else {
      v8 = (hash4attr & 0x7FE0) + ((hash4byte >> (bitindex - 1)) & 1);
    }
    uint *modelx_ptr_new = &modelx[v8];
    // Train the previously selected cell: probability in the upper bits moves
    // toward `bit` at a rate taken from kDivideLookup[count]; the low byte is
    // a hit count that stops growing above 0x27.
    *modelx_ptr = (((bit << 23) - (modelx_cur >> 9)) * kDivideLookup[(byte)modelx_cur] & 0xFFFFFF00) + modelx_cur + ((byte)modelx_cur <= 0x27u);
    modelx_ptr = modelx_ptr_new;
    modelx_cur = *modelx_ptr_new;
    factors[0] = kModelInterpolation[modelx_cur >> 20] - 2048;
  }
  

  // ---- 8-byte match model
  uint hash8byte = this->hash8byte + 256, v13;
  if (!hash8len) {
    if ( (hash8attr & 0xF) == 1 || (hash8byte >> bitindex == bit_history) ) {
      v13 = (hash8attr & 0x7FE0) + 2 * (hash8attr & 0xF) + ((hash8byte >> bitindex_minus1) & 1);
    } else {
      v13 = 480 + 2 * bitindex + ((hash8byte >> bitindex_minus1) & 1);
      hash8attr |= 8;
    }
  } else {
    if (hash8byte >> bitindex != bit_history) {
      // The predicted byte just diverged from the coded bits: end the match.
      hash8attr = hash8attr & 0xFE00 | 1;
      hash8len = 0;
      bestlen = hash4len;
      // NOTE(review): after the assignment above the low nibble is always 1,
      // so the else branch below cannot be taken.
      if ((hash8attr & 0xF) == 1) {
        v13 = (hash8attr & 0x7FE0) + 2 * (hash8attr & 0xF) + ((hash8byte >> bitindex_minus1) & 1);
      } else {
        v13 = 480 + 2 * bitindex + ((hash8byte >> bitindex_minus1) & 1);
        hash8attr |= 8;
      }
    } else {
      v13 = (hash8attr & 0x7FE0) + ((hash8byte >> bitindex_minus1) & 1);
    }
  }
  uint *modely_ptr_new = &modely[v13];
  // Same cell update as modelx (count saturates above 39).
  *modely_ptr = (((bit << 23) - (modely_cur >> 9)) * kDivideLookup[(byte)modely_cur] & 0xFFFFFF00) + modely_cur + ((byte)modely_cur <= 39);
  modely_ptr = modely_ptr_new;
  modely_cur = *modely_ptr_new;
  factors[1] = kModelInterpolation[modely_cur >> 20] - 2048;

  // ---- 2-byte match model: 16-bit probability cells, fixed rate 1/16
  uint v17 = hash2byte + 256;
  *modelz_ptr += ((bit << 16) + 8u - *modelz_ptr) >> 4;

  if (hash2attr < 0)
    goto LABEL_34;
  if (v17 >> bitindex == bit_history) {
    modelz_ptr = &modelz[((hash2attr & 7) << 7) + (hash2attr & 0x7F80) + ((v17 >> bitindex_minus1) & 1) + (bitindex_minus1 & 0xFFFFFFFE)];
    factors[2] = kModelInterpolation[(uint)*modelz_ptr >> 4] - 2048;
  } else {
    // Prediction diverged: make hash2attr negative (0x8000 set) and fall into
    // the shared "no active match" path, which a negative hash2attr also
    // enters directly through the goto above.
    hash2attr = (hash2attr & 0xFFF0) | (Min(hash2attr & 7, 3) << 7) | (bit << 9) | 0x8001;
    bestlen = Max<byte>(hash4len, hash8len);
  LABEL_34:
    if ((hash2attr & 0xF) == 1 || (v17 >> bitindex == bit_history)) {
      modelz_ptr = &modelz[8 * (hash2attr & 0xF) + (hash2attr & 0x7F80) + ((v17 >> bitindex_minus1) & 1) + (bitindex_minus1 & 0xFFFFFFFE)];
      factors[2] = kModelInterpolation[(uint)*modelz_ptr >> 4] - 2048;
    } else {
      // No usable context: park the pointer on a cell at index 0x2000 and
      // contribute nothing.
      modelz_ptr = modelz + 0x2000;
      factors[2] = 0;
      hash2attr |= 8;
    }
  }

  // ---- delta model: 8-bit probability cells, fixed rate 1/8.
  // The index combines the bit position, both delta predictions for the next
  // bit, whether each prediction still agrees with the coded bits, and
  // deltaattr.
  uint v23 = (delta1byte + 256u) >> bitindex_minus1;
  uint v24 = (delta2byte + 256u) >> bitindex_minus1;
  uint modelq_new = 4 * bitindex_minus1 + 2 * (v23 & 1) + deltaattr + (v24 & 1) + (v24 >> 1 == bit_history) * 32 + (v23 >> 1 == bit_history) * 64;

  *modelq_ptr = modelq_cur + (((bit << 8) + 4u - modelq_cur) >> 3);
  modelq_ptr = &modelq[modelq_new];
  modelq_cur = *modelq_ptr;
  factors[3] = kModelInterpolation[16 * modelq_cur] - 2048;
  return bestlen;
}
//---   #include "CMB_init.inc"

void CMB_Init( uint b_size, uint winsize ) {
  cmt_mask = b_size - 1;
  cmt = (CM_T*)_aligned_malloc( sizeof(CM_T) + 4*(cmt_mask+1), 16);
  memset( cmt, 0, sizeof(CM_T) + 4*(cmt_mask+1) );

  window_size = winsize;
  mask = (1 << (BSR(winsize-1) + 1)) - 1;
  inverse_mask = ~mask;
  write_pos = 1;
  window = new byte[winsize+64];
  window += 64;
  memset( window-64, 0, window_size+64 );

  memset(last_pos_for_byte, 0, sizeof(last_pos_for_byte));
  best_delta = 0;
  delta_hist_index = 0;
  memset(delta_hist, 0, 0x200u);
  memset(delta_score, 0, 0x1FEu);
  *delta_score = 512;
  last_bytes = 0;
  hash2pos = 0;
  delta2pos = 0;
  hash2attr = 0;
  delta1pos = 0;
  hash8len = 0;
  hash4len = 0;
  hash8pos = 0;
  hash4pos = 0;
  hash8attr = 0;
  hash4attr = 0;
  deltaattr = 0;
  hash4byte = 0;
  hash8byte = 0;
  hash2byte = 0;
  delta1byte = 0;
  delta2byte = 0;
  bestlen = 0;
  byte4E = 0;

  memset32(modelx, 0x80000000, modelx_size);
  modelx_ptr = modelx;
  modelx_cur = *modelx;

  memset32(modely, 0x80000000, modely_size);
  modely_ptr = modely;
  modely_cur = *modely;

  memset32(modelz, 0x80008000, modelz_size>>1);
  modelz_ptr = modelz;

  memset(modelq, 0x80u, 4 * ((modelq_size + 3) >> 2));
  modelq_ptr = modelq;
  modelq_cur = *modelq;

  histhash4 = 0;
  byte58 = 0;
  histbyte4 = 0;
  histhash8 = 0;
  histpos8 = 0;
  memset(histbyte8, 0, sizeof(histbyte8));
}

//---   #include "CMB_finish.inc"

// Append a completed byte to the window and refresh the three history
// hashes, selecting the hash-table slots that CM_B_NewByte will read and
// overwrite.
void CM_B_Finished( uint out_byte ) {
  // Store the byte and advance the write position, wrapping at window_size.
  window[write_pos] = out_byte;
  const uint next_pos = write_pos + 1;
  write_pos = (next_pos < window_size) ? next_pos : 0;

  // 2-byte context: 13-bit fold of the last two bytes.
  last_bytes = (last_bytes << 8) + out_byte;
  const int ctx2 = last_bytes;
  histhash2_ptr = &cmt->aa[(ctx2 ^ (ctx2 >> 5)) & 0x1FFF];

  // 8-byte rolling hash: add the new byte, remove the contribution of the
  // byte leaving the 8-entry ring, then multiply.
  const byte leaving8 = histbyte8[histpos8];
  histhash8 = 0x1000193 * ((histhash8 + out_byte) - kHashHist8Mult[leaving8]);
  histbyte8[histpos8] = out_byte;
  histpos8 = (histpos8 >= 7) ? 0 : histpos8 + 1;
  histhash8_ptr = &cmt->bb[histhash8 & cmt_mask];

  // 4-byte rolling hash: same scheme, with the history packed in one word
  // whose top byte is the one leaving.
  const uint leaving4 = histbyte4 >> 24;
  histhash4 = 0x1000193 * ((histhash4 + out_byte) - kHashHist4Mult[leaving4]);
  histbyte4 = (histbyte4 << 8) | out_byte;
  histhash4_ptr = &cmt->cc[histhash4 & 0x3FFFF];
}

};


//--- #include "CM.inc"

// Two-word cell of the CM::cmc table; CM_Init resets each cell to
// v0 = 0, v1 = 0x800000.
struct CM_C {
  uint v0, v1;
};

// A bank of 256 32-bit adaptive cells plus the currently selected cell.
// CM_Init fills `model` with 0x80000000; CM_Update writes the trained value
// of `cur` back through `ptr`.
struct CM_ModelE {
  static const uint size = 256;
  uint model[size];
  uint* ptr;   // cell selected for the current bit
  uint cur;    // cached value of that cell
};

struct CM_ModelG {
  static const uint size = 53248;
  uint base_ptr[size];
  uint* cur_ptr;
  uint cur;

  void CM_ModelG_Reset( void ) {
    uint i;
    for( i=0; i<13; i++ ) base_ptr[i] = (kModelLutLookup[i * 4096 / 13 + 157] << 20) + 8;
    for( i=13; i<53248; i++ ) base_ptr[i] = base_ptr[i-13];
    cur_ptr = base_ptr;
    cur = *cur_ptr;
  }
};


struct CM {
  // --- hashed slot storage (see CM_GetHashSlot) ----------------------------
  byte* some_ptr;            // 64-byte-aligned block allocated in CM_Init
  uint some_mask;            // (1<<a_bits)-1, applied to hash<<6 to pick a bucket
  byte* ptrs8[8];            // state bytes selected for the current bit; CM_Update
                             // advances them through kLzModelLNext
  uint hashes_arr28[8];
  uint upper_3bits_history;
  uint bit_history2;         // last 17 coded bits (masked with 0x1FFFF in CM_Update)
  uint out_byte;
  uint next_probability;     // 12-bit prediction for the next bit; starts at 2048
  word out_byte_wip;         // starts at 1 in CM_Init
  int8 factors7_div16;
  byte bit_index_in_byte;
  byte cmb_result;
  byte errorhist;            // one bit per coded bit: set when |error| > 1280
  byte ascii_counter;
  byte flags;                // bits gate which optional models CM_Update trains
  byte errorsum;             // running sum of |error| >> 7
  byte factors7_err_flt;
  byte factors0_err_flt;
  byte errorsum_flt1;
  byte errorsum_flt2;
  byte factors0_err;
  byte factors7_err;
  byte ptrs8_cur[8];         // state values used with ptrs8[] when indexing kLzModelLNext

  // --- rolling history hashes over the last 24 / 10 / 6 bytes --------------
  uint histhash24;
  byte histbyte24[24];
  byte histpos24;
  uint histhash10;
  byte histbyte10[10];
  byte histpos10;
  uint histhash6;
  byte histbyte6[6];
  byte histpos6;

  // --- currently selected cell (pointer + cached value) of each model ------
  byte* modeld_ptr;
  byte modeld_cur;

  uint* modela_ptr;
  uint modela_cur;

  byte* modelb_ptr;
  byte modelb_cur;

  uint* modelc_ptr;
  uint modelc_cur;

  byte* modelf_ptr;
  byte modelf_cur;

  CM_C* cmc_cur;
  uint cmc_v0_mult;
  uint cmc_v1_mult;
  uint next_prob_last;

  // --- model storage (initial values are set in CM_Init) -------------------
  static const uint modeld_size = 0x10000;
  byte modeld[modeld_size];

  uint modela[256];

  static const uint modelb_size = 0x20000;
  byte modelb[modelb_size];

  uint modelc[256];

  static const uint modelf_size = 0x20000;
  byte modelf[modelf_size];

  CM_ModelE modele[8];

  // 12 mixer inputs and 84 sets of 12 signed 16-bit weights; CM_Update adjusts
  // the set pointed to by modelh_ptr in proportion to error * factors[i].
  short factors[12];
  short modelh[84][12];
  short* modelh_ptr;

  CM_B cmb;                  // match/delta sub-model

  static const uint cmc_size = 0x2000;
  CM_C cmc[cmc_size];

  CM_ModelG modelg;

  byte modelu[0x4CA70C];
  byte modelv[0x10000];      // initial target of ptrs8[0]
  byte modelw[0x1000000];    // initial target of ptrs8[1]

//---   #include "CM_hash.inc"

// Clear the 15 bytes at offsets 4, 8, ..., 60 from rp. rp[0] is left
// untouched, as are all offsets that are not multiples of 4.
static void ZeroRp(byte *rp) {
  for( int off=4; off<=60; off+=4 ) rp[off] = 0;
}

// Find or create the slot tagged `b` in a 64-byte bucket near p.
//
// The bucket is the 64-byte-aligned block containing p + 64*(b|1). Its first
// four bytes are tags. On a tag hit the matching lane is returned; otherwise
// the lane whose byte at offset 4..7 is smallest is re-tagged with b and its
// remaining bytes are cleared by ZeroRp. The returned pointer is the lane's
// byte at offset 4+lane, i.e. four bytes past its tag.
static byte *CM_PtrGetX(byte *p, byte b) {
  byte *bucket = (byte *)((uintptr_t)&p[64 * (b | 1)] & ~63);

  for( int lane=0; lane<4; lane++ ) {
    if( bucket[lane]==b ) return bucket + 4 + lane;
  }

  // Pick the lane to replace: a later lane wins only if strictly smaller.
  int victim = (bucket[5] < bucket[4]) ? 1 : 0;
  if( bucket[6] < bucket[4 + victim] ) victim = 2;
  if( bucket[7] < bucket[4 + victim] ) victim = 3;

  byte *slot = bucket + victim;
  *slot = b;
  ZeroRp(slot);
  return slot + 4;
}

// Find or create the slot for `hash` in the hashed slot storage.
//
// The bucket is some_ptr + ((hash<<6) & some_mask) and the tag is the top
// byte of the hash. The bucket's first four bytes are tags. On a tag hit a
// pointer to the matching tag byte is returned; otherwise the lane whose byte
// at offset 4..7 is smallest is re-tagged, its remaining bytes are cleared by
// ZeroRp, and a pointer to its tag byte is returned.
byte* CM_GetHashSlot( uint hash ) {
  const byte tag = hash >> 24;
  byte *bucket = &some_ptr[(hash << 6) & some_mask];

  for( int lane=0; lane<4; lane++ ) {
    if( bucket[lane]==tag ) return bucket + lane;
  }

  // Pick the lane to replace: a later lane wins only if strictly smaller.
  int victim = (bucket[5] < bucket[4]) ? 1 : 0;
  if( bucket[6] < bucket[4 + victim] ) victim = 2;
  if( bucket[7] < bucket[4 + victim] ) victim = 3;

  byte *slot = bucket + victim;
  *slot = tag;
  ZeroRp(slot);
  return slot;
}

//---   #include "CM_init.inc"

// Allocate and reset the complete model state.
//
//   a_bits       log2 of the addressable size of the hashed slot storage
//   b_bits       log2 of the number of entries in the CM_B 8-byte hash table
//   window_size  history window size passed on to CM_B
//
// The slot storage is zeroed immediately after allocation, before
// CM_GetHashSlot(0) is used to pick the initial slot, so the initial slot
// never depends on uninitialised memory. Aborts with a message if the slot
// storage cannot be allocated. kModelLutLookup must already be built
// (CM_ModelG_Reset reads it).
void CM_Init( int a_bits, int b_bits, uint window_size ) {

  some_mask = (1<<a_bits) - 1;
  // NOTE(review): the 16450 bytes of slack beyond the mask come from the
  // original code; their exact derivation is not visible here.
  const size_t slot_bytes = size_t(some_mask) + 16450;
  some_ptr = (byte *)_aligned_malloc(slot_bytes, 64);
  if( some_ptr==0 ) {
    fprintf( stderr, "CM_Init: failed to allocate hash slot storage\n" );
    abort();
  }
  memset(some_ptr, 0, slot_bytes);

  cmb.CMB_Init( 1<<b_bits, window_size );

  // Initial per-bit state pointers: two direct tables, the rest share the
  // slot for hash 0.
  ptrs8[0] = modelv;
  ptrs8[1] = modelw;
  byte* vv = CM_GetHashSlot(0);
  ptrs8[2] = vv;
  ptrs8[3] = vv;
  ptrs8[4] = vv;
  ptrs8[5] = vv;
  ptrs8[6] = vv;
  ptrs8[7] = vv;
  for (uint i = 0; i != 8; i++) {
    memset32(modele[i].model, 0x80000000, modele[i].size);
    modele[i].ptr = modele[i].model;
    modele[i].cur = *modele[i].ptr;
  }
  memset(modeld, 0, 4 * ((modeld_size + 3) >> 2));
  modeld_ptr = modeld;
  modeld_cur = *modeld_ptr;

  memset32(modela, 0x80000000, 0x100u);
  modela_ptr = modela;
  modela_cur = modela[0];

  memset(modelb, 0, 4 * ((modelb_size + 3) >> 2));
  modelb_ptr = modelb;
  modelb_cur = *modelb;

  memset32(modelc, 0x80000000, 0x100u);
  modelc_ptr = modelc;
  modelc_cur = modelc[0];

  memset(modelf, 0x80u, 4 * ((modelf_size + 3) >> 2));
  modelf_ptr = modelf;
  modelf_cur = *modelf;

  modelg.CM_ModelG_Reset();

  next_prob_last = 0;
  cmc_v1_mult = 0;
  cmc_v0_mult = 0;
  cmc_cur = cmc;
  for(uint i = 0; i < cmc_size; i++) {
    cmc[i].v0 = 0;
    cmc[i].v1 = 0x800000;
  }
  errorsum_flt1 = 127;
  errorsum_flt2 = 127;
  histhash24 = 0;
  histpos24 = 0;
  memset(histbyte24, 0, sizeof(histbyte24));
  histhash10 = 0;
  histpos10 = 0;
  memset(histbyte10, 0, sizeof(histbyte10));
  histhash6 = 0;
  histpos6 = 0;
  memset(histbyte6, 0, sizeof(histbyte6));

  memset(ptrs8_cur, 0, sizeof(ptrs8_cur));
  memset(modelv, 0, 0x10000);
  memset(modelw, 0, 0x1000000);
  memset(modelu, 0, 0x4CA70C);
  memset(hashes_arr28, 0, sizeof(hashes_arr28));
  next_probability = 2048;
  out_byte_wip = 1;
  out_byte = 0;
  bit_index_in_byte = 0;
  errorsum = 0;
  factors7_err_flt = 0;
  factors0_err_flt = 0;
  factors7_div16 = 0;
  upper_3bits_history = 0;
  cmb_result = 0;
  errorhist = 0;
  flags = 0;
  bit_history2 = 0;
  ascii_counter = 0;
  factors7_err = 0;
  factors0_err = 0;
  memset(factors, 0, sizeof(factors));
  memset(modelh, 0, sizeof(modelh));
  modelh_ptr = (short *)modelh;
}
//---   #include "CM_update.inc"

// Feeds the just-coded `bit` back into the model and prepares the prediction
// for the next bit. In order, the function:
//   1. adjusts the 12 mixing weights in modelh_ptr[] by the prediction error,
//   2. updates the counters / state bytes that produced the current prediction,
//   3. advances the context pointers (ptrs8[], modeld_ptr): a cheap step
//      inside a byte, a full re-hash when a byte has been completed,
//   4. re-reads every active model into factors[] and mixes them,
//   5. refines the mix through modelg and the cmc table, leaving the final
//      value in next_probability.
// `bit` is used as 0 or 1 throughout (multiplied, shifted and used as an index).
// Bits of `flags` enable the optional models: 0x10 -> 3, 0x20 -> 4 (only with
// 0x10), 0x04 -> 7, 0x40 -> 5, 0x80 -> 6 (only with 0x40); when 0x40 is clear
// the modeld/modela/modelb/modelc group (and modelf under flag 0x02) is used.
void CM_Update( uint bit ) {

  // Prediction error: observed bit scaled to 0/4095 minus the last prediction.
  int error = 4095 * bit - next_probability;
  // Move each weight in proportion to error * input, with rounding, clamped by Clamp16.
  for(int fi = 0; fi != 12; fi++)
    modelh_ptr[fi] = Clamp16(modelh_ptr[fi] + ((((error * 16 * factors[fi]) >> 16) + 1) >> 1));

  uint error_abs = abs(error);
  // errorhist: shift register of "error was above 1280" flags, newest in bit 0.
  errorhist = errorhist * 2 + (error_abs > 1280);
  // errorsum accumulates over the current byte; it is reset when a byte completes.
  errorsum += error_abs >> 7;
  // Last 17 coded bits.
  bit_history2 = (bit + 2 * bit_history2) & 0x1FFFF;

  uint tt;
  
  // Counter update pattern used for every modele[]/modela/modelc entry below:
  // the low byte indexes kDivideLookup to pick the step multiplier and is
  // incremented until it reaches its cap (128 for model 0, 254 for models 1-2,
  // 255 elsewhere); the upper 24 bits are moved by
  // multiplier * ((bit << 23) - (tt >> 9)).
  tt = modele[0].cur;
  *modele[0].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt <= 127);

  tt = modele[1].cur;
  *modele[1].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt <= 253);

  tt = modele[2].cur;
  *modele[2].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt <= 253);

  // State bytes step through the kLzModelLNext transition table (index = state*2 + bit).
  *ptrs8[0] = kLzModelLNext[ptrs8_cur[0] * 2 + bit];
  *ptrs8[1] = kLzModelLNext[ptrs8_cur[1] * 2 + bit];
  *ptrs8[2] = kLzModelLNext[ptrs8_cur[2] * 2 + bit];

  // NOTE(review): bit_shift_23 is never read in this function.
  uint bit_shift_23 = bit << 23;

  if (flags & 0x10) {
    tt = modele[3].cur;
    *modele[3].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
    *ptrs8[3] = kLzModelLNext[ptrs8_cur[3] * 2 + bit];

    if (flags & 0x20) {
      tt = modele[4].cur;
      *modele[4].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
      *ptrs8[4] = kLzModelLNext[ptrs8_cur[4] * 2 + bit];
    }
  }

  if (flags & 4) {
    tt = modele[7].cur;
    *modele[7].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
    *ptrs8[7] = kLzModelLNext[ptrs8_cur[7] * 2 + bit];
  }

  if (flags & 0x40) {
    tt = modele[5].cur;
    *modele[5].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
    *ptrs8[5] = kLzModelLNext[ptrs8_cur[5] * 2 + bit];
    if (flags & 0x80) {
      tt = modele[6].cur;
      *modele[6].ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
      *ptrs8[6] = kLzModelLNext[ptrs8_cur[6] * 2 + bit];
    }
  } else {
    // Per-byte error totals for inputs 0 and 7; they are folded into the
    // *_err_flt averages and cleared when the byte completes.
    factors0_err += abs((int)((bit << 8) - 128 - (factors[0] >> 4))) >> 3;
    factors7_err += abs((int)((bit << 8) - 128 - factors7_div16)) >> 3;

    *modeld_ptr = kLzModelLNext[modeld_cur * 2 + bit];

    tt = modela_cur;
    *modela_ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);

    *modelb_ptr = kLzModelLNext[modelb_cur * 2 + bit];

    tt = modelc_cur;
    *modelc_ptr = ((kDivideLookup[(byte)tt] * ((bit << 23) - (tt >> 9))) & ~0xFF) + tt + ((byte)tt != 255);
    
    // modelf entry moves 1/32 of the way toward bit*256 (with a +16 rounding term).
    if (flags & 2)
      *modelf_ptr = modelf_cur + ((16 - modelf_cur + (bit << 8)) >> 5);
  }

  // out_byte_wip collects the bits of the current byte below a leading 1
  // (it is reset to 1 per byte), so a value above 0xFF means 8 bits are in.
  out_byte_wip = out_byte_wip * 2 + bit;

  if (out_byte_wip <= 0xFF) {
    bit_index_in_byte = bit_index_in_byte + 1;
    if (bit_index_in_byte == 4) {
      // Half-byte boundary: re-seat every active pointer using the first four bits.
      ptrs8[0] = &modelv[15 * (out_byte_wip - 15) + ((byte)out_byte << 8)];
      ptrs8[1] = &modelw[15 * (out_byte_wip - 15) + hashes_arr28[1]];
      ptrs8[2] = CM_PtrGetX(ptrs8[2], (byte)(out_byte_wip ^ hashes_arr28[2]));
      if (flags & 0x10) {
        ptrs8[3] = CM_PtrGetX(ptrs8[3], (byte)(out_byte_wip ^ hashes_arr28[3]));
        if (flags & 0x20) {
          ptrs8[4] = CM_PtrGetX(ptrs8[4], (byte)(out_byte_wip ^ hashes_arr28[4]));
        }
      }
      if (flags & 4) {
        // Model 7 is addressed directly in modelu while its context value is
        // below 27^3, and through CM_PtrGetX otherwise.
        if (hashes_arr28[7] >= 27*27*27) {
          ptrs8[7] = CM_PtrGetX(ptrs8[7], (byte)(out_byte_wip ^ hashes_arr28[7]));
        } else {
          uint tmp = 17 * hashes_arr28[7] + 1 + (out_byte_wip & 0xF);
          ptrs8[7] = &modelu[60 * (tmp >> 2) + (tmp & 3)];
        }
      }
      if (flags & 0x40) {
        ptrs8[5] = CM_PtrGetX(ptrs8[5], (byte)(out_byte_wip ^ hashes_arr28[5]));
        if (flags & 0x80)
          ptrs8[6] = CM_PtrGetX(ptrs8[6], (byte)(out_byte_wip ^ hashes_arr28[6]));
      } else {
        modeld_ptr = &modeld[15 * (out_byte_wip - 15) + (out_byte & 0xFF00)];
      }
    } else {
      // Inside a half-byte: step every pointer by an offset derived from the
      // bit and its position. All of ptrs8[2..7] are stepped regardless of
      // flags; pointers 2..7 use four times the stride of pointers 0..1.
      uint incr = (bit + 1) << ((bit_index_in_byte & 3) - 1);
      modeld_ptr += incr;
      ptrs8[0] += incr;
      ptrs8[1] += incr;
      incr *= 4;
      ptrs8[2] += incr;
      ptrs8[3] += incr;
      ptrs8[4] += incr;
      ptrs8[5] += incr;
      ptrs8[6] += incr;
      ptrs8[7] += incr;
    }
  } else {
    // byte finished
    // Strip the leading 1 so out_byte_wip holds the plain byte value.
    out_byte_wip -= 256;

    (cmb).CM_B_Finished( out_byte_wip );

    hashes_arr28[0] = 32 * out_byte_wip;
    ptrs8[0] = &modelv[256 * out_byte_wip];
    // out_byte keeps the most recent bytes, newest in the low 8 bits.
    out_byte = (out_byte << 8) + out_byte_wip;

    hashes_arr28[1] = (word)out_byte << 8;
    ptrs8[1] = &modelw[hashes_arr28[1]];

    hashes_arr28[2] = (out_byte << 8) ^ out_byte_wip ^ 0xFFFFFF00;
    uint hash2 = 33595399 * hashes_arr28[2] ^ (hashes_arr28[2] >> 11);
    hashes_arr28[3] = out_byte;

    // Rolling hash over the last 6 bytes: add the new byte, remove the
    // contribution of the byte leaving the 6-entry ring, then multiply.
    histhash6 += out_byte_wip;
    histhash6 = 0x2f208239 * (histhash6 - kHashHist6Mult[histbyte6[histpos6]]);
    histbyte6[histpos6] = (byte)out_byte_wip;
    histpos6 = histpos6 < 5 ? (histpos6 + 1) : 0;
    hashes_arr28[4] = histhash6;

    // Re-select the model set from two smoothed per-byte error levels,
    // but only while the previous cmb result is at most 31.
    if (cmb_result <= 31) {
      // NOTE(review): Min<byte> converts errorsum to a byte before clamping,
      // so a value above 255 wraps first - confirm this is intended.
      errorsum = Min<byte>(errorsum, 127);
      errorsum_flt1 = (3 * errorsum_flt1 + errorsum) >> 2;
      errorsum_flt2 = (3 * errorsum_flt2 + 2 + errorsum_flt1) >> 2;
      flags = kCmFlagCrap[16 * (errorsum_flt1 >> 3) + (errorsum_flt2 >> 3)];
    }
    errorsum = 0;

    uint hash3 = 0, hash4 = 0;
    if (flags & 0x10) {
      hash3 = 33595399 * hashes_arr28[3] ^ (hashes_arr28[3] >> 11);
      if (flags & 0x20)
        hash4 = hashes_arr28[4] ^ _rotr(hashes_arr28[4], 11);
    }

    // Same rolling-hash scheme over the last 10 bytes ...
    histhash10 += out_byte_wip;
    histhash10 = 0xAE8E8215 * (histhash10 - kHashHist10Mult[histbyte10[histpos10]]);
    histbyte10[histpos10] = (byte)out_byte_wip;
    histpos10 = histpos10 < 9 ? histpos10 + 1 : 0;

    // ... and over the last 24 bytes. Both are maintained even when unused.
    histhash24 += out_byte_wip;
    histhash24 = 0x99ABC1C7 * (histhash24 - kHashHist24Mult[histbyte24[histpos24]]);
    histbyte24[histpos24] = (byte)out_byte_wip;
    histpos24 = histpos24 < 23 ? histpos24 + 1 : 0;

    uint hash6 = 0;
    uint hash5 = 0;

    if (flags & 0x40) {
      hashes_arr28[5] = histhash10;
      hash5 = histhash10 ^ _rotr(histhash10, 11);
      if (flags & 0x80) {
        hashes_arr28[6] = histhash24;
        hash6 = histhash24 ^ _rotr(histhash24, 11);
      }
    }
    uint hash7;

    // Letter context for model 7. A-Z is folded onto a-z; letters are then
    // mapped to 1..26. Bytes above 0x7F also qualify while cmb.byte4E <= 19.
    // ascii_lower is only read when is_ascii ends up true.
    bool is_ascii = false;
    byte ascii_lower;

    if (flags & 4) {
      ascii_lower = out_byte_wip ^ ((uint)(out_byte_wip - 'A') < 26 ? 0x20 : 0);
      if ((uint)(ascii_lower - 'a') > 25) {
        is_ascii = (ascii_lower > 0x7F && cmb.byte4E <= 19);
      } else {
        is_ascii = ((ascii_lower -= 96) && (ascii_lower <= 0x7F || cmb.byte4E <= 19));
      }
    }
    if (is_ascii) {
      // Extend the running context by one symbol in base 27.
      uint ascii_hash = ascii_lower + 27 * hashes_arr28[7];
      hashes_arr28[7] = ascii_hash;

      // Precedence: this parses as ascii_counter | ((ascii_lower >> 7) & (ascii_counter == 0)).
      ascii_counter = (ascii_counter | (ascii_lower >> 7) & (ascii_counter == 0)) + 2;
      // Model 7 is switched off once the counter reaches bit 7.
      if (ascii_counter & 0x80)
        flags &= ~4;

      if (ascii_hash < 27*27*27) {
        ptrs8[7] = &modelu[60 * (17 * ascii_hash >> 2) + (17 * ascii_hash & 3)];
        hash7 = 0;
      } else {
        hash7 = 33595399 * ascii_hash ^ (ascii_hash >> 11);
      }
    } else {
      ascii_counter = 0;
      hashes_arr28[7] = 0;
      ptrs8[7] = modelu;
      hash7 = 0;
    }

    upper_3bits_history = (out_byte_wip & 0xE0) + (8 * upper_3bits_history & 0x1FFFF);

    if (!(flags & 0x40)) {
      modeld_ptr = &modeld[out_byte & 0xFF00];
      // 1/16 exponential averages of the per-byte error totals.
      factors0_err_flt = (factors0_err + 15 * factors0_err_flt + 8u) >> 4;
      factors7_err_flt = (factors7_err + 15 * factors7_err_flt + 8u) >> 4;
      factors0_err = 0;
      factors7_err = 0;
    }

    (cmb).CM_B_NewByte();

    out_byte_wip = 1;
    bit_index_in_byte = 0;

    // Look up the slots for the hashed contexts of the new byte.
    // NOTE(review): CM_GetHashSlot is defined outside this block; the "+ 4"
    // presumably skips a slot header - confirm against its definition.
    ptrs8[2] = CM_GetHashSlot(hash2) + 4;
    if (flags & 0x10) {
      ptrs8[3] = CM_GetHashSlot(hash3) + 4;
      if (flags & 0x20)
        ptrs8[4] = CM_GetHashSlot(hash4) + 4;
    }
    // Placeholder targets for models 5 and 6 unless flag 0x40 re-seats them below.
    ptrs8[5] = modelv;
    ptrs8[6] = modelv;

    if ((flags & 4) && hashes_arr28[7] >= 27*27*27)
      ptrs8[7] = CM_GetHashSlot(hash7) + 4;

    if (flags & 0x40) {
      ptrs8[5] = CM_GetHashSlot(hash5) + 4;
      if (flags & 0x80)
        ptrs8[6] = CM_GetHashSlot(hash6) + 4;
    }
  }
  // cmb is given &factors[8], i.e. the slots from index 8 upward; its return
  // value is used below to pick the weight row and to gate factors[7].
  byte _cmb_result = (cmb).CM_B_Proc2( out_byte_wip, bit, 8 - bit_index_in_byte, &factors[8]  );
  // For each active model: read the state byte at its pointer, use it to
  // select a counter in the model's table, and turn the counter's top 12 bits
  // into a mixer input through kModelInterpolation, centred by -2048.
  ptrs8_cur[0] = *ptrs8[0];
  modele[0].ptr = &modele[0].model[ptrs8_cur[0]];
  modele[0].cur = *modele[0].ptr;
  factors[0] = kModelInterpolation[modele[0].cur >> 20] - 2048;

  ptrs8_cur[1] = *ptrs8[1];
  modele[1].ptr = &modele[1].model[ptrs8_cur[1]];
  modele[1].cur = *modele[1].ptr;
  factors[1] = kModelInterpolation[modele[1].cur >> 20] - 2048;

  // hist_and stays 1 only while every model seen so far (from model 1 on) has
  // a non-zero state; hist_sum counts how many models in a row that held for.
  byte hist_and = ptrs8_cur[1] != 0, hist_sum = hist_and;

  ptrs8_cur[2] = *ptrs8[2];
  modele[2].ptr = &modele[2].model[ptrs8_cur[2]];
  modele[2].cur = *modele[2].ptr;
  factors[2] = kModelInterpolation[modele[2].cur >> 20] - 2048;

  hist_and &= (ptrs8_cur[2] != 0);
  hist_sum += hist_and;

  // Inputs of optional models default to 0 (no contribution to the mix).
  factors[3] = 0;
  factors[4] = 0;
  factors[5] = 0;
  factors[6] = 0;

  if (flags & 0x10) {
    ptrs8_cur[3] = *ptrs8[3];
    modele[3].ptr = &modele[3].model[ptrs8_cur[3]];
    modele[3].cur = *modele[3].ptr;
    factors[3] = kModelInterpolation[modele[3].cur >> 20] - 2048;
    hist_and &= (ptrs8_cur[3] != 0);
    hist_sum += hist_and;
    if (flags & 0x20) {
      ptrs8_cur[4] = *ptrs8[4];
      modele[4].ptr = &modele[4].model[ptrs8_cur[4]];
      modele[4].cur = *modele[4].ptr;
      factors[4] = kModelInterpolation[modele[4].cur >> 20] - 2048;
      hist_and &= (ptrs8_cur[4] != 0);
      hist_sum += hist_and;
    }
  }
  ptrs8_cur[5] = 0;
  ptrs8_cur[7] = 0;
  // Note the crossed slots: model 7 feeds factors[5], model 5 feeds factors[7].
  // Model 7 does not take part in hist_and / hist_sum.
  if (flags & 4) {
    ptrs8_cur[7] = *ptrs8[7];
    modele[7].ptr = &modele[7].model[ptrs8_cur[7]];
    modele[7].cur = *modele[7].ptr;
    factors[5] = kModelInterpolation[modele[7].cur >> 20] - 2048;
  }
  if (flags & 0x40) {
    ptrs8_cur[5] = *ptrs8[5];
    modele[5].ptr = &modele[5].model[ptrs8_cur[5]];
    modele[5].cur = *modele[5].ptr;
    factors[7] = kModelInterpolation[modele[5].cur >> 20] - 2048;
    hist_and &= (ptrs8_cur[5] != 0);
    hist_sum += hist_and;
    if (flags & 0x80) {
      ptrs8_cur[6] = *ptrs8[6];
      modele[6].ptr = &modele[6].model[ptrs8_cur[6]];
      modele[6].cur = *modele[6].ptr;
      factors[6] = kModelInterpolation[modele[6].cur >> 20] - 2048;
      hist_and &= (ptrs8_cur[6] != 0);
      hist_sum += hist_and;
    }
  }

  // Alternative model group used when the long-history models (flag 0x40) are off.
  if (!(flags & 0x40)) {
    if (flags & 2) {
      // modelf is indexed by the last 17 bits and overrides factors[4].
      modelf_ptr = &modelf[bit_history2];
      modelf_cur = *modelf_ptr;
      factors[4] = 16 * modelf_cur - 2048;
    }
    modeld_cur = *modeld_ptr;

    modela_ptr = &modela[modeld_cur];
    modela_cur = *modela_ptr;

    // Unlike the other inputs this one does not go through kModelInterpolation.
    factors[7] = (modela_cur >> 20) - 2048;
    // factors7_div16 keeps the value for the error tracking above even when
    // the input itself is zeroed just below.
    factors7_div16 = factors[7] >> 4;
    if ( hist_sum > 2 || _cmb_result > 4 || (factors0_err_flt <= factors7_err_flt) || factors7_err_flt > 0x6Eu )
      factors[7] = 0;

    if ( bit_index_in_byte > 4u || hist_sum > 3 || flags & 1 ) {
      // Fixed slot 123: modelb/modelc pointers stay valid for the next update,
      // but factors[6] is left at 0.
      modelb_ptr = &modelb[123];
      modelb_cur = *modelb_ptr;
      modelc_ptr = &modelc[modelb_cur];
      modelc_cur = *modelc_ptr;
    } else {
      modelb_ptr = &modelb[upper_3bits_history ^ out_byte_wip];
      modelb_cur = *modelb_ptr;
      modelc_ptr = &modelc[modelb_cur];
      modelc_cur = *modelc_ptr;
      factors[6] = kModelInterpolation[modelc_cur >> 20] - 2048;
    }
  }

  // Context for the modelg stage: last three large-error flags, the partial
  // byte, and whether hist_sum exceeded 3.
  uint modelg_index = ((errorhist & 7) << 8) + out_byte_wip + 2048 * (hist_sum > 3);
  
  cmb_result = _cmb_result;

  // Pick the weight row for the next bit from the model-7 state / ascii_counter,
  // the cmb result bucket and hist_sum.
  byte cm_bc = (ptrs8_cur[7] && !(ascii_counter & 1)) ? (ascii_counter > 3) + (ascii_counter > 9) + 1 : 0;
  modelh_ptr = modelh[cm_bc + 4 * ((_cmb_result > 4) + (_cmb_result > 9) + 3 * hist_sum)];

  // Mix: weighted sum of the 12 inputs, rescaled, re-centred and clamped to [0, 4095].
  int fsum = 0;
  for(uint ii = 0; ii != 12; ii++)
    fsum += factors[ii] * modelh_ptr[ii];
  next_probability = Min(Max((fsum >> 16) + 2048, 0), 4095);

  uint modelg_value = kLzPredSumLookup[next_probability >> 4];
  // hv: mixed prediction rescaled so that hv >> 12 selects one of the 13
  // entries per modelg context and hv & 0xfff is the position between two entries.
  uint hv = 12 * next_probability;
  
  // Cap for the low (count) byte of the modelg entry being updated.
  uint gmax = Min<uint>(1 << (hist_sum + (flags & 8) + 5), 255) - (hist_sum == 4);
 
  // If the count byte is already above the cap, pull it down to the cap.
  uint adjust = (byte)modelg.cur > gmax ? gmax - (byte)modelg.cur : 0;
  
  // Update the modelg entry selected for the bit just coded (same counter
  // scheme as above, with gmax as the count cap).
  uint new_val = adjust + ((byte)modelg.cur < gmax) + modelg.cur + ((kDivideLookup[(byte)modelg.cur] * ((bit << 23) - (modelg.cur >> 9))) & ~0xFF);;
  *modelg.cur_ptr = new_val;

  // Select the entry pair for the next bit, interpolate between the two
  // entries, and blend 3:1 with modelg_value.
  modelg.cur_ptr = &modelg.base_ptr[(hv >> 12) + modelg_index * 13];
  next_probability = (modelg_value + 3 * (((hv & 0xfff) * (modelg.cur_ptr[1] >> 12) + (4096 - (hv & 0xfff)) * (modelg.cur_ptr[0] >> 12)) >> 20) + 2) >> 2;
  // Remember the nearer of the two entries as the one to update next time.
  modelg.cur_ptr += (hv & 0xfff) >> 11;
  modelg.cur = *modelg.cur_ptr;

  // cmc stage: a two-term correction (v0 with a fixed multiplier of 512, v1
  // with a multiplier derived from the prediction) selected by the previous
  // byte, the bit just coded, the newest errorhist flag and the bit position.
  uint cmc_index = hashes_arr28[0] + bit + 2 * ((errorhist & 1) + 2 * bit_index_in_byte);
  uint cmcmult = 2 * kModelInterpolation[next_probability] - 4096;
  // Train the entry used for the previous bit with that bit's error ...
  int errx = bit * 4096 - next_prob_last;
  cmc_cur->v0 += cmc_v0_mult * (errx >> 4);
  cmc_cur->v1 += cmc_v1_mult * (errx >> 4);
  // ... then switch to the entry for the next bit and evaluate it.
  cmc_cur = &cmc[cmc_index];
  cmc_v0_mult = 512;
  cmc_v1_mult = cmcmult;
  int v168 = (int)((((int)cmc_cur->v0 >> 16) << 9) + ((int)cmc_cur->v1 >> 16) * cmcmult + 128) >> 8;
  next_prob_last = kModelLutLookup[Min(Max(v168 + 2048, 0), 4095)];

  // Final prediction: 1 part modelg stage, 3 parts cmc stage, rounded;
  // values below 0x800 are bumped by one.
  next_probability = (next_probability + 3 * next_prob_last + 2) >> 2;
  next_probability += (next_probability < 0x800);
}

};


//#include "valloc.inc"

//--- #include "coro3b_fake.inc"

// Stand-in for a real coroutine layer: the "coroutine" body runs to completion
// inside a single coro_call(), and byte I/O goes straight to two stdio
// streams. The buffer-related entry points exist only so that the driver code
// compiles unchanged; here they do nothing.
struct Coroutine {

  FILE* _f;                 // stream read by get()
  FILE* _g;                 // stream written by put()
  uint f_quit;              // cleared by coro_init(); not read inside this struct
  byte* __restrict outptr;  // reset to null by coro_init(); never advanced here
  byte* __restrict outbeg;  // reset to null by coro_init(); never advanced here

  // Resets the quit flag and both output pointers.
  void coro_init( void ) {
    f_quit = 0;
    outbeg = 0;
    outptr = 0;
  }

  // Runs that->do_process() synchronously; always reports 0.
  template <typename T>
  uint coro_call( T* that ) {
    that->do_process();
    return 0;
  }

  // Reads one byte from _f; the result of getc() is returned converted to
  // uint, so end-of-file shows up as (uint)-1.
  uint get( void ) {
    int ch = getc( _f );
    return ch;
  }

  // Writes one byte to _g (putc() keeps only the low 8 bits of c).
  void put( uint c ) {
    putc( c, _g );
  }

  // No-op: input is read directly from _f.
  void addinp( byte* inp,uint inplen ) {
  }

  // No-op: output is written directly to _g.
  void addout( byte* out,uint outlen ) {
  }

  // No-op: there is nothing to switch back to.
  void yield( void* p, int value ) {
  }

};

//--- #include "coro_fp2.inc"

// Drives a coroutine-style Model over a pair of stdio streams.
//
// Model must provide coro_init(), coro_call(), addinp(), addout() and the
// members f_quit / outptr / outbeg. All of them live in the dependent base
// class, so every access is qualified (this-> or a using-declaration);
// unqualified calls are not looked up in a dependent base under standard
// two-phase name lookup and only compile in permissive modes.
//
// inpbufsize / outbufsize are the sizes in bytes of the two staging buffers.
template < class Model, int inpbufsize=1<<16, int outbufsize = 1<<16 > 
struct CoroFileProc : Model {

  using Model::f_quit;
  using Model::outptr;
  using Model::outbeg;

  ALIGN(4096) byte inpbuf[inpbufsize];
  ALIGN(4096) byte outbuf[outbufsize];

  // Runs the model until it reports completion.
  //   f - stream that input blocks are read from when the model asks (r==1)
  //   g - stream that pending output [outbeg,outptr) is flushed to
  // Result codes of coro_call(): 1 = needs input, 2 = output buffer full,
  // anything else (0 = quit, 3 = error) flushes and stops.
  void processfile( FILE* f, FILE* g ) {
    uint l,r;
    this->coro_init();
    this->addout( outbuf, outbufsize );
    while( 1 ) {
      r = this->coro_call(this); 
      if( r==1 ) {
        l = fread( inpbuf, 1, inpbufsize, f );
        if( l==0 ) f_quit=1; // get/put don't support repeated failures
        this->addinp( inpbuf, l ); 
      } else { // r0=quit, r3=error
        l = outptr-outbeg;
        if( l>0 ) fwrite( (byte*)outbeg, 1,l, g ); // flush
        if( r!=2 ) break;
        this->addout( outbuf, outbufsize );
      } // if
    } // while
  } // func

};

//--- #include "sh_v1x.inc"

// Fixed-point probability scale used by the binary range coder interface.
static const int SCALElog = 15;             // bits of probability precision
static const int SCALE    = 1 << SCALElog;  // 32768: probability "one"
static const int eSCALE   = SCALE << 4;     // 16*SCALE
static const int hSCALE   = SCALE >> 1;     // SCALE/2
static const int mSCALE   = SCALE - 1;      // largest value below SCALE

template< int f_DEC >
struct Rangecoder : Coroutine {

  enum {
    NUM   = 4,
    sTOP  = 0x01000000U,
    gTOP  = 0x00010000U,
    Thres = 0xFF000000U,
    Threg = 0x00FF0000U
  };

//  int   f_DEC; // 0=encode, 1=decode;
  union {
    struct {
      uint  low;  
      uint  Carry;
    };
    qword lowc;
    uint  code; 
  };
  uint  FFNum;
  uint  Cache;
  uint  range;

  void rc_Process( uint cumFreq, uint freq, uint totFreq ) {
    uint tmp = cumFreq*range;
    if( f_DEC ) code-=tmp; else lowc+=tmp;
    range *= freq;
    Renorm();
  }


  void rc_Arrange( uint totFreq ) {
    range /= totFreq;
  }

  uint rc_GetFreq( uint totFreq ) {
    return code/range;
  }

  void Renorm( void ) {
    if( f_DEC ) {
      while( range<sTOP ) range<<=8, (code<<=8)+=get();
//      if_e1( range<gTOP ) range<<=16, (code<<=16)+=(get()<<8)+get(); else if_e0( range<sTOP ) range<<=8, (code<<=8)+=get();
    } else {
      while( range<sTOP ) range<<=8, ShiftLow();
//      if_e1( range<gTOP ) range<<=16, ShiftLow2(); else if_e0( range<sTOP ) range<<=8, ShiftLow();
    }
  }

//  NOINLINE
  uint rc_BProcess( uint freq, uint bit ) { 

    Renorm();

//    freq = ptrim(freq);

    uint r1 = range>>SCALElog;
    uint rnew = r1*freq;

    if( f_DEC ) {

      if( code<rnew ) {
        range = rnew;
        return 0;
      } else {
        range -= rnew;
        if( f_DEC ) code -= rnew; else lowc += rnew;
        return 1;
      }

    } else {

      if( bit==0 ) {
        range = rnew;
        return 0;
      } else {
        range -= rnew;
        if( f_DEC ) code -= rnew; else lowc += rnew;
        return 1;
      }

    }

  }

  void ShiftLow( void ) {
    if( low<Thres || Carry ) {
      put( Cache+Carry );
      for (;FFNum != 0;FFNum--) put( Carry-1 ); // (Carry-1)&255;
      Cache = low>>24;
      Carry = 0;
    } else FFNum++;
    low<<=8;
  }

  void ShiftLow2( void ) {
    if( low<Thres || Carry ) {
      put( Cache+Carry );
      for (;FFNum != 0;FFNum--) put( Carry-1 ); // (Carry-1)&255;
      Cache = low>>24;
      Carry = 0;
    } else FFNum++;
    low &= sTOP-1;
    if( low<Threg ) {
      put( Cache );
      for(; FFNum!=0; FFNum-- ) put( 0xFF ); // (Carry-1)&255;
      Cache = low>>16;
    } else FFNum++;
    low<<=16;
  }

  void rcInit( void ) { 
    range = 0xFFFFFFFF;
    low   = 0;
    FFNum = 0;
    Carry = 0;    
    Cache = 0;
  }
  
  void rc_Init( void ) {
    rcInit();
    if( f_DEC==1 ) {
      for(int _=0; _<NUM+1; _++) (code<<=8)+=get(); 
    }
  }

  void rc_Quit( void ) {
    if( f_DEC==0 ) {
      for(int _=0; _<NUM+1; _++) ShiftLow(); 
    }
  }

};



// Glue between the range coder and the context-mixing predictor (CM):
// codes f_len bytes bit by bit, MSB first, using CM's next_probability as
// the prediction and feeding every coded bit back through CM_Update().
// f_DEC: 0 = encode (bytes come from get()), 1 = decode (bytes go to put()).
//
// Members inherited from Rangecoder<f_DEC> are reached through this->,
// because names in a dependent base class are not found by unqualified
// lookup under standard two-phase name lookup.
template< int f_DEC >
struct Model : Rangecoder<f_DEC>,CM {

  uint f_len;  // number of bytes to code; set by the caller before do_process()

  void Init( void ) {
  }

  void Quit( void ) {
  }

  // Codes the whole stream. The encoder stops early if get() reports
  // end-of-file before f_len bytes were read.
  void do_process( void ) {
    uint i,k,sym,bit; int p;
    uint c = 0;  // only meaningful when encoding; initialised so the decoder never reads an indeterminate value

    this->rc_Init();

    for( i=0; i<f_len; i++ ) {
      if( f_DEC==0 ) { c = this->get(); if( c==(uint)-1 ) break; }

      // sym starts as a lone 1 and collects the bits below it; it reaches
      // 0x100 or more after eight bits, which ends the loop.
      for( sym=1,k=7; sym<0x100; k-- ) {
        bit=0; if( f_DEC==0 ) bit=(c>>k)&1;

        // Convert the model's 12-bit value into the coder's SCALE-based
        // frequency for a 0 bit, kept strictly inside (0, SCALE).
        p = next_probability;
        p = SCALE-(p<<(SCALElog-12));
        if_e0( p<1 ) p=1;
        if_e0( p>mSCALE ) p=mSCALE;

        bit = this->rc_BProcess(p,bit);

        CM_Update( bit );

        sym += sym+bit;
      }

      if( f_DEC==1 ) this->put(sym);
    }

    this->rc_Quit();

    this->yield(this,0);
  }

};


// Encoder (C, Model<0>) and decoder (D, Model<1>) instances overlaid in one
// anonymous union: main() uses exactly one of them per run, so the two can
// share the same static storage.
static union {
  CoroFileProc< Model<0> > C;
  CoroFileProc< Model<1> > D;
};

// Command-line driver.
//   argv[1]  mode: a first character of 'd' selects decoding, anything else encoding
//   argv[2]  input file
//   argv[3]  output file
// The compressed stream starts with the original length as 4 raw bytes
// (written and read in host byte order).
// Exit codes: 0 ok, 1 too few arguments, 2 cannot open input,
//             3 cannot open output, 4 length header could not be written/read,
//             5 closing (flushing) the output failed.
int main( int argc, char** argv ) {

  if( argc<4 ) return 1;

  Build_kModelInterpolation();
  CM_InitTables();
  LzCreateTables();

  uint f_DEC = (argv[1][0]=='d');
  FILE* f = fopen(argv[2],"rb"); if( f==0 ) return 2;
  // Do not leak the input stream when the output cannot be created.
  FILE* g = fopen(argv[3],"wb"); if( g==0 ) { fclose(f); return 3; }

  int rc = 0;
  uint f_len = 0;

  if( f_DEC==0 ) {
    f_len = flen(f);
    if( fwrite( &f_len,1,4,g )!=4 ) {
      rc = 4;
    } else {
      C.f_len = f_len; C._f=f; C._g=g;
      C.CM_Init( 28, 25, 1048576 );
      C.processfile( f, g );
    }
  } else {
    // A short read means the input is not a complete stream; decoding with a
    // partial length would produce garbage, so stop here.
    if( fread( &f_len,1,4,f )!=4 ) {
      rc = 4;
    } else {
      D.f_len = f_len; D._f=f; D._g=g;
      D.CM_Init( 28, 25, 1048576 );
      D.processfile( f, g );
    }
  }

  // fclose() flushes buffered output; a failure here means data was lost.
  if( fclose(g)!=0 && rc==0 ) rc = 5;
  fclose(f);

  return rc;
}
