lzari.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678
  1. #include "StdAfx.h"
  2. #include "Lzari.h"
  3. //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4. CLzariEx::CLzariEx(void)
  5. :m_nInCur(0)
  6. ,m_pSrcData(NULL)
  7. ,m_textsize(0)
  8. ,m_codesize(0)
  9. ,m_printcount(0)
  10. ,m_low(0)
  11. ,m_high(Q4)
  12. ,m_value(0)
  13. ,m_shifts(0)
  14. {
  15. m_match_length = 0;
  16. m_match_position = 0;
  17. m_nDestLength = 0;
  18. m_nSrcLength = 0;
  19. m_buffer_putbit = 0;
  20. m_mask_putbit = 128;
  21. m_buffer_getbit = 0;
  22. m_mask_getbit = 0;
  23. memset(m_dad, 0, N+1);
  24. memset(m_lson, 0, N+1);
  25. memset(m_rson, 0, N+257);
  26. memset(m_char_to_sym, 0, N);
  27. memset(m_sym_to_char, 0, N_CHAR+1);
  28. memset(m_sym_freq, 0, N_CHAR+1);
  29. memset(m_sym_cum, 0, N_CHAR+1);
  30. memset(m_position_cum, 0, N+1);
  31. memset(m_text_buf, 0, N+F-1);
  32. memset(m_text_buf, 0, N+F-1);
  33. }
  34. CLzariEx::~CLzariEx(void)
  35. {
  36. m_vtDestData.clear();
  37. }
  38. void CLzariEx::Release()
  39. {
  40. m_nInCur = 0;
  41. m_pSrcData = NULL;
  42. m_textsize = 0;
  43. m_codesize = 0;
  44. m_printcount = 0;
  45. m_low = 0;
  46. m_high = Q4;
  47. m_value = 0;
  48. m_shifts = 0;
  49. m_match_length = 0;
  50. m_match_position = 0;
  51. m_nDestLength = 0;
  52. m_nSrcLength = 0;
  53. m_buffer_putbit = 0;
  54. m_mask_putbit = 128;
  55. m_buffer_getbit = 0;
  56. m_mask_getbit = 0;
  57. memset(m_dad, 0, N+1);
  58. memset(m_lson, 0, N+1);
  59. memset(m_rson, 0, N+257);
  60. memset(m_char_to_sym, 0, N);
  61. memset(m_sym_to_char, 0, N_CHAR+1);
  62. memset(m_sym_freq, 0, N_CHAR+1);
  63. memset(m_sym_cum, 0, N_CHAR+1);
  64. memset(m_position_cum, 0, N+1);
  65. memset(m_text_buf, 0, N+F-1);
  66. memset(m_text_buf, 0, N+F-1);
  67. m_vtDestData.clear();
  68. }
  69. void CLzariEx::Error(char *message)
  70. {
  71. printf("\n%s\n", message);
  72. exit(EXIT_FAILURE);
  73. }
  74. void CLzariEx::PutBit(int bit)
  75. {
  76. if (bit)
  77. m_buffer_putbit |= m_mask_putbit;
  78. if ((m_mask_putbit >>= 1) == 0)
  79. {
  80. m_vtDestData.push_back(m_buffer_putbit);
  81. m_buffer_putbit = 0;
  82. m_mask_putbit = 128;
  83. m_codesize++;
  84. }
  85. }
  86. void CLzariEx::FlushBitBuffer()
  87. {
  88. for ( int i = 0; i < 7; i++ )
  89. PutBit(0);
  90. }
  91. int CLzariEx::GetBit(void)
  92. {
  93. if ((m_mask_getbit >>= 1) == 0)
  94. {
  95. if ( m_nInCur < m_nSrcLength )
  96. m_buffer_getbit = m_pSrcData[m_nInCur++];
  97. m_mask_getbit = 128;
  98. }
  99. return ((m_buffer_getbit & m_mask_getbit) != 0);
  100. }
  101. void CLzariEx::InitTree(void) /* Initialize trees */
  102. {
  103. int i;
  104. /* For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right and
  105. left children of node i. These nodes need not be initialized.
  106. Also, m_dad[i] is the parent of node i. These are initialized to
  107. NIL (= N), which stands for 'not used.'
  108. For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
  109. for strings that begin with character i. These are initialized
  110. to NIL. Note there are 256 trees. */
  111. for (i = N + 1; i <= N + 256; i++)
  112. m_rson[i] = NIL; /* root */
  113. for (i = 0; i < N; i++)
  114. m_dad[i] = NIL; /* node */
  115. }
  116. void CLzariEx::InsertNode(int r)
  117. /* Inserts string of length F, m_text_buf[r..r+F-1], into one of the
  118. trees (m_text_buf[r]'th tree) and returns the longest-match position
  119. and length via the global variables m_match_position and m_match_length.
  120. If m_match_length = F, then removes the old node in favor of the new
  121. one, because the old one will be deleted sooner.
  122. Note r plays double role, as tree node and position in buffer. */
  123. {
  124. int i, p, cmp, temp;
  125. unsigned char *key;
  126. cmp = 1;
  127. key = &m_text_buf[r];
  128. p = N + 1 + key[0];
  129. m_rson[r] = m_lson[r] = NIL;
  130. m_match_length = 0;
  131. for ( ; ; )
  132. {
  133. if (cmp >= 0)
  134. {
  135. if (m_rson[p] != NIL)
  136. p = m_rson[p];
  137. else
  138. {
  139. m_rson[p] = r;
  140. m_dad[r] = p;
  141. return;
  142. }
  143. }
  144. else
  145. {
  146. if (m_lson[p] != NIL)
  147. p = m_lson[p];
  148. else
  149. {
  150. m_lson[p] = r;
  151. m_dad[r] = p;
  152. return;
  153. }
  154. }
  155. for (i = 1; i < F; i++)
  156. if ((cmp = key[i] - m_text_buf[p + i]) != 0)
  157. break;
  158. if (i > THRESHOLD)
  159. {
  160. if (i > m_match_length)
  161. {
  162. m_match_position = (r - p) & (N - 1);
  163. if ((m_match_length = i) >= F)
  164. break;
  165. }
  166. else if (i == m_match_length)
  167. {
  168. if ((temp = (r - p) & (N - 1)) < m_match_position)
  169. m_match_position = temp;
  170. }
  171. }
  172. }
  173. m_dad[r] = m_dad[p];
  174. m_lson[r] = m_lson[p];
  175. m_rson[r] = m_rson[p];
  176. m_dad[m_lson[p]] = r;
  177. m_dad[m_rson[p]] = r;
  178. if (m_rson[m_dad[p]] == p)
  179. m_rson[m_dad[p]] = r;
  180. else
  181. m_lson[m_dad[p]] = r;
  182. m_dad[p] = NIL; /* remove p */
  183. }
  184. void CLzariEx::DeleteNode(int p) /* Delete node p from tree */
  185. {
  186. int q;
  187. if (m_dad[p] == NIL)
  188. return; /* not in tree */
  189. if (m_rson[p] == NIL)
  190. q = m_lson[p];
  191. else if (m_lson[p] == NIL)
  192. q = m_rson[p];
  193. else
  194. {
  195. q = m_lson[p];
  196. if (m_rson[q] != NIL)
  197. {
  198. do
  199. {
  200. q = m_rson[q];
  201. } while (m_rson[q] != NIL);
  202. m_rson[m_dad[q]] = m_lson[q];
  203. m_dad[m_lson[q]] = m_dad[q];
  204. m_lson[q] = m_lson[p];
  205. m_dad[m_lson[p]] = q;
  206. }
  207. m_rson[q] = m_rson[p];
  208. m_dad[m_rson[p]] = q;
  209. }
  210. m_dad[q] = m_dad[p];
  211. if (m_rson[m_dad[p]] == p)
  212. m_rson[m_dad[p]] = q;
  213. else
  214. m_lson[m_dad[p]] = q;
  215. m_dad[p] = NIL;
  216. }
  217. void CLzariEx::StartModel(void) /* Initialize model */
  218. {
  219. int ch, sym, i;
  220. m_sym_cum[N_CHAR] = 0;
  221. for (sym = N_CHAR; sym >= 1; sym--)
  222. {
  223. ch = sym - 1;
  224. m_char_to_sym[ch] = sym;
  225. m_sym_to_char[sym] = ch;
  226. m_sym_freq[sym] = 1;
  227. m_sym_cum[sym - 1] = m_sym_cum[sym] + m_sym_freq[sym];
  228. }
  229. m_sym_freq[0] = 0; /* sentinel (!= m_sym_freq[1]) */
  230. m_position_cum[N] = 0;
  231. for (i = N; i >= 1; i--)
  232. m_position_cum[i - 1] = m_position_cum[i] + 10000 / (i + 200);
  233. /* empirical distribution function (quite tentative) */
  234. /* Please devise a better mechanism! */
  235. }
  236. void CLzariEx::UpdateModel(int sym)
  237. {
  238. int i, c, ch_i, ch_sym;
  239. if (m_sym_cum[0] >= MAX_CUM)
  240. {
  241. c = 0;
  242. for (i = N_CHAR; i > 0; i--)
  243. {
  244. m_sym_cum[i] = c;
  245. c += (m_sym_freq[i] = (m_sym_freq[i] + 1) >> 1);
  246. }
  247. m_sym_cum[0] = c;
  248. }
  249. for (i = sym; m_sym_freq[i] == m_sym_freq[i - 1]; i--)
  250. ;
  251. if (i < sym)
  252. {
  253. ch_i = m_sym_to_char[i];
  254. ch_sym = m_sym_to_char[sym];
  255. m_sym_to_char[i] = ch_sym;
  256. m_sym_to_char[sym] = ch_i;
  257. m_char_to_sym[ch_i] = sym;
  258. m_char_to_sym[ch_sym] = i;
  259. }
  260. m_sym_freq[i]++;
  261. while (--i >= 0)
  262. m_sym_cum[i]++;
  263. }
  264. void CLzariEx::Output(int bit) /* Output 1 bit, followed by its complements */
  265. {
  266. PutBit(bit);
  267. for ( ; m_shifts > 0; m_shifts--)
  268. PutBit(! bit);
  269. }
  270. void CLzariEx::EncodeChar(int ch)
  271. {
  272. int sym;
  273. unsigned long int range;
  274. sym = m_char_to_sym[ch];
  275. range = m_high - m_low;
  276. m_high = m_low + (range * m_sym_cum[sym - 1]) / m_sym_cum[0];
  277. m_low += (range * m_sym_cum[sym ]) / m_sym_cum[0];
  278. for ( ; ; )
  279. {
  280. if (m_high <= Q2)
  281. Output(0);
  282. else if (m_low >= Q2)
  283. {
  284. Output(1);
  285. m_low -= Q2;
  286. m_high -= Q2;
  287. }
  288. else if (m_low >= Q1 && m_high <= Q3)
  289. {
  290. m_shifts++;
  291. m_low -= Q1;
  292. m_high -= Q1;
  293. }
  294. else
  295. break;
  296. m_low += m_low;
  297. m_high += m_high;
  298. }
  299. UpdateModel(sym);
  300. }
  301. void CLzariEx::EncodePosition(int position)
  302. {
  303. unsigned long int range;
  304. range = m_high - m_low;
  305. m_high = m_low + (range * m_position_cum[position ]) / m_position_cum[0];
  306. m_low += (range * m_position_cum[position + 1]) / m_position_cum[0];
  307. for ( ; ; )
  308. {
  309. if (m_high <= Q2)
  310. Output(0);
  311. else if (m_low >= Q2)
  312. {
  313. Output(1);
  314. m_low -= Q2;
  315. m_high -= Q2;
  316. }
  317. else if (m_low >= Q1 && m_high <= Q3)
  318. {
  319. m_shifts++;
  320. m_low -= Q1;
  321. m_high -= Q1;
  322. } else break;
  323. m_low += m_low;
  324. m_high += m_high;
  325. }
  326. }
  327. void CLzariEx::EncodeEnd(void)
  328. {
  329. m_shifts++;
  330. if (m_low < Q1)
  331. Output(0);
  332. else
  333. Output(1);
  334. FlushBitBuffer(); /* flush bits remaining in buffer */
  335. }
  336. int CLzariEx::BinarySearchSym(unsigned int x)
  337. /* 1 if x >= m_sym_cum[1],
  338. N_CHAR if m_sym_cum[N_CHAR] > x,
  339. i such that m_sym_cum[i - 1] > x >= m_sym_cum[i] otherwise */
  340. {
  341. int i, j, k;
  342. i = 1; j = N_CHAR;
  343. while (i < j)
  344. {
  345. k = (i + j) / 2;
  346. if (m_sym_cum[k] > x)
  347. i = k + 1;
  348. else
  349. j = k;
  350. }
  351. return i;
  352. }
  353. int CLzariEx::BinarySearchPos(unsigned int x)
  354. /* 0 if x >= m_position_cum[1],
  355. N - 1 if m_position_cum[N] > x,
  356. i such that m_position_cum[i] > x >= m_position_cum[i + 1] otherwise */
  357. {
  358. int i, j, k;
  359. i = 1; j = N;
  360. while (i < j)
  361. {
  362. k = (i + j) / 2;
  363. if (m_position_cum[k] > x)
  364. i = k + 1;
  365. else
  366. j = k;
  367. }
  368. return i - 1;
  369. }
  370. void CLzariEx::StartDecode(void)
  371. {
  372. int i;
  373. for (i = 0; i < M + 2; i++)
  374. m_value = 2 * m_value + GetBit();
  375. }
  376. int CLzariEx::DecodeChar(void)
  377. {
  378. int sym, ch;
  379. unsigned long int range;
  380. range = m_high - m_low;
  381. sym = BinarySearchSym((unsigned int)(((m_value - m_low + 1) * m_sym_cum[0] - 1) / range));
  382. m_high = m_low + (range * m_sym_cum[sym - 1]) / m_sym_cum[0];
  383. m_low += (range * m_sym_cum[sym ]) / m_sym_cum[0];
  384. for ( ; ; )
  385. {
  386. if (m_low >= Q2)
  387. {
  388. m_value -= Q2;
  389. m_low -= Q2;
  390. m_high -= Q2;
  391. }
  392. else if (m_low >= Q1 && m_high <= Q3)
  393. {
  394. m_value -= Q1;
  395. m_low -= Q1;
  396. m_high -= Q1;
  397. }
  398. else if (m_high > Q2)
  399. break;
  400. m_low += m_low;
  401. m_high += m_high;
  402. m_value = 2 * m_value + GetBit();
  403. }
  404. ch = m_sym_to_char[sym];
  405. UpdateModel(sym);
  406. return ch;
  407. }
  408. int CLzariEx::DecodePosition(void)
  409. {
  410. int position;
  411. unsigned long int range;
  412. range = m_high - m_low;
  413. position = BinarySearchPos((unsigned int)(((m_value - m_low + 1) * m_position_cum[0] - 1) / range));
  414. m_high = m_low + (range * m_position_cum[position ]) / m_position_cum[0];
  415. m_low += (range * m_position_cum[position + 1]) / m_position_cum[0];
  416. for ( ; ; )
  417. {
  418. if (m_low >= Q2) {
  419. m_value -= Q2;
  420. m_low -= Q2;
  421. m_high -= Q2;
  422. }
  423. else if (m_low >= Q1 && m_high <= Q3)
  424. {
  425. m_value -= Q1;
  426. m_low -= Q1;
  427. m_high -= Q1;
  428. }
  429. else if (m_high > Q2)
  430. break;
  431. m_low += m_low;
  432. m_high += m_high;
  433. m_value = 2 * m_value + GetBit();
  434. }
  435. return position;
  436. }
  437. /********** Encode and Decode **********/
  438. void CLzariEx::Encode(void)
  439. {
  440. int i, c, len, r, s, last_match_length;
  441. m_textsize = m_nSrcLength;
  442. m_vtDestData.resize(sizeof(m_textsize));
  443. memcpy(&m_vtDestData[0], &m_textsize, sizeof(m_textsize));
  444. m_codesize += sizeof(m_textsize);
  445. if (m_textsize == 0)
  446. return;
  447. m_nInCur = 0;
  448. m_textsize = 0;
  449. StartModel();
  450. InitTree();
  451. s = 0; r = N - F;
  452. for (i = s; i < r; i++)
  453. m_text_buf[i] = ' ';
  454. for (len = 0; len < F && ( m_nInCur < m_nSrcLength); len++)
  455. {
  456. c = m_pSrcData[m_nInCur++];
  457. m_text_buf[r + len] = c;
  458. }
  459. m_textsize = len;
  460. for (i = 1; i <= F; i++)
  461. InsertNode(r - i);
  462. InsertNode(r);
  463. do
  464. {
  465. if (m_match_length > len) m_match_length = len;
  466. if (m_match_length <= THRESHOLD)
  467. {
  468. m_match_length = 1;
  469. EncodeChar(m_text_buf[r]);
  470. }
  471. else
  472. {
  473. EncodeChar(255 - THRESHOLD + m_match_length);
  474. EncodePosition(m_match_position - 1);
  475. }
  476. last_match_length = m_match_length;
  477. for (i = 0; i < last_match_length && (m_nInCur < m_nSrcLength) ; i++)
  478. {
  479. c = m_pSrcData[m_nInCur++];
  480. DeleteNode(s);
  481. m_text_buf[s] = c;
  482. if (s < F - 1) m_text_buf[s + N] = c;
  483. s = (s + 1) & (N - 1);
  484. r = (r + 1) & (N - 1);
  485. InsertNode(r);
  486. }
  487. if ((m_textsize += i) > m_printcount)
  488. {
  489. printf("%12ld\r", m_textsize);
  490. m_printcount += 1024;
  491. }
  492. while (i++ < last_match_length)
  493. {
  494. DeleteNode(s);
  495. s = (s + 1) & (N - 1);
  496. r = (r + 1) & (N - 1);
  497. if (--len) InsertNode(r);
  498. }
  499. } while (len > 0);
  500. EncodeEnd();
  501. printf("In : %lu bytes\n", m_textsize);
  502. printf("Out: %lu bytes\n", m_codesize);
  503. printf("Out/In: %.3f\n", (double)m_codesize / m_textsize);
  504. }
  505. void CLzariEx::Decode(void)
  506. {
  507. int i, j, k, r, c;
  508. unsigned long int count;
  509. if ( m_nSrcLength < sizeof(m_textsize) )
  510. Error("Read Error"); /* read size of text */
  511. memcpy(&m_textsize, m_pSrcData + m_nInCur, sizeof(m_textsize));
  512. m_nDestLength = m_textsize;
  513. m_nInCur += sizeof(m_textsize);
  514. if (m_textsize == 0)
  515. return;
  516. StartDecode();
  517. StartModel();
  518. for (i = 0; i < N - F; i++)
  519. m_text_buf[i] = ' ';
  520. r = N - F;
  521. for (count = 0; count < m_textsize; )
  522. {
  523. c = DecodeChar();
  524. if (c < 256)
  525. {
  526. m_vtDestData.push_back(c);
  527. m_text_buf[r++] = c;
  528. r &= (N - 1);
  529. count++;
  530. }
  531. else
  532. {
  533. i = (r - DecodePosition() - 1) & (N - 1);
  534. j = c - 255 + THRESHOLD;
  535. for (k = 0; k < j; k++)
  536. {
  537. c = m_text_buf[(i + k) & (N - 1)];
  538. m_vtDestData.push_back(c);
  539. m_text_buf[r++] = c;
  540. r &= (N - 1);
  541. count++;
  542. }
  543. }
  544. if (count > m_printcount)
  545. {
  546. printf("%12lu\r", count);
  547. m_printcount += 1024;
  548. }
  549. }
  550. printf("%12lu\n", count);
  551. }
  552. void CLzariEx::Compress(const BYTE *pInBuffer,int nInLength,const BYTE *&pOutBuffer ,int &nOutLength)
  553. {
  554. m_pSrcData = pInBuffer;
  555. m_nSrcLength = nInLength;
  556. m_nInCur = 0;
  557. Encode();
  558. pOutBuffer = &m_vtDestData[0];
  559. nOutLength = m_vtDestData.size();
  560. }
  561. void CLzariEx::UnCompress(const BYTE *pInBuffer,int nInLength,const BYTE *&pOutBuffer ,int &nOutLength)
  562. {
  563. m_pSrcData = pInBuffer;
  564. m_nSrcLength = nInLength;
  565. m_nInCur = 0;
  566. TRACE("%d,%d,%d\t\n", m_nSrcLength,m_nInCur,m_textsize);
  567. Decode();
  568. pOutBuffer = &m_vtDestData[0];
  569. nOutLength = m_vtDestData.size();
  570. //m_vtDestData.push_back(0);
  571. TRACE("%d,%d,%d\t\n", m_nSrcLength,m_nInCur,m_textsize);
  572. }