lzari.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706
  1. /**************************************************************
  2. LZARI.C -- A Data Compression Program
  3. (tab = 4 spaces)
  4. ***************************************************************
  5. 4/7/1989 Haruhiko Okumura
  6. Use, distribute, and modify this program freely.
  7. Please send me your improved versions.
  8. PC-VAN SCIENCE
  9. NIFTY-Serve PAF01022
  10. CompuServe 74050,1022
  11. **************************************************************/
  12. /********************************************************************
  13. lzari.cpp -- A Data Compression Class
  14. created: 2004/10/04
  15. created: 4:10:2004 16:44
  16. file base: lzari
  17. file ext: cpp
  18. author: 阙荣文 (querw@sina.com)
  19. purpose: 如上所述,lzari.c提供了lzari压缩算法的实现,基于lzari.c我把它
  20. 做成了一个c++类方便使用
  21. *********************************************************************/
  22. #include "stdafx.h"
  23. //#include <stdio.h>
  24. //#include <stdlib.h>
  25. //#include <string.h>
  26. //#include <ctype.h>
  27. #include "Lzari.h"
  28. LZARI::LZARI()
  29. {
  30. infile = NULL;
  31. outfile = NULL;
  32. textsize = 0;
  33. codesize = 0;
  34. printcount = 0;
  35. low = 0;
  36. high = Q4;
  37. value = 0;
  38. shifts = 0;/* counts for magnifying low and high around Q2 */
  39. m_bMem = FALSE;
  40. m_pInBuffer = NULL;
  41. m_nInLength = 0;
  42. m_nInCur = 0;
  43. //m_pOutBuffer = NULL;
  44. m_nOutLength = 0;
  45. // m_nOutCur = 0;
  46. buffer_putbit = 0;
  47. mask_putbit = 128;
  48. buffer_getbit = 0;
  49. mask_getbit = 0;
  50. }
  51. LZARI::~LZARI()
  52. {
  53. Release();
  54. }
  55. void LZARI::Error(char *message)
  56. {
  57. #ifdef _OUTPUT_STATUS
  58. printf("\n%s\n", message);
  59. #endif
  60. //exit(EXIT_FAILURE);
  61. int e = 1;
  62. throw e;
  63. }
  64. void LZARI::PutBit(int bit) /* Output one bit (bit = 0,1) */
  65. {
  66. if (bit) buffer_putbit |= mask_putbit;
  67. if ((mask_putbit >>= 1) == 0)
  68. {
  69. if (!m_bMem)
  70. {
  71. if (putc(buffer_putbit, outfile) == EOF) Error("Write Error");
  72. }
  73. else
  74. {
  75. //if (m_nOutCur == m_nOutLength) Error("Write Error");
  76. //m_pOutBuffer[m_nOutCur++] = buffer;
  77. m_OutBuffer.push_back(buffer_putbit);
  78. //m_nOutCur++;
  79. }
  80. buffer_putbit = 0;
  81. mask_putbit = 128;
  82. codesize++;
  83. }
  84. }
  85. void LZARI::FlushBitBuffer(void) /* Send remaining bits */
  86. {
  87. int i;
  88. for (i = 0; i < 7; i++) PutBit(0);
  89. }
  90. int LZARI::GetBit(void) /* Get one bit (0 or 1) */
  91. {
  92. if ((mask_getbit >>= 1) == 0)
  93. {
  94. if (!m_bMem)
  95. buffer_getbit = getc(infile);
  96. else
  97. buffer_getbit = m_pInBuffer[m_nInCur++];
  98. mask_getbit = 128;
  99. }
  100. return ((buffer_getbit & mask_getbit) != 0);
  101. }
  102. /********** LZSS with multiple binary trees **********/
  103. void LZARI::InitTree(void) /* Initialize trees */
  104. {
  105. int i;
  106. /* For i = 0 to N - 1, rson[i] and lson[i] will be the right and
  107. left children of node i. These nodes need not be initialized.
  108. Also, dad[i] is the parent of node i. These are initialized to
  109. NIL (= N), which stands for 'not used.'
  110. For i = 0 to 255, rson[N + i + 1] is the root of the tree
  111. for strings that begin with character i. These are initialized
  112. to NIL. Note there are 256 trees. */
  113. for (i = N + 1; i <= N + 256; i++) rson[i] = NIL; /* root */
  114. for (i = 0; i < N; i++) dad[i] = NIL; /* node */
  115. }
  116. void LZARI::InsertNode(int r)
  117. /* Inserts string of length F, text_buf[r..r+F-1], into one of the
  118. trees (text_buf[r]'th tree) and returns the longest-match position
  119. and length via the global variables match_position and match_length.
  120. If match_length = F, then removes the old node in favor of the new
  121. one, because the old one will be deleted sooner.
  122. Note r plays double role, as tree node and position in buffer. */
  123. {
  124. int i, p, cmp, temp;
  125. unsigned char *key;
  126. cmp = 1; key = &text_buf[r]; p = N + 1 + key[0];
  127. rson[r] = lson[r] = NIL; match_length = 0;
  128. for (;;)
  129. {
  130. if (cmp >= 0)
  131. {
  132. if (rson[p] != NIL) p = rson[p];
  133. else {
  134. rson[p] = r; dad[r] = p; return;
  135. }
  136. }
  137. else
  138. {
  139. if (lson[p] != NIL) p = lson[p];
  140. else {
  141. lson[p] = r; dad[r] = p; return;
  142. }
  143. }
  144. for (i = 1; i < F; i++)
  145. if ((cmp = key[i] - text_buf[p + i]) != 0) break;
  146. if (i > THRESHOLD)
  147. {
  148. if (i > match_length)
  149. {
  150. match_position = (r - p) & (N - 1);
  151. if ((match_length = i) >= F) break;
  152. }
  153. else if (i == match_length)
  154. {
  155. if ((temp = (r - p) & (N - 1)) < match_position)
  156. match_position = temp;
  157. }
  158. }
  159. }
  160. dad[r] = dad[p]; lson[r] = lson[p]; rson[r] = rson[p];
  161. dad[lson[p]] = r; dad[rson[p]] = r;
  162. if (rson[dad[p]] == p) rson[dad[p]] = r;
  163. else lson[dad[p]] = r;
  164. dad[p] = NIL; /* remove p */
  165. }
  166. void LZARI::DeleteNode(int p) /* Delete node p from tree */
  167. {
  168. int q;
  169. if (dad[p] == NIL) return; /* not in tree */
  170. if (rson[p] == NIL) q = lson[p];
  171. else if (lson[p] == NIL) q = rson[p];
  172. else
  173. {
  174. q = lson[p];
  175. if (rson[q] != NIL)
  176. {
  177. do {
  178. q = rson[q];
  179. } while (rson[q] != NIL);
  180. rson[dad[q]] = lson[q]; dad[lson[q]] = dad[q];
  181. lson[q] = lson[p]; dad[lson[p]] = q;
  182. }
  183. rson[q] = rson[p]; dad[rson[p]] = q;
  184. }
  185. dad[q] = dad[p];
  186. if (rson[dad[p]] == p) rson[dad[p]] = q;
  187. else lson[dad[p]] = q;
  188. dad[p] = NIL;
  189. }
  190. /********** Arithmetic Compression **********/
  191. /* If you are not familiar with arithmetic compression, you should read
  192. I. E. Witten, R. M. Neal, and J. G. Cleary,
  193. Communications of the ACM, Vol. 30, pp. 520-540 (1987),
  194. from which much have been borrowed. */
  195. /* character code = 0, 1, ..., N_CHAR - 1 */
  196. void LZARI::StartModel(void) /* Initialize model */
  197. {
  198. int ch, sym, i;
  199. sym_cum[N_CHAR] = 0;
  200. for (sym = N_CHAR; sym >= 1; sym--)
  201. {
  202. ch = sym - 1;
  203. char_to_sym[ch] = sym; sym_to_char[sym] = ch;
  204. sym_freq[sym] = 1;
  205. sym_cum[sym - 1] = sym_cum[sym] + sym_freq[sym];
  206. }
  207. sym_freq[0] = 0; /* sentinel (!= sym_freq[1]) */
  208. position_cum[N] = 0;
  209. for (i = N; i >= 1; i--)
  210. position_cum[i - 1] = position_cum[i] + 10000 / (i + 200);
  211. /* empirical distribution function (quite tentative) */
  212. /* Please devise a better mechanism! */
  213. }
  214. void LZARI::UpdateModel(int sym)
  215. {
  216. int i, c, ch_i, ch_sym;
  217. if (sym_cum[0] >= MAX_CUM)
  218. {
  219. c = 0;
  220. for (i = N_CHAR; i > 0; i--)
  221. {
  222. sym_cum[i] = c;
  223. c += (sym_freq[i] = (sym_freq[i] + 1) >> 1);
  224. }
  225. sym_cum[0] = c;
  226. }
  227. for (i = sym; sym_freq[i] == sym_freq[i - 1]; i--);
  228. if (i < sym)
  229. {
  230. ch_i = sym_to_char[i]; ch_sym = sym_to_char[sym];
  231. sym_to_char[i] = ch_sym; sym_to_char[sym] = ch_i;
  232. char_to_sym[ch_i] = sym; char_to_sym[ch_sym] = i;
  233. }
  234. sym_freq[i]++;
  235. while (--i >= 0) sym_cum[i]++;
  236. }
  237. void LZARI::Output(int bit) /* Output 1 bit, followed by its complements */
  238. {
  239. PutBit(bit);
  240. for (; shifts > 0; shifts--) PutBit(!bit);
  241. }
  242. void LZARI::EncodeChar(int ch)
  243. {
  244. int sym;
  245. unsigned long int range;
  246. sym = char_to_sym[ch];
  247. range = high - low;
  248. high = low + (range * sym_cum[sym - 1]) / sym_cum[0];
  249. low += (range * sym_cum[sym]) / sym_cum[0];
  250. for (;;)
  251. {
  252. if (high <= Q2) Output(0);
  253. else if (low >= Q2)
  254. {
  255. Output(1); low -= Q2; high -= Q2;
  256. }
  257. else if (low >= Q1 && high <= Q3)
  258. {
  259. shifts++; low -= Q1; high -= Q1;
  260. }
  261. else break;
  262. low += low;
  263. high += high;
  264. }
  265. UpdateModel(sym);
  266. }
  267. void LZARI::EncodePosition(int position)
  268. {
  269. unsigned long int range;
  270. range = high - low;
  271. high = low + (range * position_cum[position]) / position_cum[0];
  272. low += (range * position_cum[position + 1]) / position_cum[0];
  273. for (;;)
  274. {
  275. if (high <= Q2) Output(0);
  276. else if (low >= Q2)
  277. {
  278. Output(1); low -= Q2; high -= Q2;
  279. }
  280. else if (low >= Q1 && high <= Q3)
  281. {
  282. shifts++; low -= Q1; high -= Q1;
  283. }
  284. else break;
  285. low += low;
  286. high += high;
  287. }
  288. }
  289. void LZARI::EncodeEnd(void)
  290. {
  291. shifts++;
  292. if (low < Q1) Output(0); else Output(1);
  293. FlushBitBuffer(); /* flush bits remaining in buffer */
  294. }
  295. int LZARI::BinarySearchSym(unsigned int x)
  296. /* 1 if x >= sym_cum[1],
  297. N_CHAR if sym_cum[N_CHAR] > x,
  298. i such that sym_cum[i - 1] > x >= sym_cum[i] otherwise */
  299. {
  300. int i, j, k;
  301. i = 1; j = N_CHAR;
  302. while (i < j)
  303. {
  304. k = (i + j) / 2;
  305. if (sym_cum[k] > x) i = k + 1; else j = k;
  306. }
  307. return i;
  308. }
  309. int LZARI::BinarySearchPos(unsigned int x)
  310. /* 0 if x >= position_cum[1],
  311. N - 1 if position_cum[N] > x,
  312. i such that position_cum[i] > x >= position_cum[i + 1] otherwise */
  313. {
  314. int i, j, k;
  315. i = 1; j = N;
  316. while (i < j)
  317. {
  318. k = (i + j) / 2;
  319. if (position_cum[k] > x) i = k + 1; else j = k;
  320. }
  321. return i - 1;
  322. }
  323. void LZARI::StartDecode(void)
  324. {
  325. int i;
  326. for (i = 0; i < M + 2; i++)
  327. value = 2 * value + GetBit();
  328. }
  329. int LZARI::DecodeChar(void)
  330. {
  331. int sym, ch;
  332. unsigned long int range;
  333. range = high - low;
  334. sym = BinarySearchSym((unsigned int)
  335. (((value - low + 1) * sym_cum[0] - 1) / range));
  336. high = low + (range * sym_cum[sym - 1]) / sym_cum[0];
  337. low += (range * sym_cum[sym]) / sym_cum[0];
  338. for (;;) {
  339. if (low >= Q2) {
  340. value -= Q2; low -= Q2; high -= Q2;
  341. }
  342. else if (low >= Q1 && high <= Q3) {
  343. value -= Q1; low -= Q1; high -= Q1;
  344. }
  345. else if (high > Q2) break;
  346. low += low; high += high;
  347. value = 2 * value + GetBit();
  348. }
  349. ch = sym_to_char[sym];
  350. UpdateModel(sym);
  351. return ch;
  352. }
  353. int LZARI::DecodePosition(void)
  354. {
  355. int position;
  356. unsigned long int range;
  357. range = high - low;
  358. position = BinarySearchPos((unsigned int)
  359. (((value - low + 1) * position_cum[0] - 1) / range));
  360. high = low + (range * position_cum[position]) / position_cum[0];
  361. low += (range * position_cum[position + 1]) / position_cum[0];
  362. for (;;) {
  363. if (low >= Q2) {
  364. value -= Q2; low -= Q2; high -= Q2;
  365. }
  366. else if (low >= Q1 && high <= Q3) {
  367. value -= Q1; low -= Q1; high -= Q1;
  368. }
  369. else if (high > Q2) break;
  370. low += low; high += high;
  371. value = 2 * value + GetBit();
  372. }
  373. return position;
  374. }
  375. /********** Encode and Decode **********/
  376. void LZARI::Encode(void)
  377. {
  378. int i, c, len, r, s, last_match_length;
  379. if (!m_bMem)
  380. {
  381. fseek(infile, 0L, SEEK_END);
  382. textsize = ftell(infile);
  383. if (fwrite(&textsize, sizeof textsize, 1, outfile) < 1)
  384. Error("Write Error"); /* output size of text */
  385. codesize += sizeof textsize;
  386. if (textsize == 0) return;
  387. rewind(infile);
  388. textsize = 0;
  389. }
  390. else
  391. {
  392. textsize = m_nInLength;
  393. m_OutBuffer.resize(sizeof textsize);
  394. memcpy(&m_OutBuffer[0], &textsize, sizeof textsize);
  395. //m_nOutCur += sizeof textsize;
  396. codesize += sizeof textsize;
  397. if (textsize == 0) return;
  398. m_nInCur = 0;
  399. textsize = 0;
  400. }
  401. StartModel();
  402. InitTree();
  403. s = 0; r = N - F;
  404. for (i = s; i < r; i++) text_buf[i] = ' ';
  405. if (!m_bMem)
  406. for (len = 0; len < F && (c = getc(infile)) != EOF; len++) text_buf[r + len] = c;
  407. else
  408. for (len = 0; len < F && m_nInCur < m_nInLength; len++)
  409. {
  410. c = m_pInBuffer[m_nInCur++];
  411. text_buf[r + len] = c;
  412. }
  413. textsize = len;
  414. for (i = 1; i <= F; i++) InsertNode(r - i);
  415. InsertNode(r);
  416. do {
  417. if (match_length > len) match_length = len;
  418. if (match_length <= THRESHOLD)
  419. {
  420. match_length = 1; EncodeChar(text_buf[r]);
  421. }
  422. else
  423. {
  424. EncodeChar(255 - THRESHOLD + match_length);
  425. EncodePosition(match_position - 1);
  426. }
  427. last_match_length = match_length;
  428. if (!m_bMem)
  429. {
  430. for (i = 0; i < last_match_length && (c = getc(infile)) != EOF; i++)
  431. {
  432. DeleteNode(s); text_buf[s] = c;
  433. if (s < F - 1) text_buf[s + N] = c;
  434. s = (s + 1) & (N - 1);
  435. r = (r + 1) & (N - 1);
  436. InsertNode(r);
  437. }
  438. }
  439. else
  440. {
  441. for (i = 0; i < last_match_length && m_nInCur < m_nInLength; i++)
  442. {
  443. c = m_pInBuffer[m_nInCur++];
  444. DeleteNode(s);
  445. text_buf[s] = c;
  446. if (s < F - 1) text_buf[s + N] = c;
  447. s = (s + 1) & (N - 1);
  448. r = (r + 1) & (N - 1);
  449. InsertNode(r);
  450. }
  451. }
  452. if ((textsize += i) > printcount)
  453. {
  454. #ifdef _OUTPUT_STATUS
  455. printf("%12ld\r", textsize);
  456. #endif
  457. printcount += 1024;
  458. }
  459. while (i++ < last_match_length)
  460. {
  461. DeleteNode(s);
  462. s = (s + 1) & (N - 1);
  463. r = (r + 1) & (N - 1);
  464. if (--len) InsertNode(r);
  465. }
  466. } while (len > 0);
  467. EncodeEnd();
  468. #ifdef _OUTPUT_STATUS
  469. printf("In : %lu bytes\n", textsize);
  470. printf("Out: %lu bytes\n", codesize);
  471. printf("Out/In: %.3f\n", (double)codesize / textsize);
  472. #endif
  473. }
  474. void LZARI::Decode(void)
  475. {
  476. int i, j, k, r, c;
  477. unsigned long int count;
  478. if (!m_bMem)
  479. {
  480. if (fread(&textsize, sizeof textsize, 1, infile) < 1)
  481. Error("Read Error"); /* read size of text */
  482. }
  483. else
  484. {
  485. if (m_nInLength < sizeof textsize)
  486. Error("Read Error");
  487. memcpy(&textsize, m_pInBuffer + m_nInCur, sizeof textsize);
  488. //m_OutBuffer.reserve(textsize);
  489. m_nOutLength = textsize;
  490. //m_nOutCur = 0;
  491. m_nInCur += sizeof textsize;
  492. }
  493. if (textsize == 0) return;
  494. StartDecode();
  495. StartModel();
  496. for (i = 0; i < N - F; i++) text_buf[i] = ' ';
  497. r = N - F;
  498. for (count = 0; count < textsize;)
  499. {
  500. c = DecodeChar();
  501. if (c < 256)
  502. {
  503. if (!m_bMem)
  504. putc(c, outfile);
  505. else
  506. {
  507. //m_OutBuffer[m_nOutCur++] = c;
  508. m_OutBuffer.push_back(c);
  509. //m_nOutCur++;
  510. }
  511. text_buf[r++] = c;
  512. r &= (N - 1);
  513. count++;
  514. }
  515. else
  516. {
  517. i = (r - DecodePosition() - 1) & (N - 1);
  518. j = c - 255 + THRESHOLD;
  519. for (k = 0; k < j; k++)
  520. {
  521. c = text_buf[(i + k) & (N - 1)];
  522. if (!m_bMem)
  523. putc(c, outfile);
  524. else
  525. {
  526. // m_pOutBuffer[m_nOutCur++] = c;
  527. m_OutBuffer.push_back(c);
  528. //m_nOutCur ++;
  529. }
  530. text_buf[r++] = c;
  531. r &= (N - 1);
  532. count++;
  533. }
  534. }
  535. if (count > printcount)
  536. {
  537. #ifdef _OUTPUT_STATUS
  538. printf("%12lu\r", count);
  539. #endif
  540. printcount += 1024;
  541. }
  542. }
  543. #ifdef _OUTPUT_STATUS
  544. printf("%12lu\n", count);
  545. #endif
  546. }
  547. void LZARI::Compress(const char *lpszInfile, const char *lpszOutfile)
  548. {
  549. m_bMem = FALSE;
  550. fopen_s(&infile, lpszInfile, "rb");
  551. fopen_s(&outfile, lpszOutfile, "wb");
  552. if (infile && outfile)
  553. {
  554. Encode();
  555. fclose(infile);
  556. fclose(outfile);
  557. infile = NULL;
  558. outfile = NULL;
  559. }
  560. }
  561. void LZARI::UnCompress(const char *lpszInfile, const char *lpszOutfile)
  562. {
  563. m_bMem = FALSE;
  564. fopen_s(&infile, lpszInfile, "rb");
  565. fopen_s(&outfile, lpszOutfile, "wb");
  566. if (infile && outfile)
  567. {
  568. Decode();
  569. fclose(infile);
  570. fclose(outfile);
  571. infile = NULL;
  572. outfile = NULL;
  573. }
  574. }
  575. void LZARI::Compress(const BYTE *pInBuffer, int nInLength, const BYTE *&pOutBuffer, int &nOutLength)
  576. {
  577. m_pInBuffer = pInBuffer;
  578. m_nInLength = nInLength;
  579. m_nInCur = 0;
  580. // m_nOutCur = 0;
  581. m_bMem = TRUE;
  582. Encode();
  583. pOutBuffer = &m_OutBuffer[0];
  584. nOutLength = m_OutBuffer.size();
  585. }
  586. void LZARI::Release()
  587. {
  588. if (!m_OutBuffer.empty())
  589. {
  590. infile = NULL;
  591. outfile = NULL;
  592. textsize = 0;
  593. codesize = 0;
  594. printcount = 0;
  595. low = 0;
  596. high = Q4;
  597. value = 0;
  598. shifts = 0;
  599. m_bMem = FALSE;
  600. m_pInBuffer = NULL;
  601. m_nInLength = 0;
  602. m_nInCur = 0;
  603. m_OutBuffer.clear();
  604. m_nOutLength = 0;
  605. buffer_putbit = 0;
  606. mask_putbit = 128;
  607. buffer_getbit = 0;
  608. mask_getbit = 0;
  609. }
  610. }
  611. void LZARI::UnCompress(const BYTE *pInBuffer, int nInLength, const BYTE *&pOutBuffer, int &nOutLength)
  612. {
  613. m_pInBuffer = pInBuffer;
  614. m_nInLength = nInLength;
  615. m_nInCur = 0;
  616. m_bMem = TRUE;
  617. Decode();
  618. pOutBuffer = &m_OutBuffer[0];
  619. nOutLength = m_OutBuffer.size();
  620. m_OutBuffer.push_back(0);
  621. }