lzari.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727
  1. /**************************************************************
  2. LZARI.C -- A Data Compression Program
  3. (tab = 4 spaces)
  4. ***************************************************************
  5. 4/7/1989 Haruhiko Okumura
  6. Use, distribute, and modify this program freely.
  7. Please send me your improved versions.
  8. PC-VAN SCIENCE
  9. NIFTY-Serve PAF01022
  10. CompuServe 74050,1022
  11. **************************************************************/
  12. /********************************************************************
  13. lzari.cpp -- A Data Compression Class
  14. created: 2004/10/04
  15. created: 4:10:2004 16:44
  16. file base: lzari
  17. file ext: cpp
  18. author: 阙荣文 (querw@sina.com)
  19. purpose: 如上所述,lzari.c提供了lzari压缩算法的实现,基于lzari.c我把它
  20. 做成了一个c++类方便使用
  21. *********************************************************************/
  22. #include "stdafx.h"
  23. #include "Lzari.h"
  24. LZARI::LZARI()
  25. {
  26. infile = NULL;
  27. outfile = NULL;
  28. textsize = 0;
  29. codesize = 0;
  30. printcount = 0;
  31. low = 0;
  32. high = Q4;
  33. value = 0;
  34. shifts = 0;/* counts for magnifying low and high around Q2 */
  35. m_bMem = FALSE;
  36. m_pInBuffer = NULL;
  37. m_nInLength = 0;
  38. m_nInCur = 0;
  39. //m_pOutBuffer = NULL;
  40. m_nOutLength = 0;
  41. // m_nOutCur = 0;
  42. buffer_putbit = 0;
  43. mask_putbit = 128;
  44. buffer_getbit = 0;
  45. mask_getbit = 0;
  46. }
  47. LZARI::~LZARI()
  48. {
  49. Release();
  50. }
  51. void LZARI::Error(char *message)
  52. {
  53. #ifdef _OUTPUT_STATUS
  54. printf("\n%s\n", message);
  55. #endif
  56. //exit(EXIT_FAILURE);
  57. int e = 1;
  58. throw e;
  59. }
  60. void LZARI::PutBit(int bit) /* Output one bit (bit = 0,1) */
  61. {
  62. if (bit) buffer_putbit |= mask_putbit;
  63. if ((mask_putbit >>= 1) == 0)
  64. {
  65. if (!m_bMem)
  66. {
  67. if (putc(buffer_putbit, outfile) == EOF) Error("Write Error");
  68. }
  69. else
  70. {
  71. //if (m_nOutCur == m_nOutLength) Error("Write Error");
  72. //m_pOutBuffer[m_nOutCur++] = buffer;
  73. m_OutBuffer.push_back(buffer_putbit);
  74. //m_nOutCur++;
  75. }
  76. buffer_putbit = 0;
  77. mask_putbit = 128;
  78. codesize++;
  79. }
  80. }
  81. void LZARI::FlushBitBuffer(void) /* Send remaining bits */
  82. {
  83. for (int i = 0; i < 7; i++)
  84. PutBit(0);
  85. }
  86. int LZARI::GetBit(void) /* Get one bit (0 or 1) */
  87. {
  88. //ASSERT(infile);
  89. //ASSERT(m_nInLength >= m_nInCur + 1);
  90. if ((mask_getbit >>= 1) == 0)
  91. {
  92. if (!m_bMem)
  93. buffer_getbit = getc(infile);
  94. else if ( m_nInCur < m_nInLength)
  95. buffer_getbit = m_pInBuffer[m_nInCur++];
  96. mask_getbit = 128;
  97. }
  98. return ((buffer_getbit & mask_getbit) != 0);
  99. }
  100. /********** LZSS with multiple binary trees **********/
  101. void LZARI::InitTree(void) /* Initialize trees */
  102. {
  103. int i;
  104. /* For i = 0 to N - 1, rson[i] and lson[i] will be the right and
  105. left children of node i. These nodes need not be initialized.
  106. Also, dad[i] is the parent of node i. These are initialized to
  107. NIL (= N), which stands for 'not used.'
  108. For i = 0 to 255, rson[N + i + 1] is the root of the tree
  109. for strings that begin with character i. These are initialized
  110. to NIL. Note there are 256 trees. */
  111. for (i = N + 1; i <= N + 256; i++) rson[i] = NIL; /* root */
  112. for (i = 0; i < N; i++) dad[i] = NIL; /* node */
  113. }
  114. void LZARI::InsertNode(int r)
  115. /* Inserts string of length F, text_buf[r..r+F-1], into one of the
  116. trees (text_buf[r]'th tree) and returns the longest-match position
  117. and length via the global variables match_position and match_length.
  118. If match_length = F, then removes the old node in favor of the new
  119. one, because the old one will be deleted sooner.
  120. Note r plays double role, as tree node and position in buffer. */
  121. {
  122. int i, p, cmp, temp;
  123. unsigned char *key;
  124. cmp = 1; key = &text_buf[r]; p = N + 1 + key[0];
  125. rson[r] = lson[r] = NIL; match_length = 0;
  126. for ( ; ; )
  127. {
  128. if (cmp >= 0)
  129. {
  130. if (rson[p] != NIL) p = rson[p];
  131. else
  132. {
  133. rson[p] = r;
  134. dad[r] = p;
  135. return;
  136. }
  137. }
  138. else
  139. {
  140. if (lson[p] != NIL) p = lson[p];
  141. else
  142. {
  143. lson[p] = r;
  144. dad[r] = p;
  145. return;
  146. }
  147. }
  148. for (i = 1; i < F; i++)
  149. if ((cmp = key[i] - text_buf[p + i]) != 0) break;
  150. if (i > THRESHOLD)
  151. {
  152. if (i > match_length)
  153. {
  154. match_position = (r - p) & (N - 1);
  155. if ((match_length = i) >= F) break;
  156. } else if (i == match_length)
  157. {
  158. if ((temp = (r - p) & (N - 1)) < match_position)
  159. match_position = temp;
  160. }
  161. }
  162. }
  163. dad[r] = dad[p]; lson[r] = lson[p]; rson[r] = rson[p];
  164. dad[lson[p]] = r; dad[rson[p]] = r;
  165. if (rson[dad[p]] == p) rson[dad[p]] = r;
  166. else lson[dad[p]] = r;
  167. dad[p] = NIL; /* remove p */
  168. }
  169. void LZARI::DeleteNode(int p) /* Delete node p from tree */
  170. {
  171. int q;
  172. if (dad[p] == NIL) return; /* not in tree */
  173. if (rson[p] == NIL) q = lson[p];
  174. else if (lson[p] == NIL) q = rson[p];
  175. else
  176. {
  177. q = lson[p];
  178. if (rson[q] != NIL)
  179. {
  180. do { q = rson[q]; } while (rson[q] != NIL);
  181. rson[dad[q]] = lson[q]; dad[lson[q]] = dad[q];
  182. lson[q] = lson[p]; dad[lson[p]] = q;
  183. }
  184. rson[q] = rson[p]; dad[rson[p]] = q;
  185. }
  186. dad[q] = dad[p];
  187. if (rson[dad[p]] == p) rson[dad[p]] = q;
  188. else lson[dad[p]] = q;
  189. dad[p] = NIL;
  190. }
  191. /********** Arithmetic Compression **********/
  192. /* If you are not familiar with arithmetic compression, you should read
  193. I. E. Witten, R. M. Neal, and J. G. Cleary,
  194. Communications of the ACM, Vol. 30, pp. 520-540 (1987),
  195. from which much have been borrowed. */
  196. /* character code = 0, 1, ..., N_CHAR - 1 */
  197. void LZARI::StartModel(void) /* Initialize model */
  198. {
  199. int ch, sym, i;
  200. sym_cum[N_CHAR] = 0;
  201. for (sym = N_CHAR; sym >= 1; sym--)
  202. {
  203. ch = sym - 1;
  204. char_to_sym[ch] = sym; sym_to_char[sym] = ch;
  205. sym_freq[sym] = 1;
  206. sym_cum[sym - 1] = sym_cum[sym] + sym_freq[sym];
  207. }
  208. sym_freq[0] = 0; /* sentinel (!= sym_freq[1]) */
  209. position_cum[N] = 0;
  210. for (i = N; i >= 1; i--)
  211. position_cum[i - 1] = position_cum[i] + 10000 / (i + 200);
  212. /* empirical distribution function (quite tentative) */
  213. /* Please devise a better mechanism! */
  214. }
  215. void LZARI::UpdateModel(int sym)
  216. {
  217. int i, c, ch_i, ch_sym;
  218. if (sym_cum[0] >= MAX_CUM)
  219. {
  220. c = 0;
  221. for (i = N_CHAR; i > 0; i--)
  222. {
  223. sym_cum[i] = c;
  224. c += (sym_freq[i] = (sym_freq[i] + 1) >> 1);
  225. }
  226. sym_cum[0] = c;
  227. }
  228. for (i = sym; sym_freq[i] == sym_freq[i - 1]; i--) ;
  229. if (i < sym)
  230. {
  231. ch_i = sym_to_char[i]; ch_sym = sym_to_char[sym];
  232. sym_to_char[i] = ch_sym; sym_to_char[sym] = ch_i;
  233. char_to_sym[ch_i] = sym; char_to_sym[ch_sym] = i;
  234. }
  235. sym_freq[i]++;
  236. while (--i >= 0) sym_cum[i]++;
  237. }
  238. void LZARI::Output(int bit) /* Output 1 bit, followed by its complements */
  239. {
  240. PutBit(bit);
  241. for ( ; shifts > 0; shifts--)
  242. PutBit(! bit);
  243. }
  244. void LZARI::EncodeChar(int ch)
  245. {
  246. int sym;
  247. unsigned long int range;
  248. sym = char_to_sym[ch];
  249. range = high - low;
  250. high = low + (range * sym_cum[sym - 1]) / sym_cum[0];
  251. low += (range * sym_cum[sym ]) / sym_cum[0];
  252. for ( ; ; )
  253. {
  254. if (high <= Q2) Output(0);
  255. else if (low >= Q2)
  256. {
  257. Output(1); low -= Q2; high -= Q2;
  258. }
  259. else if (low >= Q1 && high <= Q3)
  260. {
  261. shifts++; low -= Q1; high -= Q1;
  262. }
  263. else break;
  264. low += low;
  265. high += high;
  266. }
  267. UpdateModel(sym);
  268. }
  269. void LZARI::EncodePosition(int position)
  270. {
  271. unsigned long int range;
  272. range = high - low;
  273. high = low + (range * position_cum[position]) / position_cum[0];
  274. low += (range * position_cum[position + 1]) / position_cum[0];
  275. for ( ; ; )
  276. {
  277. if (high <= Q2) Output(0);
  278. else if (low >= Q2)
  279. {
  280. Output(1); low -= Q2; high -= Q2;
  281. }
  282. else if (low >= Q1 && high <= Q3)
  283. {
  284. shifts++; low -= Q1; high -= Q1;
  285. }
  286. else break;
  287. low += low;
  288. high += high;
  289. }
  290. }
  291. void LZARI::EncodeEnd(void)
  292. {
  293. shifts++;
  294. if (low < Q1)
  295. Output(0);
  296. else
  297. Output(1);
  298. FlushBitBuffer(); /* flush bits remaining in buffer */
  299. }
  300. int LZARI::BinarySearchSym(unsigned int x)
  301. /* 1 if x >= sym_cum[1],
  302. N_CHAR if sym_cum[N_CHAR] > x,
  303. i such that sym_cum[i - 1] > x >= sym_cum[i] otherwise */
  304. {
  305. int i, j, k;
  306. i = 1; j = N_CHAR;
  307. while (i < j)
  308. {
  309. k = (i + j) / 2;
  310. if (sym_cum[k] > x) i = k + 1; else j = k;
  311. }
  312. return i;
  313. }
  314. int LZARI::BinarySearchPos(unsigned int x)
  315. /* 0 if x >= position_cum[1],
  316. N - 1 if position_cum[N] > x,
  317. i such that position_cum[i] > x >= position_cum[i + 1] otherwise */
  318. {
  319. int i, j, k;
  320. i = 1; j = N;
  321. while (i < j)
  322. {
  323. k = (i + j) / 2;
  324. if (position_cum[k] > x) i = k + 1; else j = k;
  325. }
  326. return i - 1;
  327. }
  328. void LZARI::StartDecode(void)
  329. {
  330. int i;
  331. for (i = 0; i < M + 2; i++)
  332. value = 2 * value + GetBit();
  333. }
  334. int LZARI::DecodeChar(void)
  335. {
  336. int sym, ch;
  337. unsigned long int range;
  338. range = high - low;
  339. sym = BinarySearchSym((unsigned int)
  340. (((value - low + 1) * sym_cum[0] - 1) / range));
  341. high = low + (range * sym_cum[sym - 1]) / sym_cum[0];
  342. low += (range * sym_cum[sym ]) / sym_cum[0];
  343. for ( ; ; )
  344. {
  345. if (low >= Q2)
  346. {
  347. value -= Q2; low -= Q2; high -= Q2;
  348. }
  349. else if (low >= Q1 && high <= Q3)
  350. {
  351. value -= Q1; low -= Q1; high -= Q1;
  352. }
  353. else if (high > Q2) break;
  354. low += low;
  355. high += high;
  356. value = 2 * value + GetBit();
  357. }
  358. ch = sym_to_char[sym];
  359. UpdateModel(sym);
  360. return ch;
  361. }
  362. int LZARI::DecodePosition(void) // releae下出问题了;
  363. {
  364. int position;
  365. unsigned long int range;
  366. range = high - low;
  367. position = BinarySearchPos((unsigned int)
  368. (((value - low + 1) * position_cum[0] - 1) / range));
  369. high = low + (range * position_cum[position ]) / position_cum[0];
  370. low += (range * position_cum[position + 1]) / position_cum[0];
  371. for ( ; ; )
  372. {
  373. if (low >= Q2)
  374. {
  375. value -= Q2; low -= Q2; high -= Q2;
  376. }
  377. else if (low >= Q1 && high <= Q3)
  378. {
  379. value -= Q1; low -= Q1; high -= Q1;
  380. }
  381. else if (high > Q2)
  382. break;
  383. low += low;
  384. high += high;
  385. if ( m_nInCur + 1 == m_nInLength )
  386. {
  387. AfxMessageBox(_T("dddd"));
  388. int sss = 0;
  389. }
  390. int kkd = GetBit();
  391. value = 2 * value + kkd;
  392. }
  393. return position;
  394. }
  395. /********** Encode and Decode **********/
  396. void LZARI::Encode(void)
  397. {
  398. int i, c, len, r, s, last_match_length;
  399. if(!m_bMem)
  400. {
  401. fseek(infile, 0L, SEEK_END);
  402. textsize = ftell(infile);
  403. if (fwrite(&textsize, sizeof textsize, 1, outfile) < 1)
  404. Error("Write Error"); /* output size of text */
  405. codesize += sizeof textsize;
  406. if (textsize == 0) return;
  407. rewind(infile);
  408. textsize = 0;
  409. }
  410. else
  411. {
  412. textsize = m_nInLength;
  413. m_OutBuffer.resize(sizeof textsize);
  414. memcpy(&m_OutBuffer[0],&textsize,sizeof textsize);
  415. //m_nOutCur += sizeof textsize;
  416. codesize += sizeof textsize;
  417. if(textsize == 0) return;
  418. m_nInCur = 0;
  419. textsize = 0;
  420. }
  421. StartModel();
  422. InitTree();
  423. s = 0; r = N - F;
  424. for (i = s; i < r; i++) text_buf[i] = ' ';
  425. if(!m_bMem)
  426. for (len = 0; len < F && (c = getc(infile)) != EOF; len++) text_buf[r + len] = c;
  427. else
  428. for (len = 0; len < F && m_nInCur < m_nInLength ; len++)
  429. {
  430. c = m_pInBuffer[m_nInCur++];
  431. text_buf[r + len] = c;
  432. }
  433. textsize = len;
  434. for (i = 1; i <= F; i++)
  435. InsertNode(r - i);
  436. InsertNode(r);
  437. do {
  438. if (match_length > len) match_length = len;
  439. if (match_length <= THRESHOLD)
  440. {
  441. match_length = 1;
  442. EncodeChar(text_buf[r]);
  443. }
  444. else
  445. {
  446. EncodeChar(255 - THRESHOLD + match_length);
  447. EncodePosition(match_position - 1);
  448. }
  449. last_match_length = match_length;
  450. if(!m_bMem)
  451. {
  452. for (i = 0; i < last_match_length && (c = getc(infile)) != EOF; i++)
  453. {
  454. DeleteNode(s); text_buf[s] = c;
  455. if (s < F - 1) text_buf[s + N] = c;
  456. s = (s + 1) & (N - 1);
  457. r = (r + 1) & (N - 1);
  458. InsertNode(r);
  459. }
  460. }
  461. else
  462. {
  463. for (i = 0; i < last_match_length && m_nInCur < m_nInLength ; i++)
  464. {
  465. c = m_pInBuffer[m_nInCur++];
  466. DeleteNode(s);
  467. text_buf[s] = c;
  468. if (s < F - 1) text_buf[s + N] = c;
  469. s = (s + 1) & (N - 1);
  470. r = (r + 1) & (N - 1);
  471. InsertNode(r);
  472. }
  473. }
  474. if ((textsize += i) > printcount)
  475. {
  476. #ifdef _OUTPUT_STATUS
  477. printf("%12ld\r", textsize);
  478. #endif
  479. printcount += 1024;
  480. }
  481. while (i++ < last_match_length)
  482. {
  483. DeleteNode(s);
  484. s = (s + 1) & (N - 1);
  485. r = (r + 1) & (N - 1);
  486. if (--len) InsertNode(r);
  487. }
  488. } while (len > 0);
  489. EncodeEnd();
  490. #ifdef _OUTPUT_STATUS
  491. printf("In : %lu bytes\n", textsize);
  492. printf("Out: %lu bytes\n", codesize);
  493. printf("Out/In: %.3f\n", (double)codesize / textsize);
  494. #endif
  495. }
  496. void LZARI::Decode(void)
  497. {
  498. int i, j, k, r, c;
  499. unsigned long int count;
  500. if (!m_bMem)
  501. {
  502. if (fread(&textsize, sizeof textsize, 1, infile) < 1)
  503. Error("Read Error"); /* read size of text */
  504. }
  505. else
  506. {
  507. if(m_nInLength < sizeof textsize)
  508. Error("Read Error");
  509. memcpy(&textsize,m_pInBuffer + m_nInCur,sizeof textsize);
  510. //m_OutBuffer.reserve(textsize);
  511. m_nOutLength = textsize;
  512. //m_nOutCur = 0;
  513. m_nInCur += sizeof textsize;
  514. }
  515. if (textsize == 0) return;
  516. StartDecode();
  517. StartModel();
  518. for (i = 0; i < N - F; i++) text_buf[i] = ' ';
  519. r = N - F;
  520. for (count = 0; count < textsize; )
  521. {
  522. c = DecodeChar();
  523. if (c < 256)
  524. {
  525. if(!m_bMem)
  526. putc(c, outfile);
  527. else
  528. {
  529. //m_OutBuffer[m_nOutCur++] = c;
  530. m_OutBuffer.push_back(c);
  531. //m_nOutCur++;
  532. }
  533. text_buf[r++] = c;
  534. r &= (N - 1);
  535. count++;
  536. }
  537. else
  538. {
  539. i = (r - DecodePosition() - 1) & (N - 1);
  540. j = c - 255 + THRESHOLD;
  541. for (k = 0; k < j; k++)
  542. {
  543. c = text_buf[(i + k) & (N - 1)];
  544. if(!m_bMem)
  545. putc(c, outfile);
  546. else
  547. {
  548. // m_pOutBuffer[m_nOutCur++] = c;
  549. m_OutBuffer.push_back(c);
  550. //m_nOutCur ++;
  551. }
  552. text_buf[r++] = c;
  553. r &= (N - 1);
  554. count++;
  555. }
  556. }
  557. if (count > printcount)
  558. {
  559. #ifdef _OUTPUT_STATUS
  560. printf("%12lu\r", count);
  561. #endif
  562. printcount += 1024;
  563. }
  564. }
  565. #ifdef _OUTPUT_STATUS
  566. printf("%12lu\n", count);
  567. #endif
  568. }
  569. void LZARI::Compress(const char *lpszInfile,const char *lpszOutfile)
  570. {
  571. m_bMem = FALSE;
  572. infile = fopen(lpszInfile,"rb");
  573. outfile = fopen(lpszOutfile,"wb");
  574. if(infile && outfile)
  575. {
  576. Encode();
  577. fclose(infile);
  578. fclose(outfile);
  579. infile = NULL;
  580. outfile = NULL;
  581. }
  582. }
  583. void LZARI::UnCompress(const char *lpszInfile,const char *lpszOutfile)
  584. {
  585. m_bMem = FALSE;
  586. infile = fopen(lpszInfile,"rb");
  587. outfile = fopen(lpszOutfile,"wb");
  588. if(infile && outfile)
  589. {
  590. Decode();
  591. fclose(infile);
  592. fclose(outfile);
  593. infile = NULL;
  594. outfile = NULL;
  595. }
  596. }
  597. void LZARI::Compress(const BYTE *pInBuffer,int nInLength,const BYTE *&pOutBuffer ,int &nOutLength)
  598. {
  599. m_pInBuffer = pInBuffer;
  600. m_nInLength = nInLength;
  601. m_nInCur = 0;
  602. // m_nOutCur = 0;
  603. m_bMem = TRUE;
  604. Encode();
  605. pOutBuffer = &m_OutBuffer[0];
  606. nOutLength = m_OutBuffer.size();
  607. }
  608. void LZARI::Release()
  609. {
  610. OutputDebugString(_T("析构FILE\n"));
  611. if(!m_OutBuffer.empty())
  612. {
  613. infile = NULL;
  614. outfile = NULL;
  615. textsize = 0;
  616. codesize = 0;
  617. printcount = 0;
  618. low = 0;
  619. high = Q4;
  620. value = 0;
  621. shifts = 0;
  622. m_bMem = FALSE;
  623. m_pInBuffer = NULL;
  624. m_nInLength = 0;
  625. m_nInCur = 0;
  626. m_OutBuffer.clear();
  627. m_nOutLength = 0;
  628. buffer_putbit = 0;
  629. mask_putbit = 128;
  630. buffer_getbit = 0;
  631. mask_getbit = 0;
  632. }
  633. }
  634. void LZARI::UnCompress(const BYTE *pInBuffer,int nInLength,const BYTE *&pOutBuffer ,int &nOutLength)
  635. {
  636. m_pInBuffer = pInBuffer;
  637. m_nInLength = nInLength;
  638. m_nInCur = 0;
  639. m_bMem = TRUE;
  640. Decode();
  641. pOutBuffer = &m_OutBuffer[0];
  642. nOutLength = m_OutBuffer.size();
  643. m_OutBuffer.push_back(0);
  644. }