deflate.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. /* deflate.c - deflate/inflate code for gzip and friends
  2. *
  3. * Copyright 2014 Rob Landley <rob@landley.net>
  4. *
  5. * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
  6. * LSB 4.1 has gzip, gunzip, and zcat
  7. *
  8. * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
  9. */
  10. #include "toys.h"
  11. struct deflate {
  12. // Huffman codes: base offset and extra bits tables (length and distance)
  13. char lenbits[29], distbits[30];
  14. unsigned short lenbase[29], distbase[30];
  15. void *fixdisthuff, *fixlithuff;
  16. // CRC
  17. void (*crcfunc)(struct deflate *dd, char *data, int len);
  18. unsigned crctable[256], crc;
  19. // Tables only used for deflation
  20. unsigned short *hashhead, *hashchain;
  21. // Compressed data buffer (extra space malloced at end)
  22. unsigned pos, len;
  23. int infd, outfd;
  24. char data[];
  25. };
  26. // little endian bit buffer
  27. struct bitbuf {
  28. int fd, bitpos, len, max;
  29. char buf[];
  30. };
  31. // malloc a struct bitbuf
  32. static struct bitbuf *bitbuf_init(int fd, int size)
  33. {
  34. struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
  35. bb->max = size;
  36. bb->fd = fd;
  37. return bb;
  38. }
  39. // Advance bitpos without the overhead of recording bits
  40. // Loads more data when input buffer empty
  41. static void bitbuf_skip(struct bitbuf *bb, int bits)
  42. {
  43. int pos = bb->bitpos + bits, len = bb->len << 3;
  44. while (pos >= len) {
  45. pos -= len;
  46. len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
  47. if (bb->len < 1) perror_exit("inflate EOF");
  48. }
  49. bb->bitpos = pos;
  50. }
  51. // Optimized single bit inlined version
  52. static inline int bitbuf_bit(struct bitbuf *bb)
  53. {
  54. int bufpos = bb->bitpos>>3;
  55. if (bufpos == bb->len) {
  56. bitbuf_skip(bb, 0);
  57. bufpos = 0;
  58. }
  59. return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
  60. }
  61. // Fetch the next X bits from the bitbuf, little endian
  62. static unsigned bitbuf_get(struct bitbuf *bb, int bits)
  63. {
  64. int result = 0, offset = 0;
  65. while (bits) {
  66. int click = bb->bitpos >> 3, blow, blen;
  67. // Load more data if buffer empty
  68. if (click == bb->len) bitbuf_skip(bb, click = 0);
  69. // grab bits from next byte
  70. blow = bb->bitpos & 7;
  71. blen = 8-blow;
  72. if (blen > bits) blen = bits;
  73. result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
  74. offset += blen;
  75. bits -= blen;
  76. bb->bitpos += blen;
  77. }
  78. return result;
  79. }
  80. static void bitbuf_flush(struct bitbuf *bb)
  81. {
  82. if (!bb->bitpos) return;
  83. xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
  84. memset(bb->buf, 0, bb->max);
  85. bb->bitpos = 0;
  86. }
  87. static void bitbuf_put(struct bitbuf *bb, int data, int len)
  88. {
  89. while (len) {
  90. int click = bb->bitpos >> 3, blow, blen;
  91. // Flush buffer if necessary
  92. if (click == bb->max) {
  93. bitbuf_flush(bb);
  94. click = 0;
  95. }
  96. blow = bb->bitpos & 7;
  97. blen = 8-blow;
  98. if (blen > len) blen = len;
  99. bb->buf[click] |= data << blow;
  100. bb->bitpos += blen;
  101. data >>= blen;
  102. len -= blen;
  103. }
  104. }
  105. static void output_byte(struct deflate *dd, char sym)
  106. {
  107. int pos = dd->pos++ & 32767;
  108. dd->data[pos] = sym;
  109. if (pos == 32767) {
  110. xwrite(dd->outfd, dd->data, 32768);
  111. if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
  112. }
  113. }
  114. // Huffman coding uses bits to traverse a binary tree to a leaf node,
  115. // By placing frequently occurring symbols at shorter paths, frequently
  116. // used symbols may be represented in fewer bits than uncommon symbols.
  117. // (length[0] isn't used but code's clearer if it's there.)
  118. struct huff {
  119. unsigned short length[16]; // How many symbols have this bit length?
  120. unsigned short symbol[288]; // sorted by bit length, then ascending order
  121. };
  122. // Create simple huffman tree from array of bit lengths.
  123. // The symbols in the huffman trees are sorted (first by bit length
  124. // of the code to reach them, then by symbol number). This means that given
  125. // the bit length of each symbol, we can construct a unique tree.
  126. static void len2huff(struct huff *huff, char bitlen[], int len)
  127. {
  128. int offset[16];
  129. int i;
  130. // Count number of codes at each bit length
  131. memset(huff, 0, sizeof(struct huff));
  132. for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
  133. // Sort symbols by bit length, then symbol. Get list of starting positions
  134. // for each group, then write each symbol to next position within its group.
  135. *huff->length = *offset = 0;
  136. for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
  137. for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
  138. }
  139. // Fetch and decode next huffman coded symbol from bitbuf.
  140. // This takes advantage of the sorting to navigate the tree as an array:
  141. // each time we fetch a bit we have all the codes at that bit level in
  142. // order with no gaps.
  143. static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
  144. {
  145. unsigned short *length = huff->length;
  146. int start = 0, offset = 0;
  147. // Traverse through the bit lengths until our code is in this range
  148. for (;;) {
  149. offset = (offset << 1) | bitbuf_bit(bb);
  150. start += *++length;
  151. if ((offset -= *length) < 0) break;
  152. if ((length - huff->length) & 16) error_exit("bad symbol");
  153. }
  154. return huff->symbol[start + offset];
  155. }
  156. // Decompress deflated data from bitbuf to dd->outfd.
  157. static void inflate(struct deflate *dd, struct bitbuf *bb)
  158. {
  159. dd->crc = ~0;
  160. // repeat until spanked
  161. for (;;) {
  162. int final, type;
  163. final = bitbuf_get(bb, 1);
  164. type = bitbuf_get(bb, 2);
  165. if (type == 3) error_exit("bad type");
  166. // Uncompressed block?
  167. if (!type) {
  168. int len, nlen;
  169. // Align to byte, read length
  170. bitbuf_skip(bb, (8-bb->bitpos)&7);
  171. len = bitbuf_get(bb, 16);
  172. nlen = bitbuf_get(bb, 16);
  173. if (len != (0xffff & ~nlen)) error_exit("bad len");
  174. // Dump literal output data
  175. while (len) {
  176. int pos = bb->bitpos >> 3, bblen = bb->len - pos;
  177. char *p = bb->buf+pos;
  178. // dump bytes until done or end of current bitbuf contents
  179. if (bblen > len) bblen = len;
  180. pos = bblen;
  181. while (pos--) output_byte(dd, *(p++));
  182. bitbuf_skip(bb, bblen << 3);
  183. len -= bblen;
  184. }
  185. // Compressed block
  186. } else {
  187. struct huff *disthuff, *lithuff;
  188. // Dynamic huffman codes?
  189. if (type == 2) {
  190. struct huff *h2 = ((struct huff *)libbuf)+1;
  191. int i, litlen, distlen, hufflen;
  192. char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
  193. "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
  194. // The huffman trees are stored as a series of bit lengths
  195. litlen = bitbuf_get(bb, 5)+257; // max 288
  196. distlen = bitbuf_get(bb, 5)+1; // max 32
  197. hufflen = bitbuf_get(bb, 4)+4; // max 19
  198. // The literal and distance codes are themselves compressed, in
  199. // a complicated way: an array of bit lengths (hufflen many
  200. // entries, each 3 bits) is used to fill out an array of 19 entries
  201. // in a magic order, leaving the rest 0. Then make a tree out of it:
  202. memset(bits = libbuf+1, 0, 19);
  203. for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
  204. len2huff(h2, bits, 19);
  205. // Use that tree to read in the literal and distance bit lengths
  206. for (i = 0; i < litlen + distlen;) {
  207. int sym = huff_and_puff(bb, h2);
  208. // 0-15 are literals, 16 = repeat previous code 3-6 times,
  209. // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
  210. if (sym < 16) bits[i++] = sym;
  211. else {
  212. int len = sym & 2;
  213. len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
  214. memset(bits+i, bits[i-1] * !(sym&3), len);
  215. i += len;
  216. }
  217. }
  218. if (i > litlen+distlen) error_exit("bad tree");
  219. len2huff(lithuff = h2, bits, litlen);
  220. len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
  221. // Static huffman codes
  222. } else {
  223. lithuff = dd->fixlithuff;
  224. disthuff = dd->fixdisthuff;
  225. }
  226. // Use huffman tables to decode block of compressed symbols
  227. for (;;) {
  228. int sym = huff_and_puff(bb, lithuff);
  229. // Literal?
  230. if (sym < 256) output_byte(dd, sym);
  231. // Copy range?
  232. else if (sym > 256) {
  233. int len, dist;
  234. sym -= 257;
  235. len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
  236. sym = huff_and_puff(bb, disthuff);
  237. dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
  238. sym = dd->pos & 32767;
  239. while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
  240. // End of block
  241. } else break;
  242. }
  243. }
  244. // Was that the last block?
  245. if (final) break;
  246. }
  247. if (dd->pos & 32767) {
  248. xwrite(dd->outfd, dd->data, dd->pos&32767);
  249. if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
  250. }
  251. }
  252. // Deflate from dd->infd to bitbuf
  253. // For deflate, dd->len = input read, dd->pos = input consumed
  254. static void deflate(struct deflate *dd, struct bitbuf *bb)
  255. {
  256. char *data = dd->data;
  257. int len, final = 0;
  258. dd->crc = ~0;
  259. while (!final) {
  260. // Read next half-window of data if we haven't hit EOF yet.
  261. len = readall(dd->infd, data+(dd->len&32768), 32768);
  262. if (len < 0) perror_exit("read"); // todo: add filename
  263. if (len != 32768) final++;
  264. if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
  265. // dd->len += len; crcfunc advances len TODO
  266. // store block as literal
  267. bitbuf_put(bb, final, 1);
  268. bitbuf_put(bb, 0, 1);
  269. bitbuf_put(bb, 0, (8-bb->bitpos)&7);
  270. bitbuf_put(bb, len, 16);
  271. bitbuf_put(bb, 0xffff & ~len, 16);
  272. // repeat until spanked
  273. while (dd->pos != dd->len) {
  274. unsigned pos = dd->pos&65535;
  275. bitbuf_put(bb, data[pos], 8);
  276. // need to refill buffer?
  277. if (!(32767 & ++dd->pos) && !final) break;
  278. }
  279. }
  280. bitbuf_flush(bb);
  281. }
  282. // Allocate memory for deflate/inflate.
  283. static struct deflate *init_deflate(int compress)
  284. {
  285. int i, n = 1;
  286. struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
  287. memset(dd, 0, sizeof(struct deflate));
  288. // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
  289. if (compress) {
  290. dd->hashhead = (unsigned short *)(dd->data+65536);
  291. dd->hashchain = (unsigned short *)(dd->data+65536+32768);
  292. }
  293. // Calculate lenbits, lenbase, distbits, distbase
  294. *dd->lenbase = 3;
  295. for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
  296. if (i>4) {
  297. if (!(i&3)) {
  298. dd->lenbits[i]++;
  299. n <<= 1;
  300. }
  301. if (i == 27) n--;
  302. else dd->lenbits[i+1] = dd->lenbits[i];
  303. }
  304. dd->lenbase[i+1] = n + dd->lenbase[i];
  305. }
  306. n = 0;
  307. for (i = 0; i<sizeof(dd->distbits); i++) {
  308. dd->distbase[i] = 1<<n;
  309. if (i) dd->distbase[i] += dd->distbase[i-1];
  310. if (i>3 && !(i&1)) n++;
  311. dd->distbits[i] = n;
  312. }
  313. // TODO layout and lifetime of this?
  314. // Init fixed huffman tables
  315. for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
  316. len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
  317. memset(libbuf, 5, 30);
  318. len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
  319. return dd;
  320. }
  321. // Return true/false whether we consumed a gzip header.
  322. static int is_gzip(struct bitbuf *bb)
  323. {
  324. int flags;
  325. // Confirm signature
  326. if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
  327. return 0;
  328. bitbuf_skip(bb, 6*8);
  329. // Skip extra, name, comment, header CRC fields
  330. if (flags & 4) bitbuf_skip(bb, bitbuf_get(bb, 16) * 8);
  331. if (flags & 8) while (bitbuf_get(bb, 8));
  332. if (flags & 16) while (bitbuf_get(bb, 8));
  333. if (flags & 2) bitbuf_skip(bb, 16);
  334. return 1;
  335. }
  336. static void gzip_crc(struct deflate *dd, char *data, int len)
  337. {
  338. int i;
  339. unsigned crc, *crc_table = dd->crctable;
  340. crc = dd->crc;
  341. for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
  342. dd->crc = crc;
  343. dd->len += len;
  344. }
  345. long long gzip_fd(int infd, int outfd)
  346. {
  347. struct bitbuf *bb = bitbuf_init(outfd, 4096);
  348. struct deflate *dd = init_deflate(1);
  349. long long rc;
  350. // Header from RFC 1952 section 2.2:
  351. // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
  352. // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
  353. // Operating System (FF=unknown)
  354. dd->infd = infd;
  355. xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
  356. // Little endian crc table
  357. crc_init(dd->crctable, 1);
  358. dd->crcfunc = gzip_crc;
  359. deflate(dd, bb);
  360. // tail: crc32, len32
  361. bitbuf_put(bb, 0, (8-bb->bitpos)&7);
  362. bitbuf_put(bb, ~dd->crc, 32);
  363. bitbuf_put(bb, dd->len, 32);
  364. rc = dd->len;
  365. bitbuf_flush(bb);
  366. free(bb);
  367. free(dd);
  368. return rc;
  369. }
  370. long long gunzip_fd(int infd, int outfd)
  371. {
  372. struct bitbuf *bb = bitbuf_init(infd, 4096);
  373. struct deflate *dd = init_deflate(0);
  374. long long rc;
  375. if (!is_gzip(bb)) error_exit("not gzip");
  376. dd->outfd = outfd;
  377. // Little endian crc table
  378. crc_init(dd->crctable, 1);
  379. dd->crcfunc = gzip_crc;
  380. inflate(dd, bb);
  381. // tail: crc32, len32
  382. bitbuf_skip(bb, (8-bb->bitpos)&7);
  383. if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
  384. error_exit("bad crc");
  385. rc = dd->len;
  386. free(bb);
  387. free(dd);
  388. return rc;
  389. }