// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package bzip2 implements bzip2 decompression. package bzip2 import "io" // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot // of guessing: https://en.wikipedia.org/wiki/Bzip2 // The source code to pyflate was useful for debugging: // http://www.paul.sladen.org/projects/pyflate // A StructuralError is returned when the bzip2 data is found to be // syntactically invalid. type StructuralError string func (s StructuralError) Error() string { return "bzip2 data invalid: " + string(s) } // A reader decompresses bzip2 compressed data. type reader struct { br bitReader fileCRC uint32 blockCRC uint32 wantBlockCRC uint32 setupDone bool // true if we have parsed the bzip2 header. blockSize int // blockSize in bytes, i.e. 900 * 1000. eof bool c [256]uint // the ``C'' array for the inverse BWT. tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. tPos uint32 // Index of the next output byte in tt. preRLE []uint32 // contains the RLE data still to be processed. preRLEUsed int // number of entries of preRLE used. lastByte int // the last byte value seen. byteRepeats uint // the number of repeats of lastByte seen. repeats uint // the number of copies of lastByte to output. } // NewReader returns an io.Reader which decompresses bzip2 data from r. // If r does not also implement io.ByteReader, // the decompressor may read more data than necessary from r. func NewReader(r io.Reader) io.Reader { bz2 := new(reader) bz2.br = newBitReader(r) return bz2 } const bzip2FileMagic = 0x425a // "BZ" const bzip2BlockMagic = 0x314159265359 const bzip2FinalMagic = 0x177245385090 // setup parses the bzip2 header. func (bz2 *reader) setup(needMagic bool) error { br := &bz2.br if needMagic { magic := br.ReadBits(16) if magic != bzip2FileMagic { return StructuralError("bad magic value") } } t := br.ReadBits(8) if t != 'h' { return StructuralError("non-Huffman entropy encoding") } level := br.ReadBits(8) if level < '1' || level > '9' { return StructuralError("invalid compression level") } bz2.fileCRC = 0 bz2.blockSize = 100 * 1000 * (level - '0') if bz2.blockSize > len(bz2.tt) { bz2.tt = make([]uint32, bz2.blockSize) } return nil } func (bz2 *reader) Read(buf []byte) (n int, err error) { if bz2.eof { return 0, io.EOF } if !bz2.setupDone { err = bz2.setup(true) brErr := bz2.br.Err() if brErr != nil { err = brErr } if err != nil { return 0, err } bz2.setupDone = true } n, err = bz2.read(buf) brErr := bz2.br.Err() if brErr != nil { err = brErr } return } func (bz2 *reader) readFromBlock(buf []byte) int { // bzip2 is a block based compressor, except that it has a run-length // preprocessing step. The block based nature means that we can // preallocate fixed-size buffers and reuse them. However, the RLE // preprocessing would require allocating huge buffers to store the // maximum expansion. Thus we process blocks all at once, except for // the RLE which we decompress as required. n := 0 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { // We have RLE data pending. // The run-length encoding works like this: // Any sequence of four equal bytes is followed by a length // byte which contains the number of repeats of that byte to // include. (The number of repeats can be zero.) Because we are // decompressing on-demand our state is kept in the reader // object. if bz2.repeats > 0 { buf[n] = byte(bz2.lastByte) n++ bz2.repeats-- if bz2.repeats == 0 { bz2.lastByte = -1 } continue } bz2.tPos = bz2.preRLE[bz2.tPos] b := byte(bz2.tPos) bz2.tPos >>= 8 bz2.preRLEUsed++ if bz2.byteRepeats == 3 { bz2.repeats = uint(b) bz2.byteRepeats = 0 continue } if bz2.lastByte == int(b) { bz2.byteRepeats++ } else { bz2.byteRepeats = 0 } bz2.lastByte = int(b) buf[n] = b n++ } return n } func (bz2 *reader) read(buf []byte) (int, error) { for { n := bz2.readFromBlock(buf) if n > 0 || len(buf) == 0 { bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) return n, nil } // End of block. Check CRC. if bz2.blockCRC != bz2.wantBlockCRC { bz2.br.err = StructuralError("block checksum mismatch") return 0, bz2.br.err } // Find next block. br := &bz2.br switch br.ReadBits64(48) { default: return 0, StructuralError("bad magic value found") case bzip2BlockMagic: // Start of block. err := bz2.readBlock() if err != nil { return 0, err } case bzip2FinalMagic: // Check end-of-file CRC. wantFileCRC := uint32(br.ReadBits64(32)) if br.err != nil { return 0, br.err } if bz2.fileCRC != wantFileCRC { br.err = StructuralError("file checksum mismatch") return 0, br.err } // Skip ahead to byte boundary. // Is there a file concatenated to this one? // It would start with BZ. if br.bits%8 != 0 { br.ReadBits(br.bits % 8) } b, err := br.r.ReadByte() if err == io.EOF { br.err = io.EOF bz2.eof = true return 0, io.EOF } if err != nil { br.err = err return 0, err } z, err := br.r.ReadByte() if err != nil { if err == io.EOF { err = io.ErrUnexpectedEOF } br.err = err return 0, err } if b != 'B' || z != 'Z' { return 0, StructuralError("bad magic value in continuation file") } if err := bz2.setup(false); err != nil { return 0, err } } } } // readBlock reads a bzip2 block. The magic number should already have been consumed. func (bz2 *reader) readBlock() (err error) { br := &bz2.br bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. bz2.blockCRC = 0 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC randomized := br.ReadBits(1) if randomized != 0 { return StructuralError("deprecated randomized files") } origPtr := uint(br.ReadBits(24)) // If not every byte value is used in the block (i.e., it's text) then // the symbol set is reduced. The symbols used are stored as a // two-level, 16x16 bitmap. symbolRangeUsedBitmap := br.ReadBits(16) symbolPresent := make([]bool, 256) numSymbols := 0 for symRange := uint(0); symRange < 16; symRange++ { if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { bits := br.ReadBits(16) for symbol := uint(0); symbol < 16; symbol++ { if bits&(1<<(15-symbol)) != 0 { symbolPresent[16*symRange+symbol] = true numSymbols++ } } } } if numSymbols == 0 { // There must be an EOF symbol. return StructuralError("no symbols in input") } // A block uses between two and six different Huffman trees. numHuffmanTrees := br.ReadBits(3) if numHuffmanTrees < 2 || numHuffmanTrees > 6 { return StructuralError("invalid number of Huffman trees") } // The Huffman tree can switch every 50 symbols so there's a list of // tree indexes telling us which tree to use for each 50 symbol block. numSelectors := br.ReadBits(15) treeIndexes := make([]uint8, numSelectors) // The tree indexes are move-to-front transformed and stored as unary // numbers. mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) for i := range treeIndexes { c := 0 for { inc := br.ReadBits(1) if inc == 0 { break } c++ } if c >= numHuffmanTrees { return StructuralError("tree index too large") } treeIndexes[i] = mtfTreeDecoder.Decode(c) } // The list of symbols for the move-to-front transform is taken from // the previously decoded symbol bitmap. symbols := make([]byte, numSymbols) nextSymbol := 0 for i := 0; i < 256; i++ { if symbolPresent[i] { symbols[nextSymbol] = byte(i) nextSymbol++ } } mtf := newMTFDecoder(symbols) numSymbols += 2 // to account for RUNA and RUNB symbols huffmanTrees := make([]huffmanTree, numHuffmanTrees) // Now we decode the arrays of code-lengths for each tree. lengths := make([]uint8, numSymbols) for i := range huffmanTrees { // The code lengths are delta encoded from a 5-bit base value. length := br.ReadBits(5) for j := range lengths { for { if length < 1 || length > 20 { return StructuralError("Huffman length out of range") } if !br.ReadBit() { break } if br.ReadBit() { length-- } else { length++ } } lengths[j] = uint8(length) } huffmanTrees[i], err = newHuffmanTree(lengths) if err != nil { return err } } selectorIndex := 1 // the next tree index to use if len(treeIndexes) == 0 { return StructuralError("no tree selectors given") } if int(treeIndexes[0]) >= len(huffmanTrees) { return StructuralError("tree selector out of range") } currentHuffmanTree := huffmanTrees[treeIndexes[0]] bufIndex := 0 // indexes bz2.buf, the output buffer. // The output of the move-to-front transform is run-length encoded and // we merge the decoding into the Huffman parsing loop. These two // variables accumulate the repeat count. See the Wikipedia page for // details. repeat := 0 repeatPower := 0 // The `C' array (used by the inverse BWT) needs to be zero initialized. for i := range bz2.c { bz2.c[i] = 0 } decoded := 0 // counts the number of symbols decoded by the current tree. for { if decoded == 50 { if selectorIndex >= numSelectors { return StructuralError("insufficient selector indices for number of symbols") } if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { return StructuralError("tree selector out of range") } currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] selectorIndex++ decoded = 0 } v := currentHuffmanTree.Decode(br) decoded++ if v < 2 { // This is either the RUNA or RUNB symbol. if repeat == 0 { repeatPower = 1 } repeat += repeatPower << v repeatPower <<= 1 // This limit of 2 million comes from the bzip2 source // code. It prevents repeat from overflowing. if repeat > 2*1024*1024 { return StructuralError("repeat count too large") } continue } if repeat > 0 { // We have decoded a complete run-length so we need to // replicate the last output symbol. if repeat > bz2.blockSize-bufIndex { return StructuralError("repeats past end of block") } for i := 0; i < repeat; i++ { b := mtf.First() bz2.tt[bufIndex] = uint32(b) bz2.c[b]++ bufIndex++ } repeat = 0 } if int(v) == numSymbols-1 { // This is the EOF symbol. Because it's always at the // end of the move-to-front list, and never gets moved // to the front, it has this unique value. break } // Since two metasymbols (RUNA and RUNB) have values 0 and 1, // one would expect |v-2| to be passed to the MTF decoder. // However, the front of the MTF list is never referenced as 0, // it's always referenced with a run-length of 1. Thus 0 // doesn't need to be encoded and we have |v-1| in the next // line. b := mtf.Decode(int(v - 1)) if bufIndex >= bz2.blockSize { return StructuralError("data exceeds block size") } bz2.tt[bufIndex] = uint32(b) bz2.c[b]++ bufIndex++ } if origPtr >= uint(bufIndex) { return StructuralError("origPtr out of bounds") } // We have completed the entropy decoding. Now we can perform the // inverse BWT and setup the RLE buffer. bz2.preRLE = bz2.tt[:bufIndex] bz2.preRLEUsed = 0 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) bz2.lastByte = -1 bz2.byteRepeats = 0 bz2.repeats = 0 return nil } // inverseBWT implements the inverse Burrows-Wheeler transform as described in // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. // In that document, origPtr is called “I” and c is the “C” array after the // first pass over the data. It's an argument here because we merge the first // pass with the Huffman decoding. // // This also implements the “single array” method from the bzip2 source code // which leaves the output, still shuffled, in the bottom 8 bits of tt with the // index of the next byte in the top 24-bits. The index of the first byte is // returned. func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { sum := uint(0) for i := 0; i < 256; i++ { sum += c[i] c[i] = sum - c[i] } for i := range tt { b := tt[i] & 0xff tt[c[b]] |= uint32(i) << 8 c[b]++ } return tt[origPtr] >> 8 } // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, // causing the bits in the input to be processed in the reverse of the usual order. var crctab [256]uint32 func init() { const poly = 0x04C11DB7 for i := range crctab { crc := uint32(i) << 24 for j := 0; j < 8; j++ { if crc&0x80000000 != 0 { crc = (crc << 1) ^ poly } else { crc <<= 1 } } crctab[i] = crc } } // updateCRC updates the crc value to incorporate the data in b. // The initial value is 0. func updateCRC(val uint32, b []byte) uint32 { crc := ^val for _, v := range b { crc = crctab[byte(crc>>24)^v] ^ (crc << 8) } return ^crc }