《swift-algorithm-club》——算法/压缩-一一网

压缩算法(Compression)

行程长度压缩算法(Run-Length Encoding (RLE))

RLE可能是进行压缩的最简单方法。假设数据如下：

aaaaabbbcdeeeeeeef...
复制代码

然后RLE将其编码为：

5a3b1c1d7e1f...
复制代码

数据有很多重复字节时，RLE可以节省相当多的空间。它在图像上运行良好。

压缩代码

extension Data {
  public func compressRLE() -> Data {
    var data = Data()
    self.withUnsafeBytes { (uPtr: UnsafePointer<UInt8>) in
    	var ptr = uPtr
      let end = ptr + count
      while ptr < end {
        var count = 0
        var byte = ptr.pintee
        var next = byte
      
        while next == byte && ptr < end && count < 64 {
          ptr = ptr.advanced(by: 1)
          next = ptr.pointee
          count += 1
        }
        
        if count > 1 || byte >= 192 {
          var size = 191 + UInt8(count)
          data.append(&size, count: 1)
          data.append(&size, count: 1)
        } else {
          data.append(&byte, count: 1)
        }
      }
    }
    return data
  }
}
复制代码

解压缩代码

public func decompressRLE() -> Data {
  var data = Data()
  self.withUnsafeBytes { (uPtr: UnsafePointer<UInt8>) in
  	var ptr = uPtr
    let end = ptr + count
    
    while ptr < end {
      // Read the next byte. This is either a single value less than 192,
      // or the start of a byte run.
      var byte = ptr.pointee
      ptr = ptr.advanced(by: 1)
      
      if byte < 192 {
        data.append(&byte, count: 1)
      } else if ptr < end {
        // Read the actual data value.
        var value = ptr.pointee
        ptr = ptr.advanced(by: 1)
        
        // And write it out repeatedly.
        for _ in 0 ..< byte - 191 {
          data.append(&value, count: 1)
        }
      }
    }
  }
  return data
}
复制代码

霍夫曼编码(Huffman Coding)

举例子是最直观简单的说明

假设有以下文本

so much words wow many compression
复制代码

计算每个字节出现的频率

space: 5                   u: 1
    0: 5                   h: 1
    s: 4                   d: 1
    m: 3                   a: 1
    w: 3                   y: 1
    c: 2                   p: 1
    r: 2                   e: 1
    n: 2                   i: 1
复制代码

可以这样来编码

sapce: 5    010         u: 1    11001
    0: 5    000         h: 1    10001
    s: 4    101         d: 1    11010
    m: 3    111         a: 1    11011
    w: 3    0010        y: 1    01111
    c: 2    0011        p: 1    11000
    r: 2    1001        e: 1    01110
    n: 2    0110        i: 1    10000
复制代码

101 000 010 111 11001 0011 10001 010 0010 000 1001 11010 101
s   o   _   m   u     c    h     _   w    o   r    d     s

010 0010 000 0010 010 111 11011 0110 01111 010 0011 000 111
_   w    o   w    _   m   a     n    y     _   c    o   m

11000 1001 01110 101 101 10000 000 0110 0
p     r    e     s   s   i     o   n
复制代码

在小输入上使用霍夫曼编码是不利的。

辅助代码

NSData理解的最小数据是字节，但我们处理的是比特，所以我们需要在两者之间进行转换

Npublic class BitWriter {
  public var data = NSMutableData()
  var outByte: UInt8 = 0
  var outCount = 0
  
  public func writeBit(bit: Bool) {
    if outCount == 8 {
      data.append(&outByte, length: 1)
      outCount = 0
    }
    outByte = (outByte << 1) | (bit ? 1 : 0)
    outCount += 1
  } 
  
  public func flush() {
    if outCount > 0 {
      if outCount < 8 {
        let diff = UInt8(8 - outCount)
        outByte <<= diff
      }
      data.append(&outByte, length: 1)
    }
  }
}
复制代码

用于从NSData读取各个位：

public class BitReader {
  var ptr: UnsafePointer<UInt8>
  var inByte: UInt8 = 0
  var inCount = 8
  
  public init(data: NSData) {
    ptr = data.bytes.assumingMemoryBound(to: UInt8.self)
  }
  
  public func readBit() -> Bool {
    if inCount == 8 {
      inByte = ptr.pointee    // load the next byte
      inCount = 0
      ptr = ptr.successor()
    }
    let bit = inByte & 0x80   // read the next bit
    inByte <<= 1
    inCount += 1
    return bit == 0 ? false : true
  }
}
复制代码

class Huffman {
  typealias NodeIndex = Int
  
  struct Node {
    var count = 0
    var index: NodeIndex = -1
    var parent: NodeIndex = -1
    var left: NodeIndex = -1
    var right: NodeIndex = -1
  }
  
  var tree = [Node](repeating: Node(), count: 256)
  
  var root: NodeIndex = -1
}
复制代码

计算输入数据中每个字节出现的频率

fileprivate func countByteFrequency(inData data: NSData) {
  var ptr = data.bytes.assumingMemoryBound(to: UInt8.self)
  for _ in 0..<data.length {
    let i = Int(ptr.pointee)
    tree[i].count += 1
    tree[i].index = i
    ptr = ptr.successor()
  }
}
复制代码

导出频率表

struct Freq {
  var byte: UInt8 = 0
  var count = 0
}

func frequencyTable() -> [Freq] {
  var a = [Freq]()
  for i in 0..<256 where tree[i].count > 0 {
    a.append(Freq(byte: UInt8(i), count: tree[i].count))
  }
  return a
}
复制代码

使用优先级队列

fileprivate func buildTree() {
  var queue = PriortyQueue<Node>(sort: { $0.count < $1.count })
  for node in tree where node.count > 0 {
    queue.enqueue(node)
  }
  
  while queue.count > 1 {
    let node1 = queue.dequeue()!
    let node2 = queue.dequeue()!
    
    var parentNode = Node()
    parentNode.count = node1.count + node2.count
    parentNode.left = node1.index
    parentNode.right = node2.index
   	parentNode.index = tree.count
    tree.append(parentNode)
    
    tree[node1.index].parent = parentNode.index
    tree[node2.index].parent = parentNode.index
    
    queue.enqueue(parentNode)
  }
  
  let rootNode = queue.dequeue()!
  root = rootNode.index
}
复制代码

现在我们知道如何从频率表构建压缩树，我们可以用它来压缩NSData对象的内容。

public func compressData(data: NSData) -> NSData {
  countByteFrequency(inData: data)
  buildTree()
  
  let writer = BitWriter()
  var ptr = data.bytes.assumingMemoryBound(to: UInt8.self)
  for _ in 0..<data.length {
    let c = ptr.pointee
    let i = Int(c)
    traverseTree(writer: writer, nodeIndex: i, childIndex: -1)
    ptr = ptr.successor()
  }
  writer.flush()
  return writer.data
}
复制代码

注意：压缩总是需要两次遍历整个输入数据：首先构建频率表，然后将字节转换为压缩的位序列。

有趣的东西发生在traverseTree()中。这是一种递归方法：

private func traverseTree(writer: BitWriter, nodeIndex h: Int, childIndex child: Int) {
  if tree[h].parent != -1 {
    traverseTree(writer: writer, nodeIndex: tree[h].parent, childIndex: h)
  }
  if child != -1 {
    if child == tree[h].left {
      writer.writeBit(bit: true)
    } else if child == tree[h].right {
      writer.writeBit(bit: false)
    }
  }
}
复制代码

像这样使用compressData()方法：

let s1 = "so much words wow many compression"
if let originalData = s1.dataUsingEncoding(NSUTF8StringEncoding) {
  let huffman1 = Huffman()
  let compressedData = huffman1.compressData(originalData)
  print(compressedData.length)
}
复制代码

解压缩

首先需要一些方法将[Freq]数组转换回压缩树：

fileprivate func restoreTree(fromTable frequencyTable: [Freq]) {
  for freq in frequencyTable {
    let i = Int(freq.byte)
    tree[i].count = freq.count
    tree[i].index = i
  }
  buildTree()
}
复制代码

func decompressData(data: NSData, frequencyTable: [Freq]) -> NSData {
  restoreTree(fromTable: frequencyTable)
  
  let reader = BitReader(data: data)
  let outData = NSMutableData()
  let byteCount = tree[root].count
  
  var i = 0
  while i < byteCount {
    var b = findLeafNode(reader: reader, nodeIndex: root)
    outData.append(&b, length: 1)
    i += 1
  }
  return outData
}
复制代码

也是使用辅助方法遍历树

private func findLeafNode(reader reader: BitReader, nodeIndex: Int) -> UInt8 {
  var h = nodeIndex
  while tree[h].right != -1 {
    if reader.readBit() {
      h = tree[h].left
    } else {
      h = tree[h].right
    }
  }
  return UInt8(h)
}
复制代码

如何使用解压缩的方法：

let frequencyTable = huffman1.frequencyTable()

let huffman2 = Huffman()
let decompressedData = huffman2.decompressData(compressedData, frequencyTable: frequencyTable: frequencyTable)

let s2 = String(data: decompressedData, encoding: NSUTF8StringEncoding)!
复制代码