| // Copyright (c) 2017 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Contains utils for reading, writing and debug printing bit streams. |
| |
| #ifndef SOURCE_COMP_HUFFMAN_CODEC_H_ |
| #define SOURCE_COMP_HUFFMAN_CODEC_H_ |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <functional> |
| #include <iomanip> |
| #include <map> |
| #include <memory> |
| #include <ostream> |
| #include <queue> |
| #include <sstream> |
| #include <stack> |
| #include <string> |
| #include <tuple> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| namespace spvtools { |
| namespace comp { |
| |
| // Used to generate and apply a Huffman coding scheme. |
| // |Val| is the type of variable being encoded (for example a string or a |
| // literal). |
| template <class Val> |
| class HuffmanCodec { |
| public: |
| // Huffman tree node. |
| struct Node { |
| Node() {} |
| |
| // Creates Node from serialization leaving weight and id undefined. |
| Node(const Val& in_value, uint32_t in_left, uint32_t in_right) |
| : value(in_value), left(in_left), right(in_right) {} |
| |
| Val value = Val(); |
| uint32_t weight = 0; |
| // Ids are issued sequentially starting from 1. Ids are used as an ordering |
| // tie-breaker, to make sure that the ordering (and resulting coding scheme) |
| // are consistent accross multiple platforms. |
| uint32_t id = 0; |
| // Handles of children. |
| uint32_t left = 0; |
| uint32_t right = 0; |
| }; |
| |
| // Creates Huffman codec from a histogramm. |
| // Histogramm counts must not be zero. |
| explicit HuffmanCodec(const std::map<Val, uint32_t>& hist) { |
| if (hist.empty()) return; |
| |
| // Heuristic estimate. |
| nodes_.reserve(3 * hist.size()); |
| |
| // Create NIL. |
| CreateNode(); |
| |
| // The queue is sorted in ascending order by weight (or by node id if |
| // weights are equal). |
| std::vector<uint32_t> queue_vector; |
| queue_vector.reserve(hist.size()); |
| std::priority_queue<uint32_t, std::vector<uint32_t>, |
| std::function<bool(uint32_t, uint32_t)>> |
| queue(std::bind(&HuffmanCodec::LeftIsBigger, this, |
| std::placeholders::_1, std::placeholders::_2), |
| std::move(queue_vector)); |
| |
| // Put all leaves in the queue. |
| for (const auto& pair : hist) { |
| const uint32_t node = CreateNode(); |
| MutableValueOf(node) = pair.first; |
| MutableWeightOf(node) = pair.second; |
| assert(WeightOf(node)); |
| queue.push(node); |
| } |
| |
| // Form the tree by combining two subtrees with the least weight, |
| // and pushing the root of the new tree in the queue. |
| while (true) { |
| // We push a node at the end of each iteration, so the queue is never |
| // supposed to be empty at this point, unless there are no leaves, but |
| // that case was already handled. |
| assert(!queue.empty()); |
| const uint32_t right = queue.top(); |
| queue.pop(); |
| |
| // If the queue is empty at this point, then the last node is |
| // the root of the complete Huffman tree. |
| if (queue.empty()) { |
| root_ = right; |
| break; |
| } |
| |
| const uint32_t left = queue.top(); |
| queue.pop(); |
| |
| // Combine left and right into a new tree and push it into the queue. |
| const uint32_t parent = CreateNode(); |
| MutableWeightOf(parent) = WeightOf(right) + WeightOf(left); |
| MutableLeftOf(parent) = left; |
| MutableRightOf(parent) = right; |
| queue.push(parent); |
| } |
| |
| // Traverse the tree and form encoding table. |
| CreateEncodingTable(); |
| } |
| |
| // Creates Huffman codec from saved tree structure. |
| // |nodes| is the list of nodes of the tree, nodes[0] being NIL. |
| // |root_handle| is the index of the root node. |
| HuffmanCodec(uint32_t root_handle, std::vector<Node>&& nodes) { |
| nodes_ = std::move(nodes); |
| assert(!nodes_.empty()); |
| assert(root_handle > 0 && root_handle < nodes_.size()); |
| assert(!LeftOf(0) && !RightOf(0)); |
| |
| root_ = root_handle; |
| |
| // Traverse the tree and form encoding table. |
| CreateEncodingTable(); |
| } |
| |
| // Serializes the codec in the following text format: |
| // (<root_handle>, { |
| // {0, 0, 0}, |
| // {val1, left1, right1}, |
| // {val2, left2, right2}, |
| // ... |
| // }) |
| std::string SerializeToText(int indent_num_whitespaces) const { |
| const bool value_is_text = std::is_same<Val, std::string>::value; |
| |
| const std::string indent1 = std::string(indent_num_whitespaces, ' '); |
| const std::string indent2 = std::string(indent_num_whitespaces + 2, ' '); |
| |
| std::stringstream code; |
| code << "(" << root_ << ", {\n"; |
| |
| for (const Node& node : nodes_) { |
| code << indent2 << "{"; |
| if (value_is_text) code << "\""; |
| code << node.value; |
| if (value_is_text) code << "\""; |
| code << ", " << node.left << ", " << node.right << "},\n"; |
| } |
| |
| code << indent1 << "})"; |
| |
| return code.str(); |
| } |
| |
| // Prints the Huffman tree in the following format: |
| // w------w------'x' |
| // w------'y' |
| // Where w stands for the weight of the node. |
| // Right tree branches appear above left branches. Taking the right path |
| // adds 1 to the code, taking the left adds 0. |
| void PrintTree(std::ostream& out) const { PrintTreeInternal(out, root_, 0); } |
| |
| // Traverses the tree and prints the Huffman table: value, code |
| // and optionally node weight for every leaf. |
| void PrintTable(std::ostream& out, bool print_weights = true) { |
| std::queue<std::pair<uint32_t, std::string>> queue; |
| queue.emplace(root_, ""); |
| |
| while (!queue.empty()) { |
| const uint32_t node = queue.front().first; |
| const std::string code = queue.front().second; |
| queue.pop(); |
| if (!RightOf(node) && !LeftOf(node)) { |
| out << ValueOf(node); |
| if (print_weights) out << " " << WeightOf(node); |
| out << " " << code << std::endl; |
| } else { |
| if (LeftOf(node)) queue.emplace(LeftOf(node), code + "0"); |
| |
| if (RightOf(node)) queue.emplace(RightOf(node), code + "1"); |
| } |
| } |
| } |
| |
| // Returns the Huffman table. The table was built at at construction time, |
| // this function just returns a const reference. |
| const std::unordered_map<Val, std::pair<uint64_t, size_t>>& GetEncodingTable() |
| const { |
| return encoding_table_; |
| } |
| |
| // Encodes |val| and stores its Huffman code in the lower |num_bits| of |
| // |bits|. Returns false of |val| is not in the Huffman table. |
| bool Encode(const Val& val, uint64_t* bits, size_t* num_bits) const { |
| auto it = encoding_table_.find(val); |
| if (it == encoding_table_.end()) return false; |
| *bits = it->second.first; |
| *num_bits = it->second.second; |
| return true; |
| } |
| |
| // Reads bits one-by-one using callback |read_bit| until a match is found. |
| // Matching value is stored in |val|. Returns false if |read_bit| terminates |
| // before a code was mathced. |
| // |read_bit| has type bool func(bool* bit). When called, the next bit is |
| // stored in |bit|. |read_bit| returns false if the stream terminates |
| // prematurely. |
| bool DecodeFromStream(const std::function<bool(bool*)>& read_bit, |
| Val* val) const { |
| uint32_t node = root_; |
| while (true) { |
| assert(node); |
| |
| if (!RightOf(node) && !LeftOf(node)) { |
| *val = ValueOf(node); |
| return true; |
| } |
| |
| bool go_right; |
| if (!read_bit(&go_right)) return false; |
| |
| if (go_right) |
| node = RightOf(node); |
| else |
| node = LeftOf(node); |
| } |
| |
| assert(0); |
| return false; |
| } |
| |
| private: |
| // Returns value of the node referenced by |handle|. |
| Val ValueOf(uint32_t node) const { return nodes_.at(node).value; } |
| |
| // Returns left child of |node|. |
| uint32_t LeftOf(uint32_t node) const { return nodes_.at(node).left; } |
| |
| // Returns right child of |node|. |
| uint32_t RightOf(uint32_t node) const { return nodes_.at(node).right; } |
| |
| // Returns weight of |node|. |
| uint32_t WeightOf(uint32_t node) const { return nodes_.at(node).weight; } |
| |
| // Returns id of |node|. |
| uint32_t IdOf(uint32_t node) const { return nodes_.at(node).id; } |
| |
| // Returns mutable reference to value of |node|. |
| Val& MutableValueOf(uint32_t node) { |
| assert(node); |
| return nodes_.at(node).value; |
| } |
| |
| // Returns mutable reference to handle of left child of |node|. |
| uint32_t& MutableLeftOf(uint32_t node) { |
| assert(node); |
| return nodes_.at(node).left; |
| } |
| |
| // Returns mutable reference to handle of right child of |node|. |
| uint32_t& MutableRightOf(uint32_t node) { |
| assert(node); |
| return nodes_.at(node).right; |
| } |
| |
| // Returns mutable reference to weight of |node|. |
| uint32_t& MutableWeightOf(uint32_t node) { return nodes_.at(node).weight; } |
| |
| // Returns mutable reference to id of |node|. |
| uint32_t& MutableIdOf(uint32_t node) { return nodes_.at(node).id; } |
| |
| // Returns true if |left| has bigger weight than |right|. Node ids are |
| // used as tie-breaker. |
| bool LeftIsBigger(uint32_t left, uint32_t right) const { |
| if (WeightOf(left) == WeightOf(right)) { |
| assert(IdOf(left) != IdOf(right)); |
| return IdOf(left) > IdOf(right); |
| } |
| return WeightOf(left) > WeightOf(right); |
| } |
| |
| // Prints subtree (helper function used by PrintTree). |
| void PrintTreeInternal(std::ostream& out, uint32_t node, size_t depth) const { |
| if (!node) return; |
| |
| const size_t kTextFieldWidth = 7; |
| |
| if (!RightOf(node) && !LeftOf(node)) { |
| out << ValueOf(node) << std::endl; |
| } else { |
| if (RightOf(node)) { |
| std::stringstream label; |
| label << std::setfill('-') << std::left << std::setw(kTextFieldWidth) |
| << WeightOf(RightOf(node)); |
| out << label.str(); |
| PrintTreeInternal(out, RightOf(node), depth + 1); |
| } |
| |
| if (LeftOf(node)) { |
| out << std::string(depth * kTextFieldWidth, ' '); |
| std::stringstream label; |
| label << std::setfill('-') << std::left << std::setw(kTextFieldWidth) |
| << WeightOf(LeftOf(node)); |
| out << label.str(); |
| PrintTreeInternal(out, LeftOf(node), depth + 1); |
| } |
| } |
| } |
| |
| // Traverses the Huffman tree and saves paths to the leaves as bit |
| // sequences to encoding_table_. |
| void CreateEncodingTable() { |
| struct Context { |
| Context(uint32_t in_node, uint64_t in_bits, size_t in_depth) |
| : node(in_node), bits(in_bits), depth(in_depth) {} |
| uint32_t node; |
| // Huffman tree depth cannot exceed 64 as histogramm counts are expected |
| // to be positive and limited by numeric_limits<uint32_t>::max(). |
| // For practical applications tree depth would be much smaller than 64. |
| uint64_t bits; |
| size_t depth; |
| }; |
| |
| std::queue<Context> queue; |
| queue.emplace(root_, 0, 0); |
| |
| while (!queue.empty()) { |
| const Context& context = queue.front(); |
| const uint32_t node = context.node; |
| const uint64_t bits = context.bits; |
| const size_t depth = context.depth; |
| queue.pop(); |
| |
| if (!RightOf(node) && !LeftOf(node)) { |
| auto insertion_result = encoding_table_.emplace( |
| ValueOf(node), std::pair<uint64_t, size_t>(bits, depth)); |
| assert(insertion_result.second); |
| (void)insertion_result; |
| } else { |
| if (LeftOf(node)) queue.emplace(LeftOf(node), bits, depth + 1); |
| |
| if (RightOf(node)) |
| queue.emplace(RightOf(node), bits | (1ULL << depth), depth + 1); |
| } |
| } |
| } |
| |
| // Creates new Huffman tree node and stores it in the deleter array. |
| uint32_t CreateNode() { |
| const uint32_t handle = static_cast<uint32_t>(nodes_.size()); |
| nodes_.emplace_back(Node()); |
| nodes_.back().id = next_node_id_++; |
| return handle; |
| } |
| |
| // Huffman tree root handle. |
| uint32_t root_ = 0; |
| |
| // Huffman tree deleter. |
| std::vector<Node> nodes_; |
| |
| // Encoding table value -> {bits, num_bits}. |
| // Huffman codes are expected to never exceed 64 bit length (this is in fact |
| // impossible if frequencies are stored as uint32_t). |
| std::unordered_map<Val, std::pair<uint64_t, size_t>> encoding_table_; |
| |
| // Next node id issued by CreateNode(); |
| uint32_t next_node_id_ = 1; |
| }; |
| |
| } // namespace comp |
| } // namespace spvtools |
| |
| #endif // SOURCE_COMP_HUFFMAN_CODEC_H_ |