原理：

将每个字符转换成对应的编码，而编码都是由0和1组成，那么就可以用位来表示编码对应的字符，从而实现文件变小，即压缩；同样，再根据压缩的步骤反推，即可实现解压缩

思路：

1.统计文件中各个字符出现的次数

定义一个哈希表，使用直接定址法，统计各个字符出现的次数；
要注意的是：char的取值范围是-128~127，而哈希表的下标范围是0~255，所以使用直接定址法的时候要强转成无符号字符型，如：++hashtable[(unsigned char)ch];

2.构建huffman树（贪心算法）

①先将哈希表中内容放入小堆中；
②再从堆中取两个最小的值进行相加；
③再将相加的结果插入堆中，并保持仍然是小堆
可以使用优先级队列priority_queue实现

3.生成huffman编码

前序遍历huffman树（先处理当前结点，再依次递归左、右子树）：向左走编码追加'0'，向右走编码追加'1'，走到叶子结点时就得到该字符的huffman编码。多说无益，直接看图和代码

    //struct CharInfo
    //{
    //    char _ch;         //字符
    //    LongType _count;  //字符出现的次数
    //    string _code;     //字符对应的huffman编码
    //}
    // Assigns a Huffman code to every leaf below `root`.
    // Each child's code is its parent's code plus '0' (left) or '1' (right);
    // when a leaf is reached its accumulated code is copied into the table
    // slot indexed by the leaf's byte value.
    void GenerateHuffmanCode(Node* root)
    {
        if (!root)
            return;

        Node* const zeroChild = root->_left;
        Node* const oneChild = root->_right;

        if (!zeroChild && !oneChild)
        {
            // Leaf: publish the finished code. The cast keeps the index in 0..255.
            const unsigned char slot = (unsigned char)root->_w._ch;
            _hashInfos[slot]._code = root->_w._code;
            return;
        }

        if (zeroChild)
        {
            zeroChild->_w._code = root->_w._code + '0';
            GenerateHuffmanCode(zeroChild);
        }
        if (oneChild)
        {
            oneChild->_w._code = root->_w._code + '1';
            GenerateHuffmanCode(oneChild);
        }
    }

4.压缩

将每个字符对应的huffman编码换成比特位，写入压缩文件

5.解压缩

再将压缩文件中的比特位一一读取，寻找huffman树中对应的字符，写入解压缩后的文件

思维导图：

详细步骤：

压缩过程
①使用ifstream读取文件Input.txt，统计字符出现次数
②将字符信息：字符_ch、字符出现次数_count、字符的huffman编码_code,存入结构体CharInfo
③将字符对应的CharInfo按_ch直接定址，放入哈希表_hashInfo中
④使用priority_queue创建huffman树，再遍历huffman树，将huffman编码写入哈希表中对应的CharInfo的_code里
⑤将哈希表中CharInfo的_count>0的元素使用ofstream函数写入文件Input.txt.huffman中，最后再加一个_count=0的CharInfo作为分界线
⑥再将每个字符的huffman编码按照“位”，使用ofstream函数写入文件Input.txt.huffman中，压缩完毕
解压过程
⑦使用ifstream读取文件Input.txt.huffman中分界线前部分，即哈希表的内容
⑧根据上步创建的哈希表内容，再使用priority_queue创建huffman树
⑨逐位读取压缩文件中分界线之后的huffman编码，按编码（0向左、1向右）遍历huffman树，走到叶子结点即找到对应的字符（CharInfo的_ch），使用ofstream函数写入文件Input.txt.unhuffman中，解压完毕

错误手册

这里列举出各种可能导致错误的细节点，供读者查阅，看是否能对上自己的错误原因

①使用ifstream和ofstream对文件进行输入输出操作时，最好以二进制方式打开，否则可能会因为读取到特殊符号而提前终止，导致解压缩不完全，二进制方式如下：
ifstream ifs(filename,ios::in|ios::binary);
ofstream ofs(filename,ios::out|ios::binary);
详细的ifstream和ofstream操作可自行搜索
②对字符进行直接定址确定自己在哈希表中的位置时，要注意使用(unsigned char)ch强转，因为哈希表的下标范围是0到255，而char的取值范围是-128到127
③创建huffman树时，因为节点中保存的是一个结构体而不是一个简单的内置类型，因此在对节点进行“比较”操作的时候需要自己重载这些比较操作符，如：
bool operator>(const T& t);bool operator!=(const T& t);等

代码：

功能函数预览

HuffmanTree.h
HuffmanTree(W* w, size_t n, const W& invalid);//构造函数，创建huffman树
~HuffmanTree();//析构，调用Destroy
void Destroy(Node* root);//释放空间
Node* GetRoot();//返回huffman树的根节点

FileCompress.h
FileCompress();//构造函数，初始化哈希表
void GetCharCount(ifstream& ifs,const char*file);//统计文件中字符出现的次数
bool IsExist(const char* file);//判断文件是否存在
string CompressForm(const char* file, char* form);//压缩前的准备和文件检验
string UncompressForm(const char* file, char* form);//解压缩前的准备和文件检验
void GenerateHuffmanCode(Node* root);//获取huffman编码
void Compress(const char* file);//压缩过程，调用上面的函数
void UnCompress(const char* file);//解压过程，调用上面的函数

全部代码

HuffmanTree.h

#ifndef __HUFFMAN_H__
#define __HUFFMAN_H__
#include <iostream>
using namespace std;
#include <queue>
#include <vector>

// One node of a Huffman tree: a weight of type W plus links to two subtrees.
// A node whose two links are both NULL is a leaf.
template <class W>
struct HuffmanTreeNode
{
    HuffmanTreeNode<W>* _left;   // left subtree, NULL when absent
    HuffmanTreeNode<W>* _right;  // right subtree, NULL when absent
    W _w;                        // weight carried by this node

    // Creates a detached node (no children) holding a copy of `w`.
    // The initializer list follows declaration order, which is the order the
    // members are actually initialized in.
    HuffmanTreeNode(const W& w)
        : _left(NULL)
        , _right(NULL)
        , _w(w)
    {}
};

template <class W>
class HuffmanTree
{
    typedef HuffmanTreeNode<W> Node;
public:
    HuffmanTree()
        :_root(NULL)
    {}

    struct NodeComp
    {
        //仿函数，使之按照结点内的权值比较大小，而不是按指针
        //这种只比较，不修改的写成const最好
        //但是随之而来的问题就是，如果w是自定义类型，而不仅仅是内置类型
        //那么就要注意，const修饰的l和r能不能调用w内部的函数
        //即const对象不能调用非const函数
        bool operator()(const Node* l, const Node* r)
        {
            return l->_w > r->_w;
        }
    };

    HuffmanTree(W* w, size_t n, const W& invalid)//判断非法值，即_count>1才插入，节省空间
    {
        //构建huffman树
        priority_queue<Node*, vector<Node*>, NodeComp> minHeap;
        for (size_t i = 0; i < n; ++i)
        {
            if (w[i] != invalid)
                minHeap.push(new Node(w[i]));
        }
        while (minHeap.size() > 1)
        {
            Node* left = minHeap.top();
            minHeap.pop();
            Node* right = minHeap.top();
            minHeap.pop();
            Node* parent = new Node(left->_w + right->_w);
            parent->_left = left;
            parent->_right = right;
            minHeap.push(parent);
        }
        _root = minHeap.top();

    }

    ~HuffmanTree()
    {
        //释放树的空间
        Destroy(_root);
        _root = NULL;
    }
    void Destroy(Node* root)
    {
        if (root == NULL)
            return;
        Destroy(root->_left);
        Destroy(root->_right);
        delete(root);
    }
    Node* GetRoot()
    {
        return _root;
    }
protected:
    Node* _root;
private:
    //防拷贝，C++11的话可以直接在构造函数后面加delete
    HuffmanTree(const HuffmanTree<W>& h);
    HuffmanTree<W>& operator=(const HuffmanTree<W>& h);
};

#endif //__HUFFMAN_H__

FileCompress.h

#ifndef __FILECOMPRESS_H__
#define __FILECOMPRESS_H__
#include "HuffmanTree.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fstream>
#include <string>
typedef long long LongType;

#define FORM_LENGTH 4   // Typical file-extension length, dot included (e.g. ".txt" is 4 characters)
#define COMPRESS_FORM ".huffman"    // Extension given to compressed files

// Per-byte record used both as the frequency-table entry and as the weight
// type stored in Huffman tree nodes. All comparisons look at _count only.
struct CharInfo
{
    char _ch;           // the byte value this record describes
    LongType _count;    // number of occurrences of that byte
    std::string _code;  // Huffman code for the byte, as a string of '0'/'1'

    // Weight of a merged node: the counts are added. The result describes no
    // particular byte, so _ch is set to 0 (instead of being left
    // uninitialized) and _code is empty.
    CharInfo operator+(const CharInfo& info) const
    {
        CharInfo tmp;
        tmp._ch = 0;
        tmp._count = _count + info._count;
        return tmp;
    }

    // True when this record has the larger count.
    bool operator>(const CharInfo& info) const
    {
        return _count > info._count;
    }

    // True when the counts differ; _ch and _code are ignored.
    bool operator!=(const CharInfo& info) const
    {
        return _count != info._count;
    }
};

class FileCompress
{
    typedef HuffmanTreeNode<CharInfo> Node;

public:
    //专门用于在压缩文件里保存字符出现次数的结构体
    //没必要使用CharInfo，因为有string code这一项的浪费开销
    struct ConfigInfo 
    {
        char _ch;
        LongType _count;
    };


    FileCompress()//初始化哈希表
    {
        for (size_t i = 0; i < 256; ++i)
        {
            _hashInfos[i]._ch = i;
            _hashInfos[i]._count = 0;
        }
    }

    //统计文件中字符出现的次数
    void GetCharCount(ifstream& ifs,const char*file)
    {
        char ch;
        while (ifs.get(ch))
        {
            ++_hashInfos[(unsigned char)ch]._count;//这里的ch必须转换成无符号的，因为_hashInfos的下标是从0到255
        }
    }

    bool IsExist(const char* file)//判断文件是否存在
    {
        ifstream ifs(file);
        if (!ifs)
        {
            return false;//不存在
        }
        return true;//存在
    }

    string CompressForm(const char* file, char* form)//压缩前的准备和检验
    {
        string compressfile = file;
        size_t index = compressfile.rfind('.');//从后往前找到文件名中的“.”
        assert(index != string::npos);
        strcpy(form, file + index);//保存源文件格式
        compressfile.erase(index);//删除后缀
        index = compressfile.rfind('\\');//从后往前找到文件名中的第一个“\”
        ++index;
        string name;
        for (size_t i = index; file[i] != '.';++i)//保存源文件名
        {
            name.push_back(file[i]);
        }
        compressfile.erase(index);//删除源文件名


        printf("是否压缩到指定文件？\n");
        printf("1.是\t\t\t2.否\n");

        int select = 0;
        while (1)
        {
            scanf("%d", &select);
            if (select == 1)
            {
                char newRoad[100] = { 0 };
                printf("请输入指定目录：(如：D:\\Game\\ “\\”不能漏写)\n");
                scanf("%s", newRoad);
                string road(newRoad);
                if (IsExist((road + name + COMPRESS_FORM).c_str()))
                {
                    printf("文件已存在，请选择：\n");
                    printf("1.替换\t\t\t2.重命名\n");
                    scanf("%d", &select);
                    if (select == 2)
                    {
                        char newName[100] = { 0 };
                        scanf("%s", newName);
                        road += newName;
                    }
                    else
                    {
                        road += name;
                    }
                }
                else
                {
                    road += name;
                }
                road += COMPRESS_FORM;
                return road;
            }
            else if (select == 2)
            {
                if (IsExist((compressfile + name + COMPRESS_FORM).c_str()))
                {
                    printf("文件已存在，请选择：\n");
                    printf("1.替换\t\t\t2.重命名\n");
                    scanf("%d", &select);
                    if (select == 2)
                    {
                        char newName[100] = { 0 };
                        scanf("%s", newName);
                        compressfile += newName;
                    }
                    else
                    {
                        compressfile += name;
                    }
                }
                else
                {
                    compressfile += name;
                }
                compressfile += COMPRESS_FORM;//替换成指定的压缩文件格式
                return compressfile;
            }
            else
            {
                printf("输入有误，重新选择:\n");
            }
        }
    }

    string UncompressForm(const char* file, char* form)//解压缩前的准备和检验
    {
        string compressfile = file;
        size_t index = compressfile.rfind('.');//从后往前找到文件名中的“.”
        assert(index != string::npos);
        compressfile.erase(index);//删除后缀
        index = compressfile.rfind('\\');//从后往前找到文件名中的第一个“\”
        ++index;
        string name;
        for (size_t i = index; file[i] != '.'; ++i)//保存源文件名
        {
            name.push_back(file[i]);
        }
        compressfile.erase(index);//删除压缩文件名

        printf("是否解压到指定文件夹？\n");
        printf("1.是\t\t\t2.否\n");

        int select = 0;
        while (1)
        {
            scanf("%d", &select);
            if (select == 1)
            {
                char str[100] = { 0 };
                printf("请输入指定目录：(如：D:\\Game\\ “\\”不能漏写)\n");
                scanf("%s", str);
                string road(str);
                if (IsExist((road + name + form).c_str()))
                {
                    printf("文件已存在，请选择：\n");
                    printf("1.替换\t\t\t2.重命名\n");
                    scanf("%d", &select);
                    if (select == 2)
                    {
                        char newName[100] = { 0 };
                        printf("请输入新的名字：\n");
                        scanf("%s", newName);
                        road += newName;
                    }
                    else
                    {
                        road += name;
                    }
                }
                else
                {
                    road += name;
                }
                road += form;
                return road;
            }
            else if (select == 2)
            {
                if (IsExist((compressfile + name + form).c_str()))
                {
                    printf("文件已存在，请选择：\n");
                    printf("1.替换\t\t\t2.重命名\n");
                    scanf("%d", &select);
                    if (select == 2)
                    {
                        char newName[100] = { 0 };
                        printf("请输入新的名字：\n");
                        scanf("%s", newName);
                        compressfile += newName;
                    }
                    else
                    {
                        compressfile += name;
                    }
                }
                else
                {
                    compressfile += name;
                }
                compressfile += form;//替换成指定的压缩文件格式
                return compressfile;
            }
            else
            {
                printf("输入有误，重新选择:\n");
            }
        }
    }

    void GenerateHuffmanCode(Node* root)//获取huffman编码
    {
        if (root == NULL)
            return;
        if (root->_left == NULL&&root->_right == NULL)
        {
            _hashInfos[(unsigned char)root->_w._ch]._code = root->_w._code;
            return;
        }
        if (root->_left != NULL)
        {
            root->_left->_w._code = root->_w._code + '0';
            GenerateHuffmanCode(root->_left);
        }
        if (root->_right != NULL)
        {
            root->_right->_w._code = root->_w._code + '1';
            GenerateHuffmanCode(root->_right);
        }
    }

    void Compress(const char* file)
    {
        ifstream ifs(file, ios::in | ios::binary);//0.打开源文件

        GetCharCount(ifs,file);//1.统计文件中字符出现的次数

        CharInfo invalid;
        invalid._count = 0;
        HuffmanTree<CharInfo> tree(_hashInfos, 256, invalid);//2.生成huffman树

        GenerateHuffmanCode(tree.GetRoot());//3.生成huffman编码

        char form[FORM_LENGTH + 1] = { 0 };
        string compressfile = CompressForm(file, form);//4.压缩前的准备和检验
        ofstream ofs(compressfile.c_str(), ios::out | ios::binary);//创建该文件名的文件，并写入内容

        //计时器
        clock_t start = 0, end = 0;
        start = clock();

        //5.压缩
        //压缩三部分内容：源文件格式+字符出现次数信息+源文件内容信息
        for (size_t i = 0; i < FORM_LENGTH+1; ++i)//5.1将源文件后缀，即格式写入压缩文件，方便解压缩的时候恢复原格式
        {
            ofs.put(form[i]);
        }

        for (size_t i = 0; i < 256; ++i)//5.2将_hashInfos中字符出现次数>0的元素写入压缩文件
        {   
            if (_hashInfos[i]._count>0)
            {
                ConfigInfo info;
                info._ch = _hashInfos[i]._ch;
                info._count = _hashInfos[i]._count;
                ofs.write((const char*)&info, sizeof(ConfigInfo));
            }
        }

        ConfigInfo over;
        over._count = 0;
        ofs.write((const char*)&over, sizeof(ConfigInfo));//5.3设置分界线

        char ch;
        char value = 0;
        int pos = 0;
        ifs.clear();
        ifs.seekg(0);
        while (ifs.get(ch))//5.4将huffman编码写入
        {
            string& code = _hashInfos[(unsigned char)ch]._code;
            for (size_t i = 0; i < code.size(); ++i)
            {
                if (code[i] == '0')
                    value &= (~(1 << pos));
                else if (code[i] == '1')
                    value |= (1 << pos);
                else
                    assert(false);
                ++pos;
                if (pos == 8)
                {
                    ofs.put(value);
                    pos = 0;
                    value = 0;
                }
            }

        }
        if (pos > 0)
        {
            ofs.put(value);
        }
        end = clock();
        printf("压缩用时：%d ms\n", end - start);
    }

    void UnCompress(const char* file)
    {
        ifstream ifs(file, ios::in | ios::binary);//0.打开压缩文件

        //2.读取三部分信息：源文件格式+字符出现次数信息+源文件内容信息

        char form[FORM_LENGTH + 1];
        for (size_t i = 0; i < FORM_LENGTH + 1; ++i)//2.读取源文件格式信息
        {
            ifs.get(form[i]);
        }

        string uncompressfile = UncompressForm(file,form);//3.解压缩前的准备和检验
        ofstream ofs(uncompressfile.c_str(), ios::out | ios::binary);//创建该文件名的文件，并写入内容

        while (1)//4.读取字符串出现次数信息
        {
            ConfigInfo info;
            ifs.read((char*)&info, sizeof(ConfigInfo));
            if (info._count > 0)
            {
                _hashInfos[(unsigned char)info._ch]._count = info._count;
            }
            else
            {
                break;
            }
        }

        //计时器
        clock_t start = 0, end = 0;
        start = clock();

        //5.重建huffman树
        CharInfo invalid;
        invalid._count = 0;
        HuffmanTree<CharInfo> tree(_hashInfos, 256, invalid);

        //6.解压缩
        //根据读取的huffman编码在huffman树中找到对应字符，写入到文件中
        Node* root = tree.GetRoot();
        LongType filecount = root->_w._count;
        Node* cur = root;
        char ch;
        while (ifs.get(ch))
        {
            for (size_t i = 0; i < 8; ++i)
            {
                if (ch&(1 << i))//1
                    cur = cur->_right;
                else//0
                    cur = cur->_left;
                if (cur->_left == NULL&&cur->_right == NULL)
                {
                    ofs.put(cur->_w._ch);
                    cur = root;
                    if (--filecount == 0)
                    {
                        break;
                    }
                }
            }
        }
        end = clock();
        printf("解压用时：%d ms\n", end - start);
    }
private:
    CharInfo _hashInfos[256];
};

// Compresses `filename` with a fresh FileCompress object.
// Declared inline because it is defined in a header: without it, including
// the header from two translation units yields duplicate definitions.
inline void TestCompress(const char* filename)
{
    FileCompress fc;
    fc.Compress(filename);
}

// Decompresses archive `filename` with a fresh FileCompress object.
// Declared inline because it is defined in a header: without it, including
// the header from two translation units yields duplicate definitions.
inline void TestUnCompress(const char* filename)
{
    FileCompress fc;
    fc.UnCompress(filename);
}

#endif //__FILECOMPRESS_H__

test.cpp

#include "FileCompress.h"

// Prints the banner, reads one choice from stdin and runs it:
//   1 - ask for a path and compress that file,
//   2 - ask for a path and decompress that archive.
// Any other choice, unreadable input, or a missing path does nothing.
void menu()
{
    printf("****************** 欢迎使用huffman文件压缩 ******************\n");
    printf("\n");
    printf("****************** 1.压缩文件   2.解压文件 ******************\n");

    int select = 0;
    if (scanf("%d", &select) != 1)
        return;

    switch (select)
    {
    case 1:
    {
        printf("请输入文件所在路径和文件格式：（如：D:\\test\\filename.txt）\n");
        char filename[100] = { 0 };
        // The field width keeps the read inside the 100-byte buffer.
        if (scanf("%99s", filename) == 1)
            TestCompress(filename);
        break;
    }
    case 2:
    {
        printf("请输入文件所在路径和文件格式：（如：D:\\test\\filename.huffman）\n");
        char filename[100] = { 0 };
        if (scanf("%99s", filename) == 1)
            TestUnCompress(filename);
        break;
    }
    default:
        break;
    }
}

// Program entry point: runs the interactive menu once, then exits with status 0.
int main()
{
    menu();
    // Hands the command "pause" to the system shell before exiting.
    // NOTE(review): presumably meant to keep a Windows console window open; verify on other platforms.
    system("pause");
    return 0;
}

压缩率

文件类型	源文件大小	压缩大小	压缩率
视频文件	78.8MB	78.7MB	0.99
word文档	101K	105K	1.04
长篇中文小说txt	8.56MB	6.25MB	0.73

总结：huffman树压缩适用于字符出现次数差值较大，分布不平均的文件

开源项目：文件压缩（huffman树版）

原理：

思路：

1.统计文件中各个字符出现的次数

2.构建huffman树（贪心算法）

3.生成huffman编码

4.压缩

5.解压缩

思维导图：

详细步骤：

错误手册

代码：

功能函数预览

全部代码

压缩率

猜你喜欢