① 将词典初始化为包含所有可能的单字符，当前前缀P初始化为空。
② 当前字符C=字符流中的下一个字符。
③ 判断P＋C是否在词典中
1° 如果“是”，则用C扩展P，即让P=P＋C，返回到步骤2。
2° 如果“否”，则输出与当前前缀P相对应的码字W；
将P＋C添加到词典中；
令P=C，并返回到步骤2。

2. LZW解码原理

① 在开始译码时词典包含所有可能的前缀根。
② 令CW：=码字流中的第一个码字。
③ 输出当前缀-符串string.CW到码字流。
④ 先前码字PW：=当前码字CW。
⑤ 当前码字CW：=码字流的下一个码字。
⑥ 判断当前缀-符串string.CW 是否在词典中。
1° 如果”是”，则把当前缀-符串string.CW输出到字符流；
当前前缀P：=先前缀-符串string.PW；
当前字符C：=当前前缀-符串string.CW的第一个字符；
把缀-符串P+C添加到词典。
2° 如果”否”，则当前前缀P：=先前缀-符串string.PW；
当前字符C：=当前缀-符串string.CW的第一个字符；
输出缀-符串P+C到字符流,然后把它添加到词典中。

三、实验代码

1. 数据结构分析

struct {
	int suffix;//尾缀字符
	int parent, firstchild, nextsibling;//母节点，第一个孩子节点，下一个兄弟节点
} dictionary[MAX_CODE+1];
int next_code;
int d_stack[MAX_CODE]; // stack for decoding a phrase

2. 主要功能模块

（1）初始化词典

void InitDictionary(void) {//将0~255根节点初始化
	int i;

	for (i = 0; i < 256; i++) {
		dictionary[i].suffix = i;
		dictionary[i].parent = -1;//母节点初始化为空
		dictionary[i].firstchild = -1;//子节点初始化为空
		dictionary[i].nextsibling = i + 1;
	}
	dictionary[255].nextsibling = -1;
	next_code = 256;//新词条从256开始编码
}

（2）将新串加入词典

void AddToDictionary(int character, int string_code) {
	int firstsibling, nextsibling;
	if (0 > string_code) return;//当前词条无前缀，为单个字符，已存在词典中
	dictionary[next_code].suffix = character;
	dictionary[next_code].parent = string_code;
	dictionary[next_code].nextsibling = -1;
	dictionary[next_code].firstchild = -1;
	firstsibling = dictionary[string_code].firstchild;//当前前缀的第一个孩子
	if (-1 < firstsibling) {	// 如果当前前缀有孩子
		nextsibling = firstsibling;
		while (-1 < dictionary[nextsibling].nextsibling) //只要nextsibling还有下一个兄弟
			nextsibling = dictionary[nextsibling].nextsibling;
		dictionary[nextsibling].nextsibling = next_code;
	}
	else {// 当前前缀无孩子,则新节点为它的第一个孩子
		dictionary[string_code].firstchild = next_code;
	}
	next_code++;
}

（3）查找词典中是否有字符串

int InDictionary( int character, int string_code){
	int sibling;
	if( 0>string_code) return character;//当前词条无前缀，为单个字符，初始化后已经在词典中，返回此字符
	sibling = dictionary[string_code].firstchild;//如果不是单个字符，找当前前缀的第一个孩子节点
	while( -1<sibling){
		if( character == dictionary[sibling].suffix) return sibling;//如果此孩子节点的尾缀字符等于character，则当前词条在词典中，返回此孩子节点
		sibling = dictionary[sibling].nextsibling;//否则，找当前前缀的下一个孩子节点
	}
	return -1;//没有找到，返回-1
}

（4）编码

void LZWEncode( FILE *fp, BITFILE *bf){
	int character;//当前字符C
	int string_code;//前缀P
	int index;//索引
	unsigned long file_length;//文件长度

	fseek( fp, 0, SEEK_END);//文件指针置于文件末尾
	file_length = ftell( fp);//获取文件长度
	fseek( fp, 0, SEEK_SET);//文件指针置于文件头
	BitsOutput( bf, file_length, 4*8);
	InitDictionary();//初始化词典
	string_code = -1;
	while( EOF!=(character=fgetc( fp))){
		index = InDictionary( character, string_code);
		if( 0<=index){	//如果当前字符串存在词典中
			string_code = index;//将前缀赋值为当前字符串
		}else{	// 如果当前字符串不在词典中
			output( bf, string_code);//输出当前前缀对应的码字
			if( MAX_CODE > next_code){	// 判断词典是否有空间
				// 将当前字符串添加到词典中
				AddToDictionary( character, string_code);
			}
			string_code = character;//令P=C
		}
	}
	output( bf, string_code);//若最后一个前缀无下一个字符，则输出其对应的码字
}

（5）解码

void LZWDecode( BITFILE *bf, FILE *fp){
	int character;
	int new_code, last_code;
	int phrase_length;
	unsigned long file_length;

	file_length = BitsInput( bf, 4*8);
	if( -1 == file_length) file_length = 0;
	InitDictionary();//初始化字典
	last_code = -1;//pw=-1;
	while( 0<file_length)
	{
		new_code = input( bf);//读入一个代号，new_code指cw
		if( new_code >= next_code)//如果读入的代号比字典最大代号还大，即该代号不在字典里
		{ // this is the case CSCSC( not in dict)
			d_stack[0] = character;//pw的代号character存入输出字符串最后一个字符中
			phrase_length = DecodeString( 1, last_code);//解出字符，将pw写入d_stack
		}
		else//如果代号存在于字典中
		{
			phrase_length = DecodeString( 0, new_code);//解出字符，将pw写入d_stack
		}
		character = d_stack[phrase_length-1];//将cw首字符存入character
		while( 0<phrase_length)
		{
			phrase_length --;
			fputc( d_stack[ phrase_length], fp);//输出当前码字对应的字符串
			file_length--;
		}
		if( MAX_CODE>next_code)//如果字典还有空间
		{// add the new phrase to dictionary
			AddToDictionary( character, last_code);//将P+C写入字典
		}
		last_code = new_code;//更新词条数
	}
}

3.完整代码

（1）bitio.h

/*
 * Declaration for bitwise IO
 *
 * vim: ts=4 sw=4 cindent
 */
#ifndef __BITIO__
#define __BITIO__

#include <stdio.h>

typedef struct{
	FILE *fp;
	unsigned char mask;
	int rack;
}BITFILE;

BITFILE *OpenBitFileInput( char *filename);
BITFILE *OpenBitFileOutput( char *filename);
void CloseBitFileInput( BITFILE *bf);
void CloseBitFileOutput( BITFILE *bf);
int BitInput( BITFILE *bf);
unsigned long BitsInput( BITFILE *bf, int count);
void BitOutput( BITFILE *bf, int bit);
void BitsOutput( BITFILE *bf, unsigned long code, int count);
#endif	// __BITIO__

（2）bitio.c

/*
 * Definitions for bitwise IO
 *
 * vim: ts=4 sw=4 cindent
 */

#include <stdlib.h>
#include <stdio.h>
#include "bitio.h"
BITFILE *OpenBitFileInput( char *filename){
	BITFILE *bf;
	bf = (BITFILE *)malloc( sizeof(BITFILE));
	if( NULL == bf) return NULL;
	if( NULL == filename)	bf->fp = stdin;
	else bf->fp = fopen( filename, "rb");
	if( NULL == bf->fp) return NULL;
	bf->mask = 0x80;
	bf->rack = 0;
	return bf;
}

BITFILE *OpenBitFileOutput( char *filename){
	BITFILE *bf;
	bf = (BITFILE *)malloc( sizeof(BITFILE));
	if( NULL == bf) return NULL;
	if( NULL == filename)	bf->fp = stdout;
	else bf->fp = fopen( filename, "wb");
	if( NULL == bf->fp) return NULL;
	bf->mask = 0x80;
	bf->rack = 0;
	return bf;
}

void CloseBitFileInput( BITFILE *bf){
	fclose( bf->fp);
	free( bf);
}

void CloseBitFileOutput( BITFILE *bf){
	// Output the remaining bits
	if( 0x80 != bf->mask) fputc( bf->rack, bf->fp);
	fclose( bf->fp);
	free( bf);
}

int BitInput( BITFILE *bf){
	int value;

	if( 0x80 == bf->mask){
		bf->rack = fgetc( bf->fp);
		if( EOF == bf->rack){
			fprintf(stderr, "Read after the end of file reached\n");
			exit( -1);
		}
	}
	value = bf->mask & bf->rack;
	bf->mask >>= 1;
	if( 0==bf->mask) bf->mask = 0x80;
	return( (0==value)?0:1);
}

unsigned long BitsInput( BITFILE *bf, int count){
	unsigned long mask;
	unsigned long value;
	mask = 1L << (count-1);
	value = 0L;
	while( 0!=mask){
		if( 1 == BitInput( bf))
			value |= mask;
		mask >>= 1;
	}
	return value;
}

void BitOutput( BITFILE *bf, int bit){
	if( 0 != bit) bf->rack |= bf->mask;
	bf->mask >>= 1;
	if( 0 == bf->mask){	// eight bits in rack
		fputc( bf->rack, bf->fp);
		bf->rack = 0;
		bf->mask = 0x80;
	}
}

void BitsOutput( BITFILE *bf, unsigned long code, int count){
	unsigned long mask;

	mask = 1L << (count-1);
	while( 0 != mask){
		BitOutput( bf, (int)(0==(code&mask)?0:1));
		mask >>= 1;
	}
}
#if 0
int main( int argc, char **argv){
	BITFILE *bfi, *bfo;
	int bit;
	int count = 0;

	if( 1<argc){
		if( NULL==OpenBitFileInput( bfi, argv[1])){
			fprintf( stderr, "fail open the file\n");
			return -1;
		}
	}else{
		if( NULL==OpenBitFileInput( bfi, NULL)){
			fprintf( stderr, "fail open stdin\n");
			return -2;
		}
	}
	if( 2<argc){
		if( NULL==OpenBitFileOutput( bfo, argv[2])){
			fprintf( stderr, "fail open file for output\n");
			return -3;
		}
	}else{
		if( NULL==OpenBitFileOutput( bfo, NULL)){
			fprintf( stderr, "fail open stdout\n");
			return -4;
		}
	}
	while( 1){
		bit = BitInput( bfi);
		fprintf( stderr, "%d", bit);
		count ++;
		if( 0==(count&7))fprintf( stderr, " ");
		BitOutput( bfo, bit);
	}
	return 0;
}
#endif

（3）lzw.c

/*
 * Definition for LZW coding 
 *
 * vim: ts=4 sw=4 cindent nowrap
 */
#include <stdlib.h>
#include <stdio.h>
#include "bitio.h"
#define MAX_CODE 65535

struct {
	int suffix;//尾缀字符
	int parent, firstchild, nextsibling;//母节点，第一个孩子节点，下一个兄弟节点
} dictionary[MAX_CODE+1];
int next_code;
int d_stack[MAX_CODE]; // stack for decoding a phrase

#define input(f) ((int)BitsInput( f, 16))
#define output(f, x) BitsOutput( f, (unsigned long)(x), 16)

int DecodeString( int start, int code);
void InitDictionary( void);
void PrintDictionary( void){
	int n;
	int count;
	for( n=256; n<next_code; n++){
		count = DecodeString( 0, n);
		printf( "%4d->", n);
		while( 0<count--) printf("%c", (char)(d_stack[count]));
		printf( "\n");
	}
}

int DecodeString( int start, int code){
	//需填充
	int count;
	count = start;
	while (0 <= code)
	{
		d_stack[count] = dictionary[code].suffix;
		code = dictionary[code].parent;
		count++;
	}
	return count;
}
void InitDictionary( void){
	int i;

	for( i=0; i<256; i++){
		dictionary[i].suffix = i;
		dictionary[i].parent = -1;
		dictionary[i].firstchild = -1;
		dictionary[i].nextsibling = i+1;
	}
	dictionary[255].nextsibling = -1;
	next_code = 256;
}
/*
 * Input: string represented by string_code in dictionary,
 * Output: the index of character+string in the dictionary
 * 		index = -1 if not found
 */
int InDictionary( int character, int string_code){
	int sibling;
	if( 0>string_code) return character;
	sibling = dictionary[string_code].firstchild;
	while( -1<sibling){
		if( character == dictionary[sibling].suffix) return sibling;
		sibling = dictionary[sibling].nextsibling;
	}
	return -1;
}

void AddToDictionary( int character, int string_code){
	int firstsibling, nextsibling;
	if( 0>string_code) return;
	dictionary[next_code].suffix = character;
	dictionary[next_code].parent = string_code;
	dictionary[next_code].nextsibling = -1;
	dictionary[next_code].firstchild = -1;
	firstsibling = dictionary[string_code].firstchild;
	if( -1<firstsibling){	// the parent has child
		nextsibling = firstsibling;
		while( -1<dictionary[nextsibling].nextsibling ) 
			nextsibling = dictionary[nextsibling].nextsibling;
		dictionary[nextsibling].nextsibling = next_code;
	}else{// no child before, modify it to be the first
		dictionary[string_code].firstchild = next_code;
	}
	next_code ++;
}

void LZWEncode( FILE *fp, BITFILE *bf){
	int character;
	int string_code;
	int index;
	unsigned long file_length;

	fseek( fp, 0, SEEK_END);
	file_length = ftell( fp);
	fseek( fp, 0, SEEK_SET);
	BitsOutput( bf, file_length, 4*8);
	InitDictionary();
	string_code = -1;
	while( EOF!=(character=fgetc( fp))){
		index = InDictionary( character, string_code);
		if( 0<=index){	// string+character in dictionary
			string_code = index;
		}else{	// string+character not in dictionary
			output( bf, string_code);
			if( MAX_CODE > next_code){	// free space in dictionary
				// add string+character to dictionary
				AddToDictionary( character, string_code);
			}
			string_code = character;
		}
	}
	output( bf, string_code);
}

void LZWDecode( BITFILE *bf, FILE *fp){
	//需填充
	int character;
	int new_code, last_code;
	int phrase_length;
	unsigned long file_length;

	file_length = BitsInput(bf, 4 * 8);
	if (-1 == file_length)
	{
		file_length = 0;
	}
	InitDictionary();
	last_code = -1;
	while (file_length > 0)
	{
		new_code = input(bf);
		if (new_code >= next_code)
		{
			d_stack[0] = character;
			phrase_length = DecodeString(1, last_code);
		}
		else
		{
			phrase_length = DecodeString(0, new_code);
		}
		character = d_stack[phrase_length - 1];
		while (0 < phrase_length)
		{
			phrase_length--;
			fputc(d_stack[phrase_length], fp);
			file_length--;
		}
		if (MAX_CODE > next_code)
		{
			AddToDictionary(character, last_code);
		}
		last_code = new_code;
	}
}



int main( int argc, char **argv){
	FILE *fp;
	BITFILE *bf;
	/*传参格式：argv[1]:字母D或E表示解码或者编码
			 argv[2]:输入文件的名称和路径
			 argv[3]:输出文件的名称和路径 */

	if( 4>argc){
		fprintf( stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]);
		fprintf( stdout, "\t<o>: E or D reffers encode or decode\n");
		fprintf( stdout, "\t<ifile>: input file name\n");
		fprintf( stdout, "\t<ofile>: output file name\n");
		return -1;
	}
	if( 'E' == argv[1][0]){ // 编码
		fp = fopen( argv[2], "rb");
		bf = OpenBitFileOutput( argv[3]);
		if( NULL!=fp && NULL!=bf){
			LZWEncode( fp, bf);
			fclose( fp);
			CloseBitFileOutput( bf);
			fprintf( stdout, "encoding done\n");
		}
		printf("Encode dictionary:\n");
		PrintDictionary();
	}else if( 'D' == argv[1][0]){	// 解码
		bf = OpenBitFileInput( argv[2]);
		fp = fopen( argv[3], "wb");
		if( NULL!=fp && NULL!=bf){
			LZWDecode( bf, fp);
			fclose( fp);
			CloseBitFileInput( bf);
			fprintf( stdout, "decoding done\n");
		}
		printf("Decode dictionary:\n");
		PrintDictionary();
	}else{	// otherwise
		fprintf( stderr, "not supported operation\n");
	}
	return 0;
}

四、实验步骤

1. 调试编码程序

新建test.txt文件，其内容为abbababac，将编码后的文件命名为test_en.txt，并输出编码后词典中新增的内容。

2. 调试解码程序

对test_en.txt文件进行解码，将解码后的文件命名为test_de.txt，并输出解码后词典中新增的内容。

3. 压缩效率分析

十种不同格式类型的原始文件：

进行LZW编码后：

压缩比统计：

文件类型	原始大小	压缩后大小	压缩比
.webp	6KB	12KB	2
.jpg	135KB	179KB	1.326
.mp3	1576KB	1916KB	1.216
.docx	85KB	110KB	1.294
.pdf	795KB	959KB	1.206
.mobi	655KB	633KB	0.966
.mp4	8202KB	9988KB	1.218
.ppt	1739	2100	1.208
.png	90	59	0.656
.yuv	768	557	0.725

对于部分文件类型，经过LZW编码后，文件反而变大了，这是因为这些文件本身就是经过一定压缩后得到的，所以使用LZW编码反而增加了冗余。而对于.yuv这种比较原始，没有经过处理的文件格式来说，经过LZW编码后文件大小大大地减小了。

5、LZW编解码

一、实验目的

二、实验原理

1. LZW编码原理