LZW Decompression

本文关键字：Decompression LZW 更新时间：2023-10-16

我正在C++中实现LZW算法。

字典的大小由用户输入，但最小值为 256，这样它也能用于二进制文件。如果写到了字典的末尾，索引会回绕到 0，并从那里开始覆盖已有的条目。

例如，如果我输入一个爱丽丝梦游仙境脚本并用字典大小 512 压缩它，我就会得到这个字典。

但是我在解压缩方面遇到了问题，解压缩压缩文件的输出字典如下所示。

我的解压缩代码看起来像这样

// One dictionary slot used by decompress().
struct dictionary
{
    std::vector<unsigned char> entry; // byte string this slot decodes to; empty while the slot is unused
    std::vector<bool> bits;           // bit pattern of the code that selects this slot
};

// Decodes an LZW bit stream and writes the decoded bytes to std::cout.
//
// dict            array of dictionarySize slots. Every slot's `bits` must already hold its
//                 code pattern (numberOfBits wide) and slots 0..255 must hold the single-byte
//                 entries. Slots are overwritten as new strings are learned.
// file            the compressed stream, one code every numberOfBits bits. Trailing bits that
//                 do not form a whole code are ignored.
// dictionarySize  number of slots in dict. Once the last slot has been written, new entries
//                 wrap around to slot 0 and overwrite older ones.
// numberOfBits    width of one code in bits (e.g. 9 for a 512-slot dictionary).
//
// Decoding stops early if the very first code does not refer to a defined entry.
void decompress(dictionary dict[], std::vector<bool> file, int dictionarySize, int numberOfBits)
{
    if (!dict || dictionarySize <= 0 || numberOfBits <= 0)
    {
        return;
    }

    typedef std::vector<bool>::size_type BitIndex;
    const BitIndex codeWidth = static_cast<BitIndex>(numberOfBits);

    std::vector<bool> currentCode;
    std::vector<unsigned char> currentString;
    std::vector<unsigned char> previous; // string decoded from the previous code
    int next = 256;                      // slot that receives the next learned string

    // Only whole codes are consumed, so padding bits at the end are never indexed.
    for (BitIndex pos = 0; pos + codeWidth <= file.size(); pos += codeWidth)
    {
        currentCode.clear();
        for (BitIndex b = 0; b < codeWidth; ++b)
        {
            currentCode.push_back(file[pos + b]);
        }

        // Translate the bit pattern into a slot index. Patterns are expected to be unique;
        // scanning downwards means the highest slot is used should two patterns coincide.
        int code = -1;
        for (int j = dictionarySize - 1; j >= 0; --j)
        {
            if (currentCode == dict[j].bits)
            {
                code = j;
                break;
            }
        }

        // Slot the next learned string goes to (wraps to 0 past the last slot).
        const int slot = (next > dictionarySize - 1) ? 0 : next;

        // The decoder is always one entry behind the encoder. A code equal to `slot` therefore
        // names the entry the encoder has just created and the decoder has not written yet.
        // After a wrap-around that slot still holds a stale string, so testing for an empty
        // entry is not enough; the index itself has to be compared.
        const bool pending = !previous.empty() && code == slot;
        const bool known = code >= 0 && !pending && !dict[code].entry.empty();

        if (known)
        {
            currentString = dict[code].entry;
        }
        else
        {
            if (previous.empty())
            {
                return; // nothing to build the missing entry from: the stream is not decodable
            }
            // The undefined entry is always the previous string plus its own first byte.
            currentString = previous;
            currentString.push_back(previous[0]);
        }

        for (std::vector<unsigned char>::size_type k = 0; k < currentString.size(); ++k)
        {
            std::cout << currentString[k];
        }

        // Learn previous + first byte of the current string (nothing to learn on the first code).
        if (!previous.empty())
        {
            previous.push_back(currentString[0]);
            dict[slot].entry = previous;
            next = slot + 1;
        }
        previous = currentString;
    }
}

我目前被困住了，不知道该修改哪里才能修正输出。我还注意到，如果把字典设得足够大，使它不会发生回绕(即不会写到末尾再从 0 重新开始)，解压缩就能正常工作。

从小处开始

您用来调试的文件数据量太大，不便于调试。请先从很短的字符串开始。我从 Wiki 上取了这个很好的例子：

Input: "abacdacacadaad"
step    input           match   output  new_entry   new_index
a           0
b           1
c           2
d           3
1       abacdacacadaad  a       0       ab          4
2       bacdacacadaad   b       1       ba          5
3       acdacacadaad    a       0       ac          6
4       cdacacadaad     c       2       cd          7
5       dacacadaad      d       3       da          8
6       acacadaad       ac      6       aca         9
7       acadaad         aca     9       acad        10
8       daad            da      8       daa         11
9       ad              a       0       ad          12
10      d               d       3       
Output: "0102369803"

因此，您可以使用交叉匹配输入/输出和字典内容逐步调试代码。正确完成此操作后，您可以对解码执行相同的操作：

Input: "0102369803"
step    input   output  new_entry   new_index
a           0
b           1
c           2
d           3
1       0       a       
2       1       b       ab          4
3       0       a       ba          5
4       2       c       ac          6
5       3       d       cd          7
6       6       ac      da          8
7       9       aca     aca         9
8       8       da      acad        10
9       0       a       daa         11
10      3       d       ad          12
Output: "abacdacacadaad"

在这之后，再去处理文件以及字典的清空。

比特流

一旦你成功地完成了小字母表的LZW，你可以尝试使用完整的字母表和位编码。您知道 LZW 流可以以任何位长度(不仅仅是 8/16/32/64 位)进行编码，这会极大地影响压缩率(相对于使用的数据属性)。所以我会尝试以变量(或预定义的位长度)对数据进行全局访问。

有点好奇，所以我编码了一个简单的 C++/VCL 压缩示例：

//---------------------------------------------------------------------------
// LZW configuration shared by the encoder and the decoder
const int LZW_bits=12;              // width in bits of one code in the encoded bitstream
const int LZW_size=1<<LZW_bits;     // dictionary capacity: one slot per possible code value
// bitstream R/W
// Bit accumulator used by both bitstream_read and bitstream_write; the `bit` argument of
// those functions counts how many of its low bits are currently pending.
DWORD bitstream_tmp=0;
//---------------------------------------------------------------------------
// Reads the next `bits`-wide value (most significant bit first) from dat[0..siz) and
// advances the position.
//
// dat,siz  source buffer and its size in bytes
// adr      index of the next unread byte; advanced as bytes are consumed
// bit      number of already loaded but not yet returned bits held in bitstream_tmp
// bits     width of the value to read
//
// Returns the value, or 0 when the buffer ends before `bits` bits are available. That 0 is
// indistinguishable from a real code 0, so callers must check adr/bit before calling.
DWORD bitstream_read(BYTE *dat,int siz,int &adr,int &bit,int bits)
{
    const DWORD m=(DWORD(1)<<bits)-1;   // unsigned shift: no signed overflow for wide codes
    // load whole bytes until a full value is pending
    while (bit<bits)
    {
        // end of data: checked before the access so dat[siz] is never read
        if (adr>=siz) return 0;
        // insert byte
        bitstream_tmp<<=8;
        bitstream_tmp&=0xFFFFFF00;
        bitstream_tmp|=dat[adr]&255;
        adr++; bit+=8;
    }
    // the value is the oldest `bits` pending bits; younger bits stay pending for the next call
    bit-=bits;
    return (bitstream_tmp>>bit)&m;
}
//---------------------------------------------------------------------------
// write LZW_bits from a to dat[adr,bit] and increment position (adr,bit)
// return true if buffer is full
bool bitstream_write(BYTE *dat,int siz,int &adr,int &bit,int bits,DWORD a)
{
a<<=32-bits;        // align to MSB
// save tmp if aligned
if ((adr<siz)&&(bit==32)){ dat[adr]=(bitstream_tmp>>24)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit==24)){ dat[adr]=(bitstream_tmp>>16)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit==16)){ dat[adr]=(bitstream_tmp>> 8)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit== 8)){ dat[adr]=(bitstream_tmp    )&255; adr++; bit-=8; }
// process all bits of a
for (;bits;bits--)
{
// insert bit
bitstream_tmp<<=1;
bitstream_tmp&=0xFFFFFFFE;
bitstream_tmp|=(a>>31)&1;
a<<=1; bit++;
// save tmp if aligned
if ((adr<siz)&&(bit==32)){ dat[adr]=(bitstream_tmp>>24)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit==24)){ dat[adr]=(bitstream_tmp>>16)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit==16)){ dat[adr]=(bitstream_tmp>> 8)&255; adr++; bit-=8; }
if ((adr<siz)&&(bit== 8)){ dat[adr]=(bitstream_tmp    )&255; adr++; bit-=8; }
}
return (adr>=siz);
}
//---------------------------------------------------------------------------
// Prefix test: true when the l0 characters at s0 equal the first l0 characters at s1.
// Always false when s1 (l1 characters) is shorter than s0.
bool str_compare(char *s0,int l0,char *s1,int l1)
{
    if (l0>l1) return false;
    while (l0)
    {
        if (*s0++ != *s1++) return false;
        l0--;
    }
    return true;
}
//---------------------------------------------------------------------------
// Encodes raw with LZW: every matched dictionary string is written as one LZW_bits-wide
// code through bitstream_write, and the packed bytes are returned as an AnsiString.
// The dictionary starts with the 256 single-byte strings and grows by one entry per
// emitted code until it holds LZW_size entries, then it is reset to the first 256.
AnsiString LZW_encode(AnsiString raw)
{
AnsiString lzw="";
int i,j,k,l;
int adr,bit;
DWORD a;
const int siz=32;                   // size in bytes of the bitstream buffer buf
BYTE buf[siz];
AnsiString dict[LZW_size];          // dictionary
int dicts=0;                        // number of dictionary entries currently in use
// init dictionary
for (dicts=0;dicts<256;dicts++) dict[dicts]=char(dicts);    // full 8bit binary alphabet
//  for (dicts=0;dicts<4;dicts++) dict[dicts]=char('a'+dicts);  // test alphabet "a,b,c,d"
l=raw.Length();
adr=0; bit=0;
// i is the 0-based offset of the first not yet encoded character; it advances by the
// length of each match, so the loop has no increment expression
for (i=0;i<l;)
{
// no effect on i (i&i==i); presumably kept as a breakpoint anchor
i&=i;
// find match in dictionary, newest entry first; the first entry that is a prefix of the
// remaining input is taken
// NOTE(review): if no entry matches (only possible when the alphabet does not cover the
// input, e.g. the 4-letter test alphabet) `a` is written uninitialized and i never advances
for (j=dicts-1;j>=0;j--)
if (str_compare(dict[j].c_str(),dict[j].Length(),raw.c_str()+i,l-i))
{
i+=dict[j].Length();
if (i<l)    // add new entry in dictionary (if not end of input)
{
// clear dictionary if full: the entry count drops back to the alphabet and no entry is
// added in this step (the decoder has to reset at the same point)
if (dicts>=LZW_size) dicts=256; // full 8bit binary alphabet
//              if (dicts>=LZW_size) dicts=4;   // test alphabet "a,b,c,d"
else{
dict[dicts]=dict[j]+AnsiString(raw[i+1]); // new entry = match + next character; AnsiString index starts from 1 hence the +1
dicts++;
}
}
a=j; j=-1; break;       // full binary output
//          a='0'+j; j=-1; break;   // test ASCII output
}
// store result to bitstream
if (bitstream_write(buf,siz,adr,bit,LZW_bits,a))
{
// buffer is full: append buf to lzw
k=lzw.Length();
lzw.SetLength(k+adr);
for (j=0;j<adr;j++) lzw[j+k+1]=buf[j];
// reset buf
adr=0;
}
}
if (bit)
{
// store the remaining bits, padded with zero bits
bitstream_write(buf,siz,adr,bit,LZW_bits-bit,0);
}
if (adr)
{
// append the last, partially filled buf to lzw
k=lzw.Length();
lzw.SetLength(k+adr);
for (j=0;j<adr;j++) lzw[j+k+1]=buf[j];
}
return lzw;
}
//---------------------------------------------------------------------------
// Decodes a stream produced by LZW_encode: reads LZW_bits-wide codes with bitstream_read,
// rebuilds the dictionary on the fly and returns the concatenated decoded strings.
// The dictionary handling (initial alphabet, reset point and reset size) has to mirror
// LZW_encode exactly, otherwise everything after the first reset decodes to garbage.
AnsiString LZW_decode(AnsiString lzw)
{
    AnsiString raw="";
    int adr,bit,siz,ix;
    DWORD a;
    AnsiString dict[LZW_size];          // dictionary
    int dicts=0;                        // number of dictionary entries currently in use
    // init dictionary
    for (dicts=0;dicts<256;dicts++) dict[dicts]=char(dicts);    // full 8bit binary alphabet
//  for (dicts=0;dicts<4;dicts++) dict[dicts]=char('a'+dicts);  // test alphabet "a,b,c,d"
    siz=lzw.Length();
    adr=0; bit=0; ix=-1;                // ix = previously decoded code, -1 if there is none
    // keep going while unread bytes remain or a whole code is still pending in the accumulator
    for (adr=0;(adr<siz)||(bit>=LZW_bits);)
    {
        a=bitstream_read(lzw.c_str(),siz,adr,bit,LZW_bits);
//      a-='0';                         // test ASCII input
        // clear dictionary if full; the size after the reset must be the alphabet size,
        // the same value LZW_encode resets to
        if (dicts>=LZW_size){ dicts=256; ix=-1; }   // full 8bit binary alphabet
//      if (dicts>=LZW_size){ dicts=4; ix=-1; }     // test alphabet "a,b,c,d"
        // new dictionary entry = previous string + first character of the current string
        if (ix>=0)
        {
            // a>=dicts: the code names the entry being created right now, whose first
            // character is the first character of the previous string
            if (a>=dicts){ dict[dicts]=dict[ix]+AnsiString(dict[ix][1]); dicts++; }
            else         { dict[dicts]=dict[ix]+AnsiString(dict[a ][1]); dicts++; }
        } ix=a;
        // update decoded output
        raw+=dict[a];
    }
    return raw;
}
//---------------------------------------------------------------------------

并使用// test ASCII input行输出：

txt="abacdacacadaad"
enc="0102369803"
dec="abacdacacadaad"

其中AnsiString是我使用的唯一VCL东西，它只是自我分配的字符串变量，当心它的索引从1开始。

AnsiString s;
s[5]              // character access (1 is first character) 
s.Length()        // returns size
s.c_str()         // returns char*
s.SetLength(size) // resize

所以只需使用您获得的任何字符串...

如果您的环境中没有 BYTE、DWORD，请用 unsigned char 和 unsigned int 代替……

看起来它也适用于长文本(比字典和/或比特流缓冲区大小大)。但是请注意，清除可能在几个不同的代码位置完成，但必须在编码器/解码器中同步，否则清除数据后会损坏。

该示例既可以只使用 "a,b,c,d" 字母表，也可以使用完整的 8 位字母表，当前设置为 8 位。如果要更改它，只需取消注释标有 // test ASCII input 的行(以及对应的 // test alphabet "a,b,c,d" 和 // test ASCII output 行)，并注释掉代码中标有 // full 8bit binary alphabet 的行。

要测试交叉缓冲区和边界，您可以使用：

const int LZW_bits=12;              // encoded bitstream size
const int LZW_size=1<<LZW_bits;     // dictionary size

以及：

const int siz=32;                   // bitstream buffer size in bytes

常数。。。这也会影响性能，因此请根据自己的喜好进行调整。请注意，bitstream_write没有优化，可以大大加快速度......

同样为了调试 4 位对齐编码，我正在使用编码数据的十六进制打印(十六进制字符串是其 ASCII 版本的两倍)，如下所示(忽略 VCL 内容)：

// Round-trip demo: encode, decode, then log the text, the encoded bytes as hex and the
// decoded text, followed by the compression ratio in percent.
AnsiString txt="abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa",enc,dec,hex;
enc=LZW_encode(txt);
dec=LZW_decode(enc);
// convert to hex, two digits per byte; without the cast to unsigned char a byte >= 0x80 is
// sign-extended when promoted to int and prints as FFFFFFxx
hex=""; for (int i=1,l=enc.Length();i<=l;i++) hex+=AnsiString().sprintf("%02X",static_cast<unsigned char>(enc[i]));
// each line is logged inside double quotes
mm_log->Lines->Add("\""+txt+"\"");
mm_log->Lines->Add("\""+hex+"\"");
mm_log->Lines->Add("\""+dec+"\"");
// %% prints a literal percent sign
mm_log->Lines->Add(AnsiString().sprintf("ratio: %i%%",(100*enc.Length()/dec.Length())));

运行结果如下(十六进制行中的 FFFFFF 是值大于 0x7F 的字节作为有符号 char 传给 sprintf 时被符号扩展的结果；先转换为 unsigned char 就不会出现)：

"abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa"
"06106206106306410210510406106410FFFFFF910A10706110FFFFFFD10E06206311110910FFFFFFE11410FFFFFFD0"
"abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa"
ratio: 81%