Lempel–Ziv–Storer–Szymanski
一种压缩算法
每八个东西前有个字节标记这8个是啥。对应位是1的话,就是原来的字节。对应位是0,则这个东西占俩字节,引用窗口里的一段东西。
标准LZSS算法
初始窗口指针=0xFEE // 恶心!
初始窗口清空。
读标记
对每一位:
1->原样输出一个字节,附加到窗口之后。
0->读俩字节,取第二字节高四位拼上第一个字节作为窗口内开始位置,第二字节低四位+3(0,1,2不可能……)作为长度,输出窗口内容,同时也附加到窗口之后。
窗口大小0x1000
变种
- 取第一字节拼上第二字节高三位作为窗口内开始位置,第二字节低五位+2作为长度,窗口大小0x800
revision
第一版用的read和write,系统调用占了大部分时间…… 现在用fread和fwrite了
代码
#include <fcntl.h>
#include <iostream>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
using namespace std;
int get_char(FILE *f, unsigned char *ch)
{
int cnt = fread(ch, 1, 1, f);
return cnt;
}
int put_char(FILE *f, unsigned char *ch)
{
int cnt = fwrite(ch, 1, 1, f);
return cnt;
}
string dump(char ch)
{
char buf[3];
snprintf(buf, 3, "%02x", (unsigned char)ch);
return string(buf);
}
string dumpb(char ch)
{
char buf[10];
buf[8] = 0;
for (int i=0; i<8; i++)
if (ch & (1 << i))
buf[7-i] = '1';
else
buf[7-i] = '0';
return string(buf);
}
int uncompress(FILE* fin, FILE* fout, int from_bits)
{
// example:
// from_bits = 3
// len_mask = 0x1f : 0001 1111
// from_mask = 0xe0 : 1110 0000
// ^^^ 3
int window_size = 0x100 << from_bits;
unsigned char len_mask = (1 << (8 - from_bits)) - 1;
unsigned char from_mask = 0xFF - len_mask;
// cout << hex << window_size << dec << '/' << dump(len_mask) << '/' << dump(from_mask) << endl;
unsigned char window[window_size];
int win_pos = from_bits == 4 ? 0xFEE : 0;
int out_len = 0;
int in_len = 0;
int br_cnt = 0;
memset(window, 0, window_size);
unsigned char xbuf[8];
while (true)
{
unsigned char flag;
int cnt = get_char(fin, &flag);
if (cnt == 0) break;
int index = 0;
for (int index = 0; index < 8; index++, flag >>= 1)
{
if (flag & 1)
{
out_len++;
unsigned char ch;
cnt = get_char(fin, &ch);
if (cnt == 0) break;
put_char(fout, &ch);
window[win_pos] = ch;
win_pos = (win_pos + 1) % window_size;
// cout << dump(ch) << " ";
// if (isprint(ch))
// cout << ch << ' ';
in_len++;
} else {
br_cnt++;
unsigned char lead, offset;
cnt = get_char(fin, &offset);
if (cnt == 0) break;
cnt = get_char(fin, &lead);
if (cnt == 0) break;
// cout << endl << "backref: pos=" << hex << in_len << ", dpos=" << out_len << ", lead=" << hex << (int)lead << ", offset=" << (int)offset << dec << '\t' << dumpb(lead) << "/" << dumpb(offset) << endl;
int copy_len = (lead & len_mask) + 3;
int copy_from = ((lead & from_mask) << from_bits) | (offset & 0xff);
// cout << "backref: len=" << hex << copy_len << ", from=" << copy_from << dec << endl;
for (int i=0; i<copy_len; i++)
{
int ret = put_char(fout, &window[copy_from]);
if (ret != 1)
cerr << "writeout failed! " << errno << endl;
window[win_pos] = window[copy_from];
copy_from = (copy_from + 1) % window_size;
win_pos = (win_pos + 1) % window_size;
}
out_len += copy_len;
in_len += 2;
}
}
if (cnt == 0) break;
}
return out_len;
}
int main(int argc, char **argv)
{
if (argc < 3)
{
cerr << "usage: " << argv[0] << " <infile> <outfile> [<skip>]" << endl;
return 1;
}
FILE * fin = fopen(argv[1], "rb");
FILE * fout = fopen(argv[2], "wb");
if (fin == NULL || fout == NULL)
{
cerr << "file open error! " << errno << endl;
return errno;
}
if (argc > 3)
{
int skip = atoi(argv[3]);
char buf[1024];
while (skip > 0)
{
int cnt = fread(buf, 1, skip > 1024 ? 1024 : skip, fin);
if (cnt <= 0)
{
cerr << "skip error! " << errno << endl;
return 3;
}
skip -= cnt;
}
}
int from_bits = 4;
if (argc > 4)
from_bits = atoi(argv[4]);
int out_len = uncompress(fin, fout, from_bits);
cout << "result len: " << out_len << endl;
fclose(fin);
fclose(fout);
}