Lempel–Ziv–Storer–Szymanski

一种压缩算法

每八个东西前有个字节标记这8个是啥。对应位是1的话,就是原来的字节。对应位是0,则这个东西占俩字节,引用窗口里的一段东西。

标准LZSS算法

初始窗口指针=0xFEE // 恶心!

初始窗口清空。

读标记

对每一位:

1->原样输出一个字节,附加到窗口之后。

0->读俩字节,取第二字节高四位拼上第一个字节作为窗口内开始位置,第二字节低四位+3(0,1,2不可能……)作为长度,输出窗口内容,同时也附加到窗口之后。

窗口大小0x1000

变种

  • 取第一字节拼上第二字节高位作为窗口内开始位置,第二字节低位+2作为长度,窗口大小0x800

revision

第一版用的read和write,系统调用占了大部分时间…… 现在用fread和fwrite了

代码

#include <fcntl.h>
#include <iostream>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

using namespace std;

int get_char(FILE *f, unsigned char *ch)
{
	int cnt = fread(ch, 1, 1, f);
	return cnt;
}

int put_char(FILE *f, unsigned char *ch)
{
	int cnt = fwrite(ch, 1, 1, f);
	return cnt;
}

string dump(char ch)
{
    char buf[3];
    snprintf(buf, 3, "%02x", (unsigned char)ch);
    return string(buf);
}

string dumpb(char ch)
{
    char buf[10];
    buf[8] = 0;
    for (int i=0; i<8; i++)
        if (ch & (1 << i))
            buf[7-i] = '1';
        else
            buf[7-i] = '0';
    return string(buf);
}

int uncompress(FILE* fin, FILE* fout, int from_bits)
{
    // example:
    // from_bits = 3
    // len_mask  = 0x1f : 0001 1111
    // from_mask = 0xe0 : 1110 0000
    //                    ^^^ 3
    int window_size = 0x100 << from_bits;
    unsigned char len_mask = (1 << (8 - from_bits)) - 1;
    unsigned char from_mask = 0xFF - len_mask;

//    cout << hex << window_size << dec << '/' << dump(len_mask) << '/' << dump(from_mask) << endl;

    unsigned char window[window_size];
    int win_pos = from_bits == 4 ? 0xFEE : 0;
    int out_len = 0;
    int in_len = 0;
    int br_cnt = 0;
    memset(window, 0, window_size);
	unsigned char xbuf[8];
                
    while (true)
    {
        unsigned char flag;
        int cnt = get_char(fin, &flag);
        if (cnt == 0) break;
        int index = 0;
/*		if (flag == 0xff)
		{
			cnt = 0;
			for (int i=0; i<8; i++)
			{
				cnt += get_char(fin, &xbuf[i]);
			}
			write(fout, xbuf, cnt);
			for (int i=0; i<cnt; i++)
				window[win_pos++] = xbuf[i];
			in_len+=8;
			out_len+=8;
			continue;
		}*/
        for (int index = 0; index < 8; index++, flag >>= 1)
        {
            if (flag & 1)
            {
                out_len++;
                unsigned char ch;
                cnt = get_char(fin, &ch);
                if (cnt == 0) break;
				put_char(fout, &ch);
                window[win_pos] = ch;
                win_pos = (win_pos + 1) % window_size;
//                cout << dump(ch) << " ";
//                if (isprint(ch))
//                    cout << ch << ' ';
                in_len++;
            } else {
                br_cnt++;
                unsigned char lead, offset;
				cnt = get_char(fin, &offset);
                if (cnt == 0) break;
                cnt = get_char(fin, &lead);
                if (cnt == 0) break;
//                cout << endl << "backref: pos=" << hex << in_len << ", dpos=" << out_len << ", lead=" << hex << (int)lead << ", offset=" << (int)offset << dec << '\t' << dumpb(lead) << "/" << dumpb(offset) << endl;
                int copy_len = (lead & len_mask) + 3;
                int copy_from = ((lead & from_mask) << from_bits) | (offset & 0xff);
//                cout << "backref: len=" << hex << copy_len << ", from=" << copy_from << dec << endl;
                for (int i=0; i<copy_len; i++)
                {
					int ret = put_char(fout, &window[copy_from]);
                    if (ret != 1)
                        cerr << "writeout failed! " << errno << endl;
                    window[win_pos] = window[copy_from];
                    copy_from = (copy_from + 1) % window_size;
                    win_pos = (win_pos + 1) % window_size;
                }
                out_len += copy_len;
                in_len += 2;
            }
        }
        if (cnt == 0) break;
    }

    return out_len;
}


int main(int argc, char **argv)
{
    if (argc < 3)
    {
        cerr << "usage: " << argv[0] << " <infile> <outfile> [<skip>]" << endl;
        return 1;
    }

    FILE * fin = fopen(argv[1], "rb");
    FILE * fout = fopen(argv[2], "wb");
    if (fin == NULL || fout == NULL)
    {
        cerr << "file open error! " << errno << endl;
        return errno;
    }

    if (argc > 3)
    {
        int skip = atoi(argv[3]);
        char buf[1024];
        while (skip > 0)
        {
            int cnt = fread(buf, 1, skip > 1024 ? 1024 : skip, fin);
            if (cnt <= 0)
            {
                cerr << "skip error! " << errno << endl;
                return 3;
            }
            skip -= cnt;
        }
    }

    int from_bits = 4;
    if (argc > 4)
        from_bits = atoi(argv[4]);

    int out_len = uncompress(fin, fout, from_bits);
    cout << "result len: " << out_len << endl;
    fclose(fin);
    fclose(fout);
}