Lempel–Ziv–Storer–Szymanski

一种压缩算法

每八个东西前有个字节标记这8个是啥。对应位是1的话,就是原来的字节。对应位是0,则这个东西占俩字节,引用窗口里的一段东西。

标准LZSS算法

初始窗口指针=0xFEE // 恶心!

初始窗口清空。

读标记

对每一位:

1->原样输出一个字节,附加到窗口之后。

0->读俩字节,取第二字节高四位拼上第一个字节作为窗口内开始位置,第二字节低四位+3(0,1,2不可能……)作为长度,输出窗口内容,同时也附加到窗口之后。

窗口大小0x1000

变种

  • 取第一字节拼上第二字节高位作为窗口内开始位置,第二字节低位+2作为长度,窗口大小0x800

代码

#include <fcntl.h>
#include <iostream>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

using namespace std;

string dump(char ch)
{
    char buf[3];
    snprintf(buf, 3, "%02x", (unsigned char)ch);
    return string(buf);
}

string dumpb(char ch)
{
    char buf[10];
    buf[8] = 0;
    for (int i=0; i<8; i++)
        if (ch & (1 << i))
            buf[7-i] = '1';
        else
            buf[7-i] = '0';
    return string(buf);
}

int uncompress(int fin, int fout, int from_bits)
{
    // example:
    // from_bits = 3
    // len_mask  = 0x1f : 0001 1111
    // from_mask = 0xe0 : 1110 0000
    //                    ^^^ 3
    int window_size = 0x100 << from_bits;
    unsigned char len_mask = (1 << (8 - from_bits)) - 1;
    unsigned char from_mask = 0xFF - len_mask;

    cout << hex << window_size << dec << '/' << dump(len_mask) << '/' << dump(from_mask) << endl;

    char window[window_size];
    int win_pos = from_bits == 4 ? 0xFEE : 0;
    int out_len = 0;
    int in_len = 0;
    int br_cnt = 0;
    memset(window, 0, window_size);
                
    while (true)
    {
        char flag;
        int cnt = read(fin, &flag, 1);
        if (cnt == 0) break;
        int index = 0;
        for (int index = 0; index < 8; index++, flag >>= 1)
        {
            if (flag & 1)
            {
                out_len++;
                char ch;
                cnt = read(fin, &ch, 1);
                if (cnt == 0) break;
                write(fout, &ch, 1);
                window[win_pos] = ch;
                win_pos = (win_pos + 1) % window_size;
//                cout << dump(ch) << " ";
//                if (isprint(ch))
//                    cout << ch << ' ';
                in_len++;
            } else {
                br_cnt++;
                unsigned char lead, offset;
                cnt = read(fin, &offset, 1);
                if (cnt == 0) break;
                cnt = read(fin, &lead, 1);
                if (cnt == 0) break;
//                cout << endl << "backref: pos=" << hex << in_len << ", dpos=" << out_len << ", lead=" << hex << (int)lead << ", offset=" << (int)offset << dec << '\t' << dumpb(lead) << "/" << dumpb(offset) << endl;
                int copy_len = (lead & len_mask) + 3;
                int copy_from = ((lead & from_mask) << from_bits) | (offset & 0xff);
//                cout << "backref: len=" << hex << copy_len << ", from=" << copy_from << dec << endl;
                for (int i=0; i<copy_len; i++)
                {
                    int ret = write(fout, &window[copy_from], 1);
                    if (ret != 1)
                        cerr << "writeout failed! " << errno << endl;
                    window[win_pos] = window[copy_from];
                    copy_from = (copy_from + 1) % window_size;
                    win_pos = (win_pos + 1) % window_size;
                }
                out_len += copy_len;
                in_len += 2;
            }
        }
        if (cnt == 0) break;
    }

    return out_len;
}


int main(int argc, char **argv)
{
    if (argc < 3)
    {
        cerr << "usage: " << argv[0] << " <infile> <outfile> [<skip>]" << endl;
        return 1;
    }

    int fin = open(argv[1], O_RDONLY);
    int fout = open(argv[2], O_WRONLY | O_CREAT, 0644);
    if ((fin < 0) || (fout < 0))
    {
        cerr << "file open error! " << errno << endl;
        return errno;
    }

    if (argc > 3)
    {
        int skip = atoi(argv[3]);
        char buf[1024];
        while (skip > 0)
        {
            int cnt = read(fin, buf, skip > 1024 ? 1024 : skip);
            if (cnt <= 0)
            {
                cerr << "skip error! " << errno << endl;
                return 3;
            }
            skip -= cnt;
        }
    }

    int from_bits = 4;
    if (argc > 4)
        from_bits = atoi(argv[4]);

    int out_len = uncompress(fin, fout, from_bits);
    cout << "result len: " << out_len << endl;
    close(fin);
    close(fout);
}