CS144 lab1 笔记 📒

Lab Checkpoint 1: stitching substrings into a byte stream

In Lab 1, you’ll implement a stream reassembler—a module that stitches small pieces of the byte stream (known as substrings, or segments) back into a contiguous stream of bytes in the correct sequence.

3 Putting substrings in sequence

存储 substring 的数据结构选择的是红黑树(std::map)

key: index
value: substring

空间复杂度为\(O(\log n)\)，插入和删除的时间复杂度为\(O(\log n)\)，遍历时间复杂度为\(O(n)\)。

把接收的 substring 情况先大致分为下面几种，其中先要判断边界条件，窗口外的部分需要丢弃。

对于与 capacity 有重叠部分的 substring 简单分成两种情况：

index > first unread
index <= first unread

然后对end index进行判断，以此确定 substring 需要的区间。

对该所需区间与已存取的区间进行合并(merge)，判断是否能够 push substring。若能则更新几个指针参数(first unread, first unacceptable, etc)。

还需要注意的是 eof 的判断，然后就是实现合并算法和更新数据结构的策略，最后~~面向测试用例~~debug 了。

StreamReassembler::StreamReassembler(const size_t capacity)
  : _first_unread(0), _first_unassembled(0), _first_unacceptable(capacity), _unassembled_bytes(0), _eof_index(-1),
    _outside_size(0), _eof(false), _auxiliary_storage({}), _output(capacity), _capacity(capacity) {}

//! \details This function accepts a substring (aka a segment) of bytes,
//! possibly out-of-order, from the logical stream, and assembles any newly
//! contiguous substrings and writes them into the output stream in order.
void StreamReassembler::push_substring(const string &data, const size_t index, const bool eof) {
    // DUMMY_CODE(data, index, eof);

    // The number of overlap chars.
    size_t overlap_size = 0;

    // if out of bounds, discard it.
    if(index + data.size() < _first_unread || index > _first_unacceptable){
        return;
    }

    // if in unaassembled area, store it.
    if(index > _first_unread){
        string substring = data.substr(0, min(_first_unacceptable - index, data.size()));
        // if never stored before
        if(_auxiliary_storage.find(index) == _auxiliary_storage.end()){
            _auxiliary_storage[index] = substring;
            overlap_size = overlap_length();
            if(_auxiliary_storage.find(index) == _auxiliary_storage.end()){  // if deleted after merge
                _unassembled_bytes +=  _auxiliary_storage[index].size() - _outside_size;
            }else{
                _unassembled_bytes +=  _auxiliary_storage[index].size() - overlap_size; // c bcd ->error
            }
        }else if(substring.size() > _auxiliary_storage[index].size()){  // update better(longer) substring
            size_t before_len = _auxiliary_storage[index].size();
            _auxiliary_storage[index] = substring;
            overlap_size = overlap_length();
            _unassembled_bytes += before_len + overlap_size;
        }
    }

    //if includes read & unread area, take unread area.
    if(index <= _first_unread){
        size_t end_index = min(data.size() - (_first_unread - index), _capacity);
        _auxiliary_storage[_first_unread] = data.substr(_first_unread - index, end_index);
        size_t len_before = _auxiliary_storage[_first_unread].size();
        overlap_size = overlap_length();
        _unassembled_bytes += len_before - overlap_size; // index -> _first_unread
    }

    // judge if it is possible to push substring
    map<size_t, string>::const_iterator iter = _auxiliary_storage.begin();
    if(iter->first == _first_unread){
        string to_be_written = _auxiliary_storage[_first_unread];
        size_t accepted_size = _output.write(to_be_written);
        _auxiliary_storage.erase(_first_unread); // important!!
        _first_unread += accepted_size;
        _first_unacceptable = _first_unread + _capacity;
        if(_unassembled_bytes < accepted_size){
            _unassembled_bytes = 0;
        } else {
            _unassembled_bytes -= accepted_size;
        }
    }

    if(eof){
        _eof_index = index + data.size();
    }

    if(_eof_index <= _first_unread){
        _output.end_input();
    }
}

size_t StreamReassembler::unassembled_bytes() const { return _unassembled_bytes; }

bool StreamReassembler::empty() const { return _unassembled_bytes == 0; }

size_t StreamReassembler::overlap_length() {
    size_t total_length = 0;
    std::map<size_t, string>::const_iterator iter1 = _auxiliary_storage.begin();
    std::map<size_t, string>::const_iterator iter2 = ++_auxiliary_storage.begin(); // iter next to iter1
    while(iter2 != _auxiliary_storage.end()){
        size_t overlap_size;
        size_t idx1 = iter1->first, idx2 = iter2->first;
        string s1 = iter1->second, s2 = iter2->second;
        if(idx1 + s1.size() >= idx2){  // overlap or neighbours
            iter2++;
            overlap_size = merge_substrings(idx1, idx2, s1, s2);  // _auxiliary_storage[idx2] would be erased
            if(overlap_size == 0){
                if(idx1 + s1.size() == idx2){ // neighbours
                    continue;
                }else{  // previous iter2 has been deleted
                    iter1++;
                    if(iter2 != _auxiliary_storage.end()){
                        iter2++;
                    }else{
                        break;
                    }
                }
            }else{  // overlap
                total_length += overlap_size;
            }
        }else{
            iter1++;
            iter2++;
        }
    }
    return total_length;
}

size_t StreamReassembler::merge_substrings(const size_t idx1, const size_t idx2, const string &data1, const string &data2) {
    size_t overlap = 0;
    // neighbour
    if(idx1 + data1.size() == idx2){
        string data = data1 + data2;
        _auxiliary_storage.erase(idx2);
        _auxiliary_storage[idx1] = data;
        _outside_size = data2.size();
        return overlap;
    }

    // overlap
    // 1. include
    if(idx1+ data1.size() >= idx2 + data2.size()){
        overlap = data2.size();
        _outside_size = 0;
    }else{ // 2. intersection
        overlap = idx1 + data1.size() - idx2;
        _outside_size = data2.size() - overlap;
        string data = data1 + data2.substr(overlap, _outside_size);
        _auxiliary_storage[idx1] = data;
    }
    _auxiliary_storage.erase(idx2);
    return overlap;
}

courses

#notes #CS144 #C++

CS144 lab1 笔记 📒

http://example.com/2023/11/14/CS144-note-lab1/

作者

臣皮蛋

发布于

2023年11月14日

许可协议

CS144 lab2 笔记 📒 上一篇

CS144 lab0 笔记 📒 下一篇