CS144 lab1 笔记 📒

Lab Checkpoint 1: stitching substrings into a byte stream

In Lab 1, you’ll implement a stream reassembler—a module that stitches small pieces of the byte stream (known as substrings, or segments) back into a contiguous stream of bytes in the correct sequence.

3 Putting substrings in sequence

存储 substring 的数据结构选择的是红黑树(std::map)

  • key: index
  • value: substring

空间复杂度为\(O(\log n)\),插入和删除的时间复杂度为\(O(\log n)\),遍历时间复杂度为\(O(n)\)

把接收的 substring 情况先大致分为下面几种,其中先要判断边界条件,窗口外的部分需要丢弃。

对于与 capacity 有重叠部分的 substring 简单分成两种情况:

  1. index > first unread
  2. index <= first unread

然后对end index进行判断,以此确定 substring 需要的区间。

对该所需区间与已存取的区间进行合并(merge),判断是否能够 push substring。若能则更新几个指针参数(first unread, first unacceptable, etc)。

还需要注意的是 eof 的判断,然后就是实现合并算法和更新数据结构的策略,最后面向测试用例debug 了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
StreamReassembler::StreamReassembler(const size_t capacity)
: _first_unread(0), _first_unassembled(0), _first_unacceptable(capacity), _unassembled_bytes(0), _eof_index(-1),
_outside_size(0), _eof(false), _auxiliary_storage({}), _output(capacity), _capacity(capacity) {}

//! \details This function accepts a substring (aka a segment) of bytes,
//! possibly out-of-order, from the logical stream, and assembles any newly
//! contiguous substrings and writes them into the output stream in order.
void StreamReassembler::push_substring(const string &data, const size_t index, const bool eof) {
// DUMMY_CODE(data, index, eof);

// The number of overlap chars.
size_t overlap_size = 0;

// if out of bounds, discard it.
if(index + data.size() < _first_unread || index > _first_unacceptable){
return;
}

// if in unaassembled area, store it.
if(index > _first_unread){
string substring = data.substr(0, min(_first_unacceptable - index, data.size()));
// if never stored before
if(_auxiliary_storage.find(index) == _auxiliary_storage.end()){
_auxiliary_storage[index] = substring;
overlap_size = overlap_length();
if(_auxiliary_storage.find(index) == _auxiliary_storage.end()){ // if deleted after merge
_unassembled_bytes += _auxiliary_storage[index].size() - _outside_size;
}else{
_unassembled_bytes += _auxiliary_storage[index].size() - overlap_size; // c bcd ->error
}
}else if(substring.size() > _auxiliary_storage[index].size()){ // update better(longer) substring
size_t before_len = _auxiliary_storage[index].size();
_auxiliary_storage[index] = substring;
overlap_size = overlap_length();
_unassembled_bytes += before_len + overlap_size;
}
}

//if includes read & unread area, take unread area.
if(index <= _first_unread){
size_t end_index = min(data.size() - (_first_unread - index), _capacity);
_auxiliary_storage[_first_unread] = data.substr(_first_unread - index, end_index);
size_t len_before = _auxiliary_storage[_first_unread].size();
overlap_size = overlap_length();
_unassembled_bytes += len_before - overlap_size; // index -> _first_unread
}

// judge if it is possible to push substring
map<size_t, string>::const_iterator iter = _auxiliary_storage.begin();
if(iter->first == _first_unread){
string to_be_written = _auxiliary_storage[_first_unread];
size_t accepted_size = _output.write(to_be_written);
_auxiliary_storage.erase(_first_unread); // important!!
_first_unread += accepted_size;
_first_unacceptable = _first_unread + _capacity;
if(_unassembled_bytes < accepted_size){
_unassembled_bytes = 0;
} else {
_unassembled_bytes -= accepted_size;
}
}

if(eof){
_eof_index = index + data.size();
}

if(_eof_index <= _first_unread){
_output.end_input();
}
}

size_t StreamReassembler::unassembled_bytes() const { return _unassembled_bytes; }

bool StreamReassembler::empty() const { return _unassembled_bytes == 0; }

size_t StreamReassembler::overlap_length() {
size_t total_length = 0;
std::map<size_t, string>::const_iterator iter1 = _auxiliary_storage.begin();
std::map<size_t, string>::const_iterator iter2 = ++_auxiliary_storage.begin(); // iter next to iter1
while(iter2 != _auxiliary_storage.end()){
size_t overlap_size;
size_t idx1 = iter1->first, idx2 = iter2->first;
string s1 = iter1->second, s2 = iter2->second;
if(idx1 + s1.size() >= idx2){ // overlap or neighbours
iter2++;
overlap_size = merge_substrings(idx1, idx2, s1, s2); // _auxiliary_storage[idx2] would be erased
if(overlap_size == 0){
if(idx1 + s1.size() == idx2){ // neighbours
continue;
}else{ // previous iter2 has been deleted
iter1++;
if(iter2 != _auxiliary_storage.end()){
iter2++;
}else{
break;
}
}
}else{ // overlap
total_length += overlap_size;
}
}else{
iter1++;
iter2++;
}
}
return total_length;
}

size_t StreamReassembler::merge_substrings(const size_t idx1, const size_t idx2, const string &data1, const string &data2) {
size_t overlap = 0;
// neighbour
if(idx1 + data1.size() == idx2){
string data = data1 + data2;
_auxiliary_storage.erase(idx2);
_auxiliary_storage[idx1] = data;
_outside_size = data2.size();
return overlap;
}

// overlap
// 1. include
if(idx1+ data1.size() >= idx2 + data2.size()){
overlap = data2.size();
_outside_size = 0;
}else{ // 2. intersection
overlap = idx1 + data1.size() - idx2;
_outside_size = data2.size() - overlap;
string data = data1 + data2.substr(overlap, _outside_size);
_auxiliary_storage[idx1] = data;
}
_auxiliary_storage.erase(idx2);
return overlap;
}

CS144 lab1 笔记 📒
http://example.com/2023/11/14/CS144-note-lab1/
作者
臣皮蛋
发布于
2023年11月14日
许可协议