/home/andy/git/oilshell/oil/mycpp/gc_str.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef MYCPP_GC_STR_H |
2 | | #define MYCPP_GC_STR_H |
3 | | |
4 | | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN |
5 | | #include "mycpp/gc_obj.h" // GC_OBJ |
6 | | #include "mycpp/hash.h" // HashFunc |
7 | | |
8 | | template <typename T> |
9 | | class List; |
10 | | |
11 | | class BigStr { |
12 | | public: |
13 | | // Don't call this directly. Call NewStr() instead, which calls this. |
14 | 5.39k | BigStr() { |
15 | 5.39k | } |
16 | | |
17 | 530 | char* data() { |
18 | 530 | return data_; |
19 | 530 | } |
20 | | |
21 | | // Call this after writing into buffer created by OverAllocatedStr() |
22 | | void MaybeShrink(int str_len); |
23 | | |
24 | | BigStr* at(int i); |
25 | | |
26 | | int find(BigStr* needle, int pos = 0); |
27 | | int rfind(BigStr* needle); |
28 | | |
29 | | BigStr* slice(int begin); |
30 | | BigStr* slice(int begin, int end); |
31 | | BigStr* slice(int begin, int end, int step); |
32 | | |
33 | | BigStr* strip(); |
34 | | // Used for CommandSub in osh/cmd_exec.py |
35 | | BigStr* rstrip(BigStr* chars); |
36 | | BigStr* rstrip(); |
37 | | |
38 | | BigStr* lstrip(BigStr* chars); |
39 | | BigStr* lstrip(); |
40 | | |
41 | | BigStr* ljust(int width, BigStr* fillchar); |
42 | | BigStr* rjust(int width, BigStr* fillchar); |
43 | | |
44 | | bool startswith(BigStr* s); |
45 | | bool endswith(BigStr* s); |
46 | | |
47 | | BigStr* replace(BigStr* old, BigStr* new_str); |
48 | | BigStr* join(List<BigStr*>* items); |
49 | | |
50 | | List<BigStr*>* split(BigStr* sep); |
51 | | List<BigStr*>* split(BigStr* sep, int max_split); |
52 | | List<BigStr*>* splitlines(bool keep); |
53 | | |
54 | | bool isdigit(); |
55 | | bool isalpha(); |
56 | | bool isupper(); |
57 | | |
58 | | BigStr* upper(); |
59 | | BigStr* lower(); |
60 | | |
61 | | // for raw_input() to look like GNU readline |
62 | | void RemoveNewlineHack(); |
63 | | |
64 | | // Other options for fast comparison / hashing / string interning: |
65 | | // - unique_id_: an index into intern table. I don't think this works unless |
66 | | // you want to deal with rehashing all strings when the set grows. |
67 | | // - although note that the JVM has -XX:StringTableSize=FIXED, which means |
68 | | // - it can degrade into linked list performance |
69 | | // - Hashed strings become GLOBAL_STR(). Never deallocated. |
70 | | // - Hashed strings become part of the "large object space", which might be |
71 | | // managed by mark and sweep. This requires linked list overhead. |
72 | | // (doubly-linked?) |
73 | | // - Intern strings at GARBAGE COLLECTION TIME, with |
74 | | // LayoutForwarded::new_location_? Is this possible? Does it introduce |
75 | | // too much coupling between strings, hash tables, and GC? |
76 | | |
77 | 5.39k | static constexpr ObjHeader obj_header() { |
78 | 5.39k | return ObjHeader::BigStr(); |
79 | 5.39k | } |
80 | | |
81 | | unsigned hash(HashFunc h); |
82 | | |
83 | | int len_; |
84 | | unsigned hash_ : 31; |
85 | | unsigned is_hashed_ : 1; |
86 | | char data_[1]; // flexible array |
87 | | |
88 | | private: |
89 | | int _strip_left_pos(); |
90 | | int _strip_right_pos(); |
91 | | |
92 | | DISALLOW_COPY_AND_ASSIGN(BigStr) |
93 | | }; |
94 | | |
95 | | constexpr int kStrHeaderSize = offsetof(BigStr, data_); |
96 | | |
97 | | // Note: for SmallStr, we might copy into the VALUE |
98 | 156 | inline void BigStr::MaybeShrink(int str_len) { |
99 | 156 | len_ = str_len; |
100 | 156 | data_[len_] = '\0'; // NUL terminate |
101 | 156 | } |
102 | | |
103 | 13.4k | inline int len(const BigStr* s) { |
104 | 13.4k | return s->len_; |
105 | 13.4k | } |
106 | | |
107 | | BigStr* StrFormat(const char* fmt, ...); |
108 | | BigStr* StrFormat(BigStr* fmt, ...); |
109 | | |
110 | | // NOTE: This iterates over bytes. |
111 | | class StrIter { |
112 | | public: |
113 | 126 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) { |
114 | | // Cheney only: s_ could be moved during iteration. |
115 | | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_)); |
116 | 126 | } |
117 | 126 | ~StrIter() { |
118 | | // gHeap.PopRoot(); |
119 | 126 | } |
120 | 158 | void Next() { |
121 | 158 | i_++; |
122 | 158 | } |
123 | 284 | bool Done() { |
124 | 284 | return i_ >= len_; |
125 | 284 | } |
126 | | BigStr* Value(); // similar to at() |
127 | | |
128 | | private: |
129 | | BigStr* s_; |
130 | | int i_; |
131 | | int len_; |
132 | | |
133 | | DISALLOW_COPY_AND_ASSIGN(StrIter) |
134 | | }; |
135 | | |
136 | | bool maybe_str_equals(BigStr* left, BigStr* right); |
137 | | |
138 | | extern BigStr* kEmptyString; |
139 | | |
140 | | // GlobalStr notes: |
141 | | // - sizeof("foo") == 4, for the NUL terminator. |
142 | | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We |
143 | | // don't put it here because it triggers -Winvalid-offsetof |
144 | | |
145 | | template <int N> |
146 | | class GlobalStr { |
147 | | // A template type with the same layout as BigStr with length N-1 (which needs |
148 | | // a buffer of size N). For initializing global constant instances. |
149 | | public: |
150 | | int len_; |
151 | | unsigned hash_ : 31; |
152 | | unsigned is_hashed_ : 1; |
153 | | const char data_[N]; |
154 | | |
155 | | DISALLOW_COPY_AND_ASSIGN(GlobalStr) |
156 | | }; |
157 | | |
158 | | union Str { |
159 | | public: |
160 | | // Instead of this at the start of every function: |
161 | | // Str* s = nullptr; |
162 | | // It will now be: |
163 | | // Str s(nullptr); |
164 | | // |
165 | | // StackRoot _root(&s); |
166 | 12 | explicit Str(BigStr* big) : big_(big) { |
167 | 12 | } |
168 | | |
169 | 10 | char* data() { |
170 | 10 | return big_->data(); |
171 | 10 | } |
172 | | |
173 | 10 | Str at(int i) { |
174 | 10 | return Str(big_->at(i)); |
175 | 10 | } |
176 | | |
177 | 0 | Str upper() { |
178 | 0 | return Str(big_->upper()); |
179 | 0 | } |
180 | | |
181 | | uint64_t raw_bytes_; |
182 | | BigStr* big_; |
183 | | // TODO: add SmallStr, see mycpp/small_str_test.cc |
184 | | }; |
185 | | |
186 | 12 | inline int len(const Str s) { |
187 | 12 | return len(s.big_); |
188 | 12 | } |
189 | | |
190 | | // This macro is a workaround for the fact that it's impossible to have a |
191 | | // constexpr initializer for char[N]. The "String Literals as Non-Type |
192 | | // Template Parameters" feature of C++ 20 would have done it, but it's not |
193 | | // there. |
194 | | // |
195 | | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/ |
196 | | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor |
197 | | // |
198 | | // TODO: Can we hash values at compile time so they can be in the intern table? |
199 | | |
// GLOBAL_STR(name, val) expands to two definitions:
//   _<name> : a GcGlobal<GlobalStr<sizeof(val)>> whose len_ is sizeof(val) - 1
//             (the NUL is not counted), with the hash fields zeroed.
//   <name>  : a BigStr* pointing at _<name>.obj.
// The expansion already ends with a semicolon.
#define GLOBAL_STR(name, val)                             \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {            \
      ObjHeader::Global(TypeTag::BigStr),                 \
      {.len_ = sizeof(val) - 1,                           \
       .hash_ = 0,                                        \
       .is_hashed_ = 0,                                   \
       .data_ = val}};                                    \
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
205 | | |
// New style, for SmallStr compatibility.
//
// GLOBAL_STR2(name, val) defines the same _<name> storage as GLOBAL_STR, but
// <name> is a Str constructed from the BigStr* rather than a bare BigStr*.
// The expansion already ends with a semicolon.
#define GLOBAL_STR2(name, val)                        \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {        \
      ObjHeader::Global(TypeTag::BigStr),             \
      {.len_ = sizeof(val) - 1,                       \
       .hash_ = 0,                                    \
       .is_hashed_ = 0,                               \
       .data_ = val}};                                \
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
212 | | |
213 | | #endif // MYCPP_GC_STR_H |