/home/andy/git/oilshell/oil/data_lang/utf8.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef DATA_LANG_UTF8_H |
2 | | #define DATA_LANG_UTF8_H |
3 | | |
4 | | #include <stddef.h> // size_t |
5 | | #include <stdint.h> // uint32_t |
6 | | #include <stdio.h> |
7 | | |
8 | | /** |
9 | | * ---- Quick reference about the encoding ---- |
10 | | * |
11 | | * First, all valid UTF-8 sequences follow one of four bit "patterns" (Table 3-6.) The |
12 | | * first byte determines the length of the sequence and then the next 0-3 bytes |
13 | | * are "continuation bytes." |
14 | | * |
15 | | * +----------------------------+----------+----------+----------+----------+ |
16 | | * | Scalar Value | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte | |
17 | | * +----------------------------+----------+----------+----------+----------+ |
18 | | * | 00000000 0xxxxxxx | 0xxxxxxx | | | | |
19 | | * | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | | |
20 | | * | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | | |
21 | | * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx | |
22 | | * +----------------------------+----------+----------+----------+----------+ |
23 | | * |
24 | | * Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns |
25 | | * |
26 | | * There are 3 further restrictions which make some valid bit patterns |
27 | | * *invalid*: |
28 | | * 1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second |
29 | | * sequence is longer and thus an error. |
30 | | * 2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a |
31 | | * surrogate. It is an error to encode surrogates in UTF-8. |
32 | | * 3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint, |
33 | | * and must be rejected as an error. |
34 | | * |
35 | | * See https://aolsen.ca/writings/everything-about-utf8 for more details about |
36 | | * the encoding. |
37 | | */ |
38 | | |
/* Outcome of decoding a single UTF-8 sequence. UTF8_OK is zero, so a result's
 * `error` field can be tested directly as a boolean. */
typedef enum Utf8Error {
  /* The sequence decoded to a valid code point. */
  UTF8_OK = 0,

  /* The code point was encoded with more bytes than it needs. */
  UTF8_ERR_OVERLONG = 1,

  /* The decoded value lies in the surrogate range U+D800..U+DFFF (inclusive). */
  UTF8_ERR_SURROGATE = 2,

  /* The decoded value exceeds U+10FFFF, the largest code point. */
  UTF8_ERR_TOO_LARGE = 3,

  /* The bytes do not match any of the UTF-8 bit patterns. */
  UTF8_ERR_BAD_ENCODING = 4,

  /* Another code point appears to start, but its bytes are cut short. */
  UTF8_ERR_TRUNCATED_BYTES = 5,
} Utf8Error_t;
57 | | |
58 | | typedef struct Utf8Result { |
59 | | Utf8Error_t error; |
60 | | uint32_t codepoint; |
61 | | size_t bytes_read; |
62 | | } Utf8Result_t; |
63 | | |
64 | 74 | static inline void _cont(const unsigned char *input, Utf8Result_t *result) { |
65 | 74 | if (result->error) return; |
66 | | |
67 | 74 | int byte = input[result->bytes_read]; |
68 | 74 | if (byte == '\0') { |
69 | 10 | result->error = UTF8_ERR_TRUNCATED_BYTES; |
70 | 10 | return; |
71 | 10 | } |
72 | 64 | result->bytes_read += 1; |
73 | | |
74 | | // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a) |
75 | | // validate the pattern and b) remove the leading '10'. |
76 | 64 | if ((byte & 0xC0) == 0x80) { |
77 | 54 | result->codepoint <<= 6; |
78 | 54 | result->codepoint |= byte & 0x3F; |
79 | 54 | } else { |
80 | 10 | result->error = UTF8_ERR_BAD_ENCODING; |
81 | 10 | } |
82 | 64 | } data_lang.cc:_ZL5_contPKhP10Utf8Result Line | Count | Source | 64 | 53 | static inline void _cont(const unsigned char *input, Utf8Result_t *result) { | 65 | 53 | if (result->error) return; | 66 | | | 67 | 53 | int byte = input[result->bytes_read]; | 68 | 53 | if (byte == '\0') { | 69 | 7 | result->error = UTF8_ERR_TRUNCATED_BYTES; | 70 | 7 | return; | 71 | 7 | } | 72 | 46 | result->bytes_read += 1; | 73 | | | 74 | | // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a) | 75 | | // validate the pattern and b) remove the leading '10'. | 76 | 46 | if ((byte & 0xC0) == 0x80) { | 77 | 39 | result->codepoint <<= 6; | 78 | 39 | result->codepoint |= byte & 0x3F; | 79 | 39 | } else { | 80 | 7 | result->error = UTF8_ERR_BAD_ENCODING; | 81 | 7 | } | 82 | 46 | } |
j8_libc.c:_ZL5_contPKhP10Utf8Result Line | Count | Source | 64 | 21 | static inline void _cont(const unsigned char *input, Utf8Result_t *result) { | 65 | 21 | if (result->error) return; | 66 | | | 67 | 21 | int byte = input[result->bytes_read]; | 68 | 21 | if (byte == '\0') { | 69 | 3 | result->error = UTF8_ERR_TRUNCATED_BYTES; | 70 | 3 | return; | 71 | 3 | } | 72 | 18 | result->bytes_read += 1; | 73 | | | 74 | | // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a) | 75 | | // validate the pattern and b) remove the leading '10'. | 76 | 18 | if ((byte & 0xC0) == 0x80) { | 77 | 15 | result->codepoint <<= 6; | 78 | 15 | result->codepoint |= byte & 0x3F; | 79 | 15 | } else { | 80 | 3 | result->error = UTF8_ERR_BAD_ENCODING; | 81 | 3 | } | 82 | 18 | } |
|
83 | | |
84 | | /** |
85 | | * Given a nul-terminated string `input`, try to decode the next codepoint from |
86 | | * that string. |
87 | | * |
88 | | * It is required that `input` does not point to the nul-terminator. If |
89 | | * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode |
90 | | * U+00, not a sentinel. The nul-terminator is still necessary because we need |
91 | | * it to prevent buffer overrun in the case of a truncated byte sequence, for |
92 | | * example '\xC2'. This oddity is to facilitate strings which may contain U+00 |
93 | | * codepoints. |
94 | | * |
95 | | * If there was a surrogate, overlong or codepoint to large error then |
96 | | * `result.codepoint` will contain the recovered value. |
97 | | */ |
98 | | static inline void utf8_decode(const unsigned char *input, |
99 | 786 | Utf8Result_t *result) { |
100 | 786 | result->error = UTF8_OK; |
101 | 786 | result->codepoint = 0; |
102 | 786 | result->bytes_read = 0; |
103 | | |
104 | 786 | int first = *input; |
105 | 786 | result->bytes_read = 1; |
106 | | |
107 | 786 | if ((first & 0x80) == 0) { |
108 | | // 1-byte long (ASCII subset) |
109 | 684 | result->codepoint = first; |
110 | 684 | return; |
111 | 684 | } |
112 | | |
113 | 102 | if ((first & 0xE0) == 0xC0) { |
114 | | // 2-bytes long |
115 | 22 | result->codepoint = first & 0x1F; |
116 | | |
117 | 22 | _cont(input, result); |
118 | 22 | if (result->error) return; |
119 | | |
120 | 11 | if (result->codepoint < 0x80) { |
121 | 1 | result->error = UTF8_ERR_OVERLONG; |
122 | 1 | } |
123 | | |
124 | 11 | return; |
125 | 22 | } |
126 | | |
127 | 80 | if ((first & 0xF0) == 0xE0) { |
128 | | // 3-bytes long |
129 | 2 | result->codepoint = first & 0x0F; |
130 | | |
131 | 2 | _cont(input, result); |
132 | 2 | _cont(input, result); |
133 | 2 | if (result->error) return; |
134 | | |
135 | 2 | if (result->codepoint < 0x800) { |
136 | 0 | result->error = UTF8_ERR_OVERLONG; |
137 | 0 | } |
138 | | |
139 | 2 | if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) { |
140 | 1 | result->error = UTF8_ERR_SURROGATE; |
141 | 1 | } |
142 | | |
143 | 2 | return; |
144 | 2 | } |
145 | | |
146 | 78 | if ((first & 0xF8) == 0xF0) { |
147 | | // 4-bytes long |
148 | 16 | result->codepoint = first & 0x07; |
149 | | |
150 | 16 | _cont(input, result); |
151 | 16 | _cont(input, result); |
152 | 16 | _cont(input, result); |
153 | 16 | if (result->error) return; |
154 | | |
155 | 7 | if (result->codepoint < 0x10000) { |
156 | 0 | result->error = UTF8_ERR_OVERLONG; |
157 | 0 | } |
158 | | |
159 | 7 | if (result->codepoint > 0x10FFFF) { |
160 | 1 | result->error = UTF8_ERR_TOO_LARGE; |
161 | 1 | } |
162 | | |
163 | 7 | return; |
164 | 16 | } |
165 | | |
166 | 62 | result->error = UTF8_ERR_BAD_ENCODING; |
167 | 62 | return; |
168 | 78 | } data_lang.cc:_ZL11utf8_decodePKhP10Utf8Result Line | Count | Source | 99 | 540 | Utf8Result_t *result) { | 100 | 540 | result->error = UTF8_OK; | 101 | 540 | result->codepoint = 0; | 102 | 540 | result->bytes_read = 0; | 103 | | | 104 | 540 | int first = *input; | 105 | 540 | result->bytes_read = 1; | 106 | | | 107 | 540 | if ((first & 0x80) == 0) { | 108 | | // 1-byte long (ASCII subset) | 109 | 469 | result->codepoint = first; | 110 | 469 | return; | 111 | 469 | } | 112 | | | 113 | 71 | if ((first & 0xE0) == 0xC0) { | 114 | | // 2-bytes long | 115 | 16 | result->codepoint = first & 0x1F; | 116 | | | 117 | 16 | _cont(input, result); | 118 | 16 | if (result->error) return; | 119 | | | 120 | 8 | if (result->codepoint < 0x80) { | 121 | 1 | result->error = UTF8_ERR_OVERLONG; | 122 | 1 | } | 123 | | | 124 | 8 | return; | 125 | 16 | } | 126 | | | 127 | 55 | if ((first & 0xF0) == 0xE0) { | 128 | | // 3-bytes long | 129 | 2 | result->codepoint = first & 0x0F; | 130 | | | 131 | 2 | _cont(input, result); | 132 | 2 | _cont(input, result); | 133 | 2 | if (result->error) return; | 134 | | | 135 | 2 | if (result->codepoint < 0x800) { | 136 | 0 | result->error = UTF8_ERR_OVERLONG; | 137 | 0 | } | 138 | | | 139 | 2 | if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) { | 140 | 1 | result->error = UTF8_ERR_SURROGATE; | 141 | 1 | } | 142 | | | 143 | 2 | return; | 144 | 2 | } | 145 | | | 146 | 53 | if ((first & 0xF8) == 0xF0) { | 147 | | // 4-bytes long | 148 | 11 | result->codepoint = first & 0x07; | 149 | | | 150 | 11 | _cont(input, result); | 151 | 11 | _cont(input, result); | 152 | 11 | _cont(input, result); | 153 | 11 | if (result->error) return; | 154 | | | 155 | 5 | if (result->codepoint < 0x10000) { | 156 | 0 | result->error = UTF8_ERR_OVERLONG; | 157 | 0 | } | 158 | | | 159 | 5 | if (result->codepoint > 0x10FFFF) { | 160 | 1 | result->error = UTF8_ERR_TOO_LARGE; | 161 | 1 | } | 162 | | | 163 | 5 | return; | 164 | 11 | } | 165 | | | 166 | 42 | 
result->error = UTF8_ERR_BAD_ENCODING; | 167 | 42 | return; | 168 | 53 | } |
j8_libc.c:_ZL11utf8_decodePKhP10Utf8Result Line | Count | Source | 99 | 246 | Utf8Result_t *result) { | 100 | 246 | result->error = UTF8_OK; | 101 | 246 | result->codepoint = 0; | 102 | 246 | result->bytes_read = 0; | 103 | | | 104 | 246 | int first = *input; | 105 | 246 | result->bytes_read = 1; | 106 | | | 107 | 246 | if ((first & 0x80) == 0) { | 108 | | // 1-byte long (ASCII subset) | 109 | 215 | result->codepoint = first; | 110 | 215 | return; | 111 | 215 | } | 112 | | | 113 | 31 | if ((first & 0xE0) == 0xC0) { | 114 | | // 2-bytes long | 115 | 6 | result->codepoint = first & 0x1F; | 116 | | | 117 | 6 | _cont(input, result); | 118 | 6 | if (result->error) return; | 119 | | | 120 | 3 | if (result->codepoint < 0x80) { | 121 | 0 | result->error = UTF8_ERR_OVERLONG; | 122 | 0 | } | 123 | | | 124 | 3 | return; | 125 | 6 | } | 126 | | | 127 | 25 | if ((first & 0xF0) == 0xE0) { | 128 | | // 3-bytes long | 129 | 0 | result->codepoint = first & 0x0F; | 130 | |
| 131 | 0 | _cont(input, result); | 132 | 0 | _cont(input, result); | 133 | 0 | if (result->error) return; | 134 | | | 135 | 0 | if (result->codepoint < 0x800) { | 136 | 0 | result->error = UTF8_ERR_OVERLONG; | 137 | 0 | } | 138 | |
| 139 | 0 | if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) { | 140 | 0 | result->error = UTF8_ERR_SURROGATE; | 141 | 0 | } | 142 | |
| 143 | 0 | return; | 144 | 0 | } | 145 | | | 146 | 25 | if ((first & 0xF8) == 0xF0) { | 147 | | // 4-bytes long | 148 | 5 | result->codepoint = first & 0x07; | 149 | | | 150 | 5 | _cont(input, result); | 151 | 5 | _cont(input, result); | 152 | 5 | _cont(input, result); | 153 | 5 | if (result->error) return; | 154 | | | 155 | 2 | if (result->codepoint < 0x10000) { | 156 | 0 | result->error = UTF8_ERR_OVERLONG; | 157 | 0 | } | 158 | | | 159 | 2 | if (result->codepoint > 0x10FFFF) { | 160 | 0 | result->error = UTF8_ERR_TOO_LARGE; | 161 | 0 | } | 162 | | | 163 | 2 | return; | 164 | 5 | } | 165 | | | 166 | 20 | result->error = UTF8_ERR_BAD_ENCODING; | 167 | 20 | return; | 168 | 25 | } |
|
169 | | |
170 | | #endif // DATA_LANG_UTF8_H |