cpp

Coverage Report

Created: 2024-08-25 11:48

/home/andy/git/oilshell/oil/data_lang/utf8.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef DATA_LANG_UTF8_H
2
#define DATA_LANG_UTF8_H
3
4
#include <stddef.h>  // size_t
5
#include <stdint.h>  // uint32_t
6
#include <stdio.h>
7
8
/**
9
 *              ---- Quick reference about the encoding ----
10
 *
11
 * First, all valid UTF-8 sequences follow one of the bit "patterns" (Table 3-6). The
12
 * first byte determines the length of the sequence and then the next 0-3 bytes
13
 * are "continuation bytes."
14
 *
15
 * +----------------------------+----------+----------+----------+----------+
16
 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
17
 * +----------------------------+----------+----------+----------+----------+
18
 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
19
 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
20
 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
21
 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
22
 * +----------------------------+----------+----------+----------+----------+
23
 *
24
 *      Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
25
 *
26
 * There are 3 further restrictions which make some valid bit patterns
27
 * *invalid*:
28
 *  1. Overlongs: e.g., <0x41> and <0xC1 0x81> both store U+0041, but the second
29
 *     sequence is longer and thus an error.
30
 *  2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
31
 *     surrogate. It is an error to encode surrogates in UTF-8.
32
 *  3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
33
 *     and must be rejected as an error.
34
 *
35
 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
36
 * the encoding.
37
 */
38
39
typedef enum Utf8Error {
40
  UTF8_OK = 0,
41
42
  // Encodes a codepoint in more bytes than necessary
43
  UTF8_ERR_OVERLONG = 1,
44
45
  // Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive)
46
  UTF8_ERR_SURROGATE = 2,
47
48
  // Encodes a value greater than the max codepoint U+10FFFF
49
  UTF8_ERR_TOO_LARGE = 3,
50
51
  // Encoding doesn't conform to the UTF-8 bit patterns
52
  UTF8_ERR_BAD_ENCODING = 4,
53
54
  // It looks like there is another codepoint, but it has been truncated.
55
  UTF8_ERR_TRUNCATED_BYTES = 5,
56
} Utf8Error_t;
57
58
typedef struct Utf8Result {
59
  Utf8Error_t error;
60
  uint32_t codepoint;
61
  size_t bytes_read;
62
} Utf8Result_t;
63
64
74
static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
65
74
  if (result->error) return;
66
67
74
  int byte = input[result->bytes_read];
68
74
  if (byte == '\0') {
69
10
    result->error = UTF8_ERR_TRUNCATED_BYTES;
70
10
    return;
71
10
  }
72
64
  result->bytes_read += 1;
73
74
  // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
75
  // validate the pattern and b) remove the leading '10'.
76
64
  if ((byte & 0xC0) == 0x80) {
77
54
    result->codepoint <<= 6;
78
54
    result->codepoint |= byte & 0x3F;
79
54
  } else {
80
10
    result->error = UTF8_ERR_BAD_ENCODING;
81
10
  }
82
64
}
data_lang.cc:_ZL5_contPKhP10Utf8Result
Line
Count
Source
64
53
static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
65
53
  if (result->error) return;
66
67
53
  int byte = input[result->bytes_read];
68
53
  if (byte == '\0') {
69
7
    result->error = UTF8_ERR_TRUNCATED_BYTES;
70
7
    return;
71
7
  }
72
46
  result->bytes_read += 1;
73
74
  // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
75
  // validate the pattern and b) remove the leading '10'.
76
46
  if ((byte & 0xC0) == 0x80) {
77
39
    result->codepoint <<= 6;
78
39
    result->codepoint |= byte & 0x3F;
79
39
  } else {
80
7
    result->error = UTF8_ERR_BAD_ENCODING;
81
7
  }
82
46
}
j8_libc.c:_ZL5_contPKhP10Utf8Result
Line
Count
Source
64
21
static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
65
21
  if (result->error) return;
66
67
21
  int byte = input[result->bytes_read];
68
21
  if (byte == '\0') {
69
3
    result->error = UTF8_ERR_TRUNCATED_BYTES;
70
3
    return;
71
3
  }
72
18
  result->bytes_read += 1;
73
74
  // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
75
  // validate the pattern and b) remove the leading '10'.
76
18
  if ((byte & 0xC0) == 0x80) {
77
15
    result->codepoint <<= 6;
78
15
    result->codepoint |= byte & 0x3F;
79
15
  } else {
80
3
    result->error = UTF8_ERR_BAD_ENCODING;
81
3
  }
82
18
}
83
84
/**
85
 * Given a nul-terminated string `input`, try to decode the next codepoint from
86
 * that string.
87
 *
88
 * It is required that `input` does not point to the nul-terminator. If
89
 * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode
90
 * U+00, not a sentinel. The nul-terminator is still necessary because we need
91
 * it to prevent buffer overrun in the case of a truncated byte sequence, for
92
 * example '\xC2'. This oddity is to facilitate strings which may contain U+00
93
 * codepoints.
94
 *
95
 * If there was a surrogate, overlong, or codepoint-too-large error, then
96
 * `result->codepoint` will contain the recovered value.
97
 */
98
static inline void utf8_decode(const unsigned char *input,
99
786
                               Utf8Result_t *result) {
100
786
  result->error = UTF8_OK;
101
786
  result->codepoint = 0;
102
786
  result->bytes_read = 0;
103
104
786
  int first = *input;
105
786
  result->bytes_read = 1;
106
107
786
  if ((first & 0x80) == 0) {
108
    // 1-byte long (ASCII subset)
109
684
    result->codepoint = first;
110
684
    return;
111
684
  }
112
113
102
  if ((first & 0xE0) == 0xC0) {
114
    // 2-bytes long
115
22
    result->codepoint = first & 0x1F;
116
117
22
    _cont(input, result);
118
22
    if (result->error) return;
119
120
11
    if (result->codepoint < 0x80) {
121
1
      result->error = UTF8_ERR_OVERLONG;
122
1
    }
123
124
11
    return;
125
22
  }
126
127
80
  if ((first & 0xF0) == 0xE0) {
128
    // 3-bytes long
129
2
    result->codepoint = first & 0x0F;
130
131
2
    _cont(input, result);
132
2
    _cont(input, result);
133
2
    if (result->error) return;
134
135
2
    if (result->codepoint < 0x800) {
136
0
      result->error = UTF8_ERR_OVERLONG;
137
0
    }
138
139
2
    if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140
1
      result->error = UTF8_ERR_SURROGATE;
141
1
    }
142
143
2
    return;
144
2
  }
145
146
78
  if ((first & 0xF8) == 0xF0) {
147
    // 4-bytes long
148
16
    result->codepoint = first & 0x07;
149
150
16
    _cont(input, result);
151
16
    _cont(input, result);
152
16
    _cont(input, result);
153
16
    if (result->error) return;
154
155
7
    if (result->codepoint < 0x10000) {
156
0
      result->error = UTF8_ERR_OVERLONG;
157
0
    }
158
159
7
    if (result->codepoint > 0x10FFFF) {
160
1
      result->error = UTF8_ERR_TOO_LARGE;
161
1
    }
162
163
7
    return;
164
16
  }
165
166
62
  result->error = UTF8_ERR_BAD_ENCODING;
167
62
  return;
168
78
}
data_lang.cc:_ZL11utf8_decodePKhP10Utf8Result
Line
Count
Source
99
540
                               Utf8Result_t *result) {
100
540
  result->error = UTF8_OK;
101
540
  result->codepoint = 0;
102
540
  result->bytes_read = 0;
103
104
540
  int first = *input;
105
540
  result->bytes_read = 1;
106
107
540
  if ((first & 0x80) == 0) {
108
    // 1-byte long (ASCII subset)
109
469
    result->codepoint = first;
110
469
    return;
111
469
  }
112
113
71
  if ((first & 0xE0) == 0xC0) {
114
    // 2-bytes long
115
16
    result->codepoint = first & 0x1F;
116
117
16
    _cont(input, result);
118
16
    if (result->error) return;
119
120
8
    if (result->codepoint < 0x80) {
121
1
      result->error = UTF8_ERR_OVERLONG;
122
1
    }
123
124
8
    return;
125
16
  }
126
127
55
  if ((first & 0xF0) == 0xE0) {
128
    // 3-bytes long
129
2
    result->codepoint = first & 0x0F;
130
131
2
    _cont(input, result);
132
2
    _cont(input, result);
133
2
    if (result->error) return;
134
135
2
    if (result->codepoint < 0x800) {
136
0
      result->error = UTF8_ERR_OVERLONG;
137
0
    }
138
139
2
    if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140
1
      result->error = UTF8_ERR_SURROGATE;
141
1
    }
142
143
2
    return;
144
2
  }
145
146
53
  if ((first & 0xF8) == 0xF0) {
147
    // 4-bytes long
148
11
    result->codepoint = first & 0x07;
149
150
11
    _cont(input, result);
151
11
    _cont(input, result);
152
11
    _cont(input, result);
153
11
    if (result->error) return;
154
155
5
    if (result->codepoint < 0x10000) {
156
0
      result->error = UTF8_ERR_OVERLONG;
157
0
    }
158
159
5
    if (result->codepoint > 0x10FFFF) {
160
1
      result->error = UTF8_ERR_TOO_LARGE;
161
1
    }
162
163
5
    return;
164
11
  }
165
166
42
  result->error = UTF8_ERR_BAD_ENCODING;
167
42
  return;
168
53
}
j8_libc.c:_ZL11utf8_decodePKhP10Utf8Result
Line
Count
Source
99
246
                               Utf8Result_t *result) {
100
246
  result->error = UTF8_OK;
101
246
  result->codepoint = 0;
102
246
  result->bytes_read = 0;
103
104
246
  int first = *input;
105
246
  result->bytes_read = 1;
106
107
246
  if ((first & 0x80) == 0) {
108
    // 1-byte long (ASCII subset)
109
215
    result->codepoint = first;
110
215
    return;
111
215
  }
112
113
31
  if ((first & 0xE0) == 0xC0) {
114
    // 2-bytes long
115
6
    result->codepoint = first & 0x1F;
116
117
6
    _cont(input, result);
118
6
    if (result->error) return;
119
120
3
    if (result->codepoint < 0x80) {
121
0
      result->error = UTF8_ERR_OVERLONG;
122
0
    }
123
124
3
    return;
125
6
  }
126
127
25
  if ((first & 0xF0) == 0xE0) {
128
    // 3-bytes long
129
0
    result->codepoint = first & 0x0F;
130
131
0
    _cont(input, result);
132
0
    _cont(input, result);
133
0
    if (result->error) return;
134
135
0
    if (result->codepoint < 0x800) {
136
0
      result->error = UTF8_ERR_OVERLONG;
137
0
    }
138
139
0
    if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140
0
      result->error = UTF8_ERR_SURROGATE;
141
0
    }
142
143
0
    return;
144
0
  }
145
146
25
  if ((first & 0xF8) == 0xF0) {
147
    // 4-bytes long
148
5
    result->codepoint = first & 0x07;
149
150
5
    _cont(input, result);
151
5
    _cont(input, result);
152
5
    _cont(input, result);
153
5
    if (result->error) return;
154
155
2
    if (result->codepoint < 0x10000) {
156
0
      result->error = UTF8_ERR_OVERLONG;
157
0
    }
158
159
2
    if (result->codepoint > 0x10FFFF) {
160
0
      result->error = UTF8_ERR_TOO_LARGE;
161
0
    }
162
163
2
    return;
164
5
  }
165
166
20
  result->error = UTF8_ERR_BAD_ENCODING;
167
20
  return;
168
25
}
169
170
#endif  // DATA_LANG_UTF8_H