1 1ac119fb 2024-01-23 op /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
3 1ac119fb 2024-01-23 op * Permission is hereby granted, free of charge, to any person
4 1ac119fb 2024-01-23 op * obtaining a copy of this software and associated documentation
5 1ac119fb 2024-01-23 op * files (the "Software"), to deal in the Software without
6 1ac119fb 2024-01-23 op * restriction, including without limitation the rights to use, copy,
7 1ac119fb 2024-01-23 op * modify, merge, publish, distribute, sublicense, and/or sell copies
8 1ac119fb 2024-01-23 op * of the Software, and to permit persons to whom the Software is
9 1ac119fb 2024-01-23 op * furnished to do so, subject to the following conditions:
11 1ac119fb 2024-01-23 op * The above copyright notice and this permission notice shall be
12 1ac119fb 2024-01-23 op * included in all copies or substantial portions of the Software.
14 1ac119fb 2024-01-23 op * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 1ac119fb 2024-01-23 op * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 1ac119fb 2024-01-23 op * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 1ac119fb 2024-01-23 op * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 1ac119fb 2024-01-23 op * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 1ac119fb 2024-01-23 op * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 1ac119fb 2024-01-23 op * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 1ac119fb 2024-01-23 op #include "compat.h"
26 1ac119fb 2024-01-23 op #include <assert.h>
27 1ac119fb 2024-01-23 op #include <stddef.h>
28 1ac119fb 2024-01-23 op #include <stdint.h>
29 1ac119fb 2024-01-23 op #include <wchar.h>
31 1ac119fb 2024-01-23 op #include "utf8.h"
/* Decoder states that callers compare against. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving decode().
 *
 * The first 256 entries are indexed by the input byte and give its
 * character class.  The entries after that form the transition
 * table: 16 slots per state, looked up as 256 + state * 16 + class.
 * Every slot of state 1 (UTF8_REJECT) leads back to state 1, so a
 * rejected decoder stays rejected until the caller resets the state.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the decoder.
 *
 * state: current decoder state, updated in place; start from
 *        UTF8_ACCEPT.
 * codep: code point being assembled, updated in place.
 * byte:  next input byte.
 *
 * Returns the new state.  When it is UTF8_ACCEPT, *codep holds the
 * value assembled from the bytes consumed since the previous accept.
 */
static inline uint32_t
decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	uint32_t type = utf8d[byte];

	if (*state != UTF8_ACCEPT) {
		/* inside a sequence: append the low six bits */
		*codep = (byte & 0x3fu) | (*codep << 6);
	} else {
		/* first byte: the class says how many high bits to drop */
		*codep = (0xff >> type) & (byte);
	}

	*state = utf8d[256 + *state * 16 + type];
	return *state;
}
/* end of the converter, utility functions ahead */

#define ZERO_WIDTH_SPACE 0x200B

/*
 * Public version of decode(): forwards its arguments unchanged and
 * returns decode()'s result (the new decoder state).
 *
 * NOTE(review): the return-type line was not visible in the reviewed
 * text; uint32_t is taken from decode() -- confirm against utf8.h.
 */
uint32_t
utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	return decode(state, codep, byte);
}
/*
 * Number of columns taken by the code point cp when it is placed at
 * column col.
 *
 * Returns 0, 1 or 2 for ordinary code points and, for a tab, the
 * distance to the next multiple of eight (1 to 8 for col >= 0).
 * Code points for which wcwidth() reports -1 count as 0 columns.
 * Assumes that sizeof(wchar_t) == 4.
 *
 * NOTE(review): the return-type line was not visible in the reviewed
 * text; size_t is assumed -- confirm against utf8.h.
 */
size_t
utf8_chwidth(uint32_t cp, int col)
{
	int w;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * Tabs are wide until the next multiple of eight.
	 */
	if (cp == '\t')
		return (((col + 8) / 8) * 8) - col;

	/*
	 * wcwidth() answers -1 for non-printable code points.  Letting
	 * that through would hand callers a "negative" width and make
	 * their running totals go backwards, so clamp it to zero.
	 */
	w = wcwidth((wchar_t)cp);
	if (w < 0)
		return 0;
	return w;
}
/*
 * Width, in columns, of the first off bytes of s when the text
 * starts at column col.
 *
 * Exactly off bytes are read; the loop does not stop at a NUL byte.
 * Width is added only when the decoder accepts a code point, so bytes
 * that never complete one contribute nothing.  col advances with
 * every counted code point so that tabs are measured at their real
 * position.
 *
 * NOTE(review): the return-type line was not visible in the reviewed
 * text; size_t is taken from the type of the running total.
 */
size_t
utf8_snwidth(const char *s, size_t off, int col)
{
	uint32_t cp = 0, state = 0;
	size_t i, tot = 0;
	int width;

	for (i = 0; i < off; ++i) {
		if (decode(&state, &cp, (uint8_t)s[i]) != 0)
			continue;	/* sequence not complete (or rejected) */

		width = utf8_chwidth(cp, col);
		tot += width;
		col += width;
	}

	return tot;
}
/*
 * Width, in columns, of the NUL-terminated string s when it starts
 * at column col.
 *
 * Width is added only when the decoder accepts a code point, so bytes
 * that never complete one contribute nothing.  col advances with
 * every counted code point so that tabs are measured at their real
 * position.
 *
 * NOTE(review): the return-type line and the declaration of the
 * running total were not visible in the reviewed text; size_t is
 * assumed, matching utf8_snwidth().
 */
size_t
utf8_swidth(const char *s, int col)
{
	uint32_t cp = 0, state = 0;
	size_t tot = 0;
	int width;

	while (*s != '\0') {
		if (decode(&state, &cp, (uint8_t)*s) == 0) {
			width = utf8_chwidth(cp, col);
			tot += width;
			col += width;
		}
		++s;
	}

	return tot;
}
/*
 * Width, in columns, of the bytes in [str, end) when the text starts
 * at column col.  Scanning also stops at the first NUL byte.
 *
 * Width is added only when the decoder accepts a code point, so bytes
 * that never complete one contribute nothing.  col advances with
 * every counted code point so that tabs are measured at their real
 * position.
 *
 * NOTE(review): the return-type line and the declaration of the
 * running total were not visible in the reviewed text; size_t is
 * assumed, matching utf8_snwidth().
 */
size_t
utf8_swidth_between(const char *str, const char *end, int col)
{
	uint32_t cp = 0, state = 0;
	size_t tot = 0;
	int width;

	/*
	 * Check the bound before looking at the byte: end may point one
	 * past the last valid byte, and that position must not be read.
	 */
	for (; str < end && *str != '\0'; ++str) {
		if (decode(&state, &cp, (uint8_t)*str) != 0)
			continue;

		width = utf8_chwidth(cp, col);
		tot += width;
		col += width;
	}

	return tot;
}
154 1ac119fb 2024-01-23 op * XXX: This is not correct. There are codepoints classified as
155 2815e3a0 2024-09-01 op * "emoji", but these can be joined together to form more complex
156 1ac119fb 2024-01-23 op * emoji. There is an official list of what these valid combinations
157 1ac119fb 2024-01-23 op * are, but it would require a costly lookup (a trie can be used to
158 1ac119fb 2024-01-23 op * reduce the times, but...). The following approach is conceptually
159 1ac119fb 2024-01-23 op * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and
160 1ac119fb 2024-01-23 op * then a space, consider everything before the space a single emoji.
161 1ac119fb 2024-01-23 op * It needs a special check for numbers (yes, 0..9 and # are
162 1ac119fb 2024-01-23 op * technically speaking emojis) but otherwise seems to work well in
166 1ac119fb 2024-01-23 op emojied_line(const char *s, const char **space_ret)
168 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
169 1ac119fb 2024-01-23 op int only_numbers = 1;
171 1ac119fb 2024-01-23 op for (; *s; ++s) {
172 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s)) {
173 1ac119fb 2024-01-23 op if (cp == ZERO_WIDTH_SPACE)
175 1ac119fb 2024-01-23 op if (cp == ' ') {
176 1ac119fb 2024-01-23 op *space_ret = s;
177 1ac119fb 2024-01-23 op return !only_numbers;
179 1ac119fb 2024-01-23 op if (!is_emoji(cp))
181 1ac119fb 2024-01-23 op if (cp < '0' || cp > '9')
182 1ac119fb 2024-01-23 op only_numbers = 0;