Telescope Browser

Blob

Date:: Tue Oct 22 18:40:42 2024 UTC
Message:: handle tab characters. Tab characters have a width that depends on the column they're in, since they extend to the next multiple of 8 (citation needed?). So, keep track of the column when computing the length (in columns) of the text, so that we can render them properly. In the future we might want to turn them into spaces (either at read or render time) just to stay on the safe side, in case not all terminals/ncurses implementations use 8 columns.
Actions:: History | Blame | Raw File
1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compat.h"
25 
26 #include <assert.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <wchar.h>
30 
31 #include "utf8.h"
32 
33 #define UTF8_ACCEPT 0
34 #define UTF8_REJECT 1
35 
/*
 * Lookup table driving decode().  It holds two tables back to back:
 *
 *  - entries 0..255 are indexed by the input byte and give the
 *    character class of that byte (decode() reads utf8d[byte]);
 *
 *  - the entries from 256 onward are the state transitions, sixteen
 *    slots per state, indexed as utf8d[256 + state*16 + class] and
 *    yielding the next state.  State 0 is UTF8_ACCEPT and state 1 is
 *    UTF8_REJECT.
 */
static const uint8_t utf8d[] = {
	/* byte -> character class */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	/* (state, class) -> next state; one state per 16 entries */
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
52 
53 static inline uint32_t
54 decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
55 {
56 	uint32_t type = utf8d[byte];
57 
58 	*codep = (*state != UTF8_ACCEPT) ?
59 		(byte & 0x3fu) | (*codep << 6) :
60 		(0xff >> type) & (byte);
61 
62 	*state = utf8d[256 + *state*16 + type];
63 	return *state;
64 }
65 
66 
67 /* end of the converter, utility functions ahead */
68 
69 #define ZERO_WIDTH_SPACE 0x200B
70 
/*
 * Exported entry point of the decoder: forwards its arguments
 * untouched to the file-local decode() and returns its result.
 */
uint32_t
utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	const uint32_t next = decode(state, codep, byte);

	return next;
}
77 
/*
 * Width, in terminal columns, of the code point cp when it is drawn
 * starting at column col.
 *
 * Returns 1..8 for a tab (for col >= 0: the distance to the next
 * multiple of eight), otherwise the value reported by wcwidth(3),
 * i.e. 0, 1 or 2.  Code points that wcwidth() rejects with -1 (not
 * printable) are reported as 0 columns wide, so that the negative
 * value never leaks into the unsigned return type.
 *
 * Assumes that sizeof(wchar_t) == 4.
 */
static size_t
utf8_chwidth(uint32_t cp, int col)
{
	int w;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * Tabs are wide until the next multiple of eight.
	 */
	if (cp == '\t')
		return (((col + 8) / 8) * 8) - col;

	/* wcwidth() answers -1 for non-printable characters */
	w = wcwidth((wchar_t)cp);
	if (w < 0)
		return 0;

	return (size_t)w;
}
99 
100 size_t
101 utf8_snwidth(const char *s, size_t off, int col)
102 {
103 	size_t i, tot;
104 	uint32_t cp = 0, state = 0;
105 	int width;
106 
107 	tot = 0;
108 	for (i = 0; i < off; ++i)
109 		if (!decode(&state, &cp, s[i])) {
110 			width = utf8_chwidth(cp, col);
111 			tot += width;
112 			col += width;
113 		}
114 
115 	return tot;
116 }
117 
118 size_t
119 utf8_swidth(const char *s, int col )
120 {
121 	size_t tot;
122 	uint32_t cp = 0, state = 0;
123 	int width;
124 
125 	tot = 0;
126 	for (; *s; ++s)
127 		if (!decode(&state, &cp, *s)) {
128 			width = utf8_chwidth(cp, col);
129 			tot += width;
130 			col += width;
131 		}
132 
133 	return tot;
134 }
135 
/*
 * Width, in columns, of the bytes in [str, end), the first of which
 * is drawn at column col.  The scan also stops early at a NUL byte.
 *
 * Only bytes that complete a code point contribute to the total; col
 * is advanced after each one so that tabs are measured at the right
 * position.
 */
size_t
utf8_swidth_between(const char *str, const char *end, int col)
{
	size_t tot;
	uint32_t cp = 0, state = 0;
	int width;

	tot = 0;
	/*
	 * Test the bound before dereferencing: end may point one past
	 * the last valid byte, which must not be read.
	 */
	for (; str < end && *str; ++str)
		if (!decode(&state, &cp, *str)) {
			width = utf8_chwidth(cp, col);
			tot += width;
			col += width;
		}
	return tot;
}
152 
153 /*
154  * XXX: This is not correct.  There are codepoints classified as
155  * "emoji", but these can be joined together to form more complex
156  * emoji.  There is an official list of what these valid combinations
157  * are, but it would require a costly lookup (a trie can be used to
158  * reduce the times, but...).  The following approach is conceptually
159  * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and
160  * then a space, consider everything before the space a single emoji.
161  * It needs a special check for numbers (yes, 0..9 and # are
162  * technically speaking emojis) but otherwise seems to work well in
163  * practice.
164  */
165 int
166 emojied_line(const char *s, const char **space_ret)
167 {
168 	uint32_t cp = 0, state = 0;
169 	int only_numbers = 1;
170 
171 	for (; *s; ++s) {
172 		if (!decode(&state, &cp, *s)) {
173 			if (cp == ZERO_WIDTH_SPACE)
174 				continue;
175 			if (cp == ' ') {
176 				*space_ret = s;
177 				return !only_numbers;
178 			}
179 			if (!is_emoji(cp))
180 				return 0;
181 			if (cp < '0' || cp > '9')
182 				only_numbers = 0;
183 		}
184 	}
185 
186 	return 0;
187 }