GNUstep CoreBase Library 0.2
GSCharacter.h
1/* GSCharacter.h
2
3 Copyright (C) 2014 Free Software Foundation, Inc.
4
5 Written by: Stefan Bidigaray
6 Date: November, 2014
7
8 This file is part of the GNUstep CoreBase Library.
9
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
14
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with this library; see the file COPYING.LIB.
22 If not, see <http://www.gnu.org/licenses/> or write to the
23 Free Software Foundation, 51 Franklin Street, Fifth Floor,
24 Boston, MA 02110-1301, USA.
25*/
26
27#ifndef __GSCHARACTER_H__
28#define __GSCHARACTER_H__ 1
29
30#include <CoreFoundation/CFBase.h>
31
42CF_INLINE Boolean
43GSCharacterIsASCII (const UTF32Char c)
44{
45 return c < 128;
46}
47
52CF_INLINE Boolean
53GSCharacterIsWhitespace (const UTF32Char c)
54{
55 return (0x0009 <= c && c <= 0x000D) || (c == 0x0020) || (c == 0x0085)
56 || (c == 0x00A0) || (c == 0x1680) || (0x2000 <= c && c <= 0x200A)
57 || (c == 0x2028) || (c == 0x2029) || (c == 0x202F) || (c == 0x205F)
58 || (c == 0x3000);
59}
60
66CF_INLINE Boolean
68{
69 return c > 0xFFFF;
70}
71
76CF_INLINE Boolean
77GSCharacterIsSurrogate (const UTF32Char c)
78{
79 return (c & 0xFFFFF800) == 0xD800;
80}
81
86CF_INLINE Boolean
87GSCharacterIsLeadSurrogate (const UTF32Char c)
88{
89 return (c & 0xFFFFFC00) == 0xD800;
90}
91
96CF_INLINE Boolean
97GSCharacterIsTrailSurrogate (const UTF32Char c)
98{
99 return (c & 0xFFFFFC00) == 0xDC00;
100}
101
/* Largest number of UTF-8 code units used for a single character;
   GSUTF8CharacterLength() never returns more than this value. */
110#define kGSUTF8CharacterMaximumLength 4
111
117CF_INLINE CFIndex
119{
120 return (c < 0xF5) ? (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0) : 0;
121}
122
127CF_INLINE Boolean
129{
130 return (c & 0xC0) == 0x80;
131}
132
138CF_INLINE CFIndex
139GSUTF8CharacterLength (const UTF32Char c)
140{
141 return (c <= 0x10FFFF) ? 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000) : 0;
142}
143
150CF_INLINE CFIndex
151GSUTF8CharacterAppendByteOrderMark (UTF8Char * d, const UTF8Char * limit)
152{
153 if ((limit - d) > 3)
154 {
155 d[0] = 0xEF;
156 d[1] = 0xBB;
157 d[2] = 0xBF;
158 }
159
160 return 3;
161}
162
171CF_INLINE Boolean
172GSUTF8CharacterSkipByteOrderMark (const UTF8Char ** s, const UTF8Char * limit)
173{
174 const UTF8Char *p;
175
176 p = *s;
177 if ((limit - p) > 3)
178 {
179 if (*p++ == 0xEF && *p++ == 0xBB && *p++ == 0xBF)
180 {
181 *s = p;
182 return true;
183 }
184 }
185
186 return false;
187}
188
/* Append a character to a UTF-8 string buffer.
 *
 * d      Current write position in the buffer.
 * limit  Compared against d to decide whether the bytes are stored.
 * c      Character to encode.
 *
 * Returns the value of GSUTF8CharacterLength (c), whether or not any
 * bytes were actually stored, or 0 via the early return below.
 *
 * NOTE(review): upstream source line 203 is absent from this listing.  It
 * presumably holds the condition guarding the "return 0;" that follows;
 * restore it from the upstream header before compiling.
 */
197CF_INLINE CFIndex
198GSUTF8CharacterAppend (UTF8Char * d, const UTF8Char * limit, UTF32Char c)
199{
/* Marker bits OR-ed into the first byte, indexed by (length - 1). */
200 static const UTF8Char utf8LeadHeader[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
201 CFIndex l;
202
204 return 0;
205
206 l = GSUTF8CharacterLength (c);
/* NOTE(review): the bytes are stored only when strictly more than l code
   units remain; with exactly l remaining nothing is written although l
   is still returned.  Confirm whether ">=" was intended. */
207 if (l && limit - d > l)
208 {
/* The cases fall through on purpose: bytes are filled in from last to
   first, each trailing byte taking the low six bits of c. */
209 switch (l)
210 {
211 case 4:
212 d[3] = (c & 0x3F) | 0x80;
213 c >>= 6;
214 case 3:
215 d[2] = (c & 0x3F) | 0x80;
216 c >>= 6;
217 case 2:
218 d[1] = (c & 0x3F) | 0x80;
219 c >>= 6;
220 case 1:
/* Whatever remains of c is combined with the lead-byte marker. */
221 d[0] = c | utf8LeadHeader[l - 1];
222 }
223 }
224
225 return l;
226}
227
/* Decode one character from a UTF-8 string buffer.
 *
 * s      Current read position; the first byte is read unconditionally.
 * limit  Compared against s to check that enough bytes remain.
 * c      Receives the decoded value.  It is written on every path,
 *        including the error paths.
 *
 * Returns the number of code units consumed (s - start).  Every error path
 * resets s to start, so the return value is 0 on failure.
 *
 * NOTE(review): upstream source lines 254, 261, 268 and 275 are absent
 * from this listing.  Line 254 presumably assigns "trail" and the other
 * three presumably hold the "if" conditions guarding the braced error
 * blocks; restore them from the upstream header before compiling.
 */
240CF_INLINE CFIndex
241GSUTF8CharacterGet (const UTF8Char * s, const UTF8Char * limit, UTF32Char *c)
242{
/* Mask applied to the first byte, indexed by the trailing byte count. */
243 static const UTF32Char utf8LeadMask[4] = { 0x0, 0x1F, 0x0F, 0x07 };
244 const UTF8Char *start;
245 UTF32Char ch;
246
247 start = s;
248 ch = *s++;
249
/* Values up to 0x7F are returned as they are. */
250 if (ch > 0x7F)
251 {
252 CFIndex trail;
253
/* Fewer bytes remain than "trail" requires: route to case 0 below. */
255 if (limit - s < trail)
256 trail = 0; /* Force an error */
257 ch &= utf8LeadMask[trail];
/* The cases fall through on purpose; each step shifts in the low six
   bits of the next byte. */
258 switch (trail)
259 {
260 case 3:
262 {
263 s = start;
264 break;
265 }
266 ch = (ch << 6) | (*s++ & 0x3F);
267 case 2:
269 {
270 s = start;
271 break;
272 }
273 ch = (ch << 6) | (*s++ & 0x3F);
274 case 1:
276 {
277 s = start;
278 break;
279 }
280 ch = (ch << 6) | (*s++ & 0x3F);
281 break;
282 case 0:
/* Error: rewind so that zero code units are reported as consumed. */
283 s = start;
284 break;
285 }
286 }
287 *c = ch;
288
289 return s - start;
290}
291
/* Largest number of UTF-16 code units used for a single character;
   GSUTF16CharacterAppend() never returns more than this value. */
300#define kGSUTF16CharacterMaximumLength 2
301
/* Byte Order Mark value as a UTF-16 code unit. */
303#define kGSUTF16CharacterByteOrderMark 0xFEFF
304
/* The same value with its two bytes exchanged. */
306#define kGSUTF16CharacterSwappedByteOrderMark 0xFFFE
307
317CF_INLINE CFIndex
318GSUTF16CharacterAppend (UTF16Char * d, const UTF16Char * limit, UTF32Char c)
319{
320 if (c <= 0xFFFF)
321 {
322 if ((limit - d) > 1)
323 *d = c;
324 return 1;
325 }
326 else if (c <= 0x10FFFF)
327 {
328 if ((limit - d) > 2)
329 {
330 d[0] = (c >> 10) + 0xD7C0;
331 d[1] = (c & 0x3FF) + 0xDC00;
332 }
333 return 2;
334 }
335
336 return 0;
337}
338
/* Decode one character from a UTF-16 string buffer.
 *
 * s      Current read position; the first code unit is read
 *        unconditionally.
 * limit  Compared against s before a second code unit is read.
 * c      Receives the decoded value.  It is written on every path,
 *        including the error path.
 *
 * Returns the number of code units consumed (s - start); this is 0 when
 * the "else" branch below is taken.
 *
 * NOTE(review): upstream source line 363 is absent from this listing.  It
 * presumably completes the "if" condition opened on the line before it;
 * restore it from the upstream header before compiling.
 */
351CF_INLINE CFIndex
352GSUTF16CharacterGet (const UTF16Char * s, const UTF16Char * limit, UTF32Char *c)
353{
354 const UTF16Char *start;
355 UTF32Char ch;
356
357 start = s;
358 ch = *s++;
359
/* Non-surrogate values are returned as they are. */
360 if (GSCharacterIsSurrogate (ch))
361 {
362 if (GSCharacterIsLeadSurrogate (ch) && s < limit
/* Combine the pair; the subtracted constant undoes the 0xD7C0 and 0xDC00
   offsets that GSUTF16CharacterAppend adds when encoding. */
364 ch = (ch << 10) + (*s++) - ((0xD7C0 << 10) + 0xDC00);
365 else
/* Error: step back to start so that zero code units are reported. */
366 --s;
367 }
368 *c = ch;
369
370 return s - start;
371}
372
/* Byte Order Mark value as a UTF-32 code unit. */
379#define kGSUTF32CharacterByteOrderMark 0x0000FEFF
380
/* The same value with its four bytes in reverse order. */
382#define kGSUTF32CharacterSwappedByteOrderMark 0xFFFE0000
386#endif /* __GSCHARACTER_H__ */
387
signed long CFIndex
Definition CFBase.h:165
Boolean GSUTF8CharacterIsTrailing(const UTF8Char c)
Determines if the specified UTF-8 code unit is a trailing code unit.
Definition GSCharacter.h:128
CFIndex GSUTF8CharacterGet(const UTF8Char *s, const UTF8Char *limit, UTF32Char *c)
Get a Unicode code point from a UTF-8 string buffer.
Definition GSCharacter.h:241
CFIndex GSUTF16CharacterGet(const UTF16Char *s, const UTF16Char *limit, UTF32Char *c)
Get a Unicode code point from a UTF-16 string buffer.
Definition GSCharacter.h:352
Boolean GSCharacterIsTrailSurrogate(const UTF32Char c)
Determine if character is a trailing surrogate code point.
Definition GSCharacter.h:97
Boolean GSCharacterIsInSupplementaryPlane(const UTF32Char c)
Determine if character is in one of the supplementary planes.
Definition GSCharacter.h:67
CFIndex GSUTF8CharacterAppendByteOrderMark(UTF8Char *d, const UTF8Char *limit)
Append the UTF-8 Byte Order Mark to the string buffer.
Definition GSCharacter.h:151
Boolean GSCharacterIsWhitespace(const UTF32Char c)
Determine if a character is a whitespace character.
Definition GSCharacter.h:53
Boolean GSCharacterIsASCII(const UTF32Char c)
Determine if a character is an ASCII character (less than 128).
Definition GSCharacter.h:43
Boolean GSCharacterIsLeadSurrogate(const UTF32Char c)
Determine if character is a leading surrogate code point.
Definition GSCharacter.h:87
Boolean GSCharacterIsSurrogate(const UTF32Char c)
Determine if character is a surrogate code point.
Definition GSCharacter.h:77
CFIndex GSUTF8CharacterTrailBytesCount(const UTF8Char c)
Determine the number of trailing bytes for a UTF-8 character based on the leading code unit.
Definition GSCharacter.h:118
CFIndex GSUTF8CharacterLength(const UTF32Char c)
Determine the number of UTF-8 code units required to represent the specified Unicode code point.
Definition GSCharacter.h:139
CFIndex GSUTF16CharacterAppend(UTF16Char *d, const UTF16Char *limit, UTF32Char c)
Append a character to a UTF-16 string buffer.
Definition GSCharacter.h:318
CFIndex GSUTF8CharacterAppend(UTF8Char *d, const UTF8Char *limit, UTF32Char c)
Append a character to a UTF-8 string buffer.
Definition GSCharacter.h:198
Boolean GSUTF8CharacterSkipByteOrderMark(const UTF8Char **s, const UTF8Char *limit)
Determine if a UTF-8 string buffer has a Byte Order Mark.
Definition GSCharacter.h:172