Compare commits
2 commits
734dcfbe55
...
541292b002
| Author | SHA1 | Date | |
|---|---|---|---|
| 541292b002 | |||
| bf1e2bdf38 |
3 changed files with 154 additions and 134 deletions
|
|
@ -10,27 +10,107 @@ typedef uint32_t unicode_char;
|
|||
|
||||
#define UNICODE_ERROR (~(unicode_char)0)
|
||||
|
||||
// output buffer must be big enough (to be sure, `malloc(4*input_len_ints)`)
|
||||
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output);
|
||||
// output buffer must be big enough (to be sure, `malloc(4*input_len_bytes)`)
|
||||
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output);
|
||||
// --- read utf-8 ---
|
||||
|
||||
/// Returns next unicode char in the given string and advances the pointer.
|
||||
/// Returns `0` if end of string is reached.
|
||||
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
|
||||
unicode_char utf8_next_char(const uint8_t **sp);
|
||||
/// Returns next unicode char in the given string and advances the pointer.
|
||||
/// Returns `0` if end of string is reached.
|
||||
/// Panics if the string contains invalid unicode.
|
||||
unicode_char utf8_next_char_asserted(const uint8_t **sp);
|
||||
/// Returns the unicode char stored at the location pointed to by `s`.
|
||||
///
|
||||
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
|
||||
unicode_char utf8_read(const uint8_t *s);
|
||||
|
||||
/// Returns next unicode char in the given string.
|
||||
/// Like `utf8_read`, but panics on invalid unicode.
|
||||
unicode_char utf8_read_asserted(const uint8_t *s);
|
||||
|
||||
/// Returns the next unicode char in the given string and advances the pointer.
|
||||
///
|
||||
/// Returns `0` if end of string is reached.
|
||||
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
|
||||
unicode_char utf8_next_char_peek(const uint8_t *s);
|
||||
/// Returns next unicode char in the given string and advances the pointer.
|
||||
/// Returns `0` if end of string is reached.
|
||||
/// Panics if the string contains invalid unicode.
|
||||
unicode_char utf8_next_char_peek_asserted(const uint8_t *s);
|
||||
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
|
||||
/// In both of these cases, the pointer position remains unchanged.
|
||||
///
|
||||
/// Using this function to move through a string `s`:
|
||||
/// ```
|
||||
/// unicode_char c;
|
||||
/// while (1) {
|
||||
/// c = utf8_next(&s);
|
||||
/// if (c == 0) {
|
||||
/// break;
|
||||
/// } else if (c == UNICODE_ERROR) {
|
||||
/// // handle error; the pointer was not advanced, so stop iterating here
/// break;
|
||||
/// }
|
||||
/// // use character
|
||||
/// printf("%c\n", c);
|
||||
/// }
|
||||
/// ```
|
||||
unicode_char utf8_next(const uint8_t **sp);
|
||||
|
||||
/// Like `utf8_next`, but panics on invalid unicode.
|
||||
///
|
||||
/// Using this function to move through a string `s`:
|
||||
/// ```
|
||||
/// unicode_char c;
|
||||
/// while ((c = utf8_next_asserted(&s)) != 0) {
|
||||
/// // use character
|
||||
/// printf("%c\n", c);
|
||||
/// }
/// ```
|
||||
unicode_char utf8_next_asserted(const uint8_t **sp);
|
||||
|
||||
// --- write utf-8 ---
|
||||
|
||||
/// Writes a single unicode character in utf-8 to the location pointed to by `s`.
|
||||
///
|
||||
/// Panics if the character `c` is not valid unicode. Null character is allowed.
|
||||
///
|
||||
/// Caller is responsible for providing enough memory to store `c` (at most 4 bytes).
|
||||
/// Caller is responsible for terminating a string which is constructed using this function.
|
||||
///
|
||||
/// Returns the number of bytes written (1 to 4).
|
||||
///
|
||||
/// Using this function to manually construct a unicode string:
|
||||
/// ```
|
||||
/// uint8_t buf[100];
|
||||
/// uint8_t *s = buf;
|
||||
/// s += utf8_write(s, 0x3053);
|
||||
/// s += utf8_write(s, 0x3093);
|
||||
/// s += utf8_write(s, 0x306B);
|
||||
/// s += utf8_write(s, 0x3061);
|
||||
/// s += utf8_write(s, 0x306F);
|
||||
/// utf8_write(s, 0);
|
||||
/// ```
|
||||
unsigned int utf8_write(uint8_t *s, unicode_char c);
|
||||
|
||||
/// Like `utf8_write`, but also advances the string pointer by the number of bytes written.
|
||||
///
|
||||
/// Panics if the character `c` is the null byte.
|
||||
/// This is to force the difference between writing and terminating to be more explicit.
|
||||
/// Usually you also want the end-of-string pointer to point to the null byte, not past it.
|
||||
///
|
||||
/// Using this function to manually construct a unicode string:
|
||||
/// ```
|
||||
/// uint8_t buf[100];
|
||||
/// uint8_t *s = buf;
|
||||
/// utf8_push(&s, 0x3053);
|
||||
/// utf8_push(&s, 0x3093);
|
||||
/// utf8_push(&s, 0x306B);
|
||||
/// utf8_push(&s, 0x3061);
|
||||
/// utf8_push(&s, 0x306F);
|
||||
/// utf8_write(s, 0);
|
||||
/// ```
|
||||
void utf8_push(uint8_t **sp, unicode_char c);
|
||||
|
||||
/// Like `utf8_push`, but also writes a null byte after the written character.
|
||||
///
|
||||
/// This is the safest and least-effort function when you want to construct a unicode string.
|
||||
/// After calling this function, `*sp` always points to the terminating null byte.
|
||||
///
|
||||
/// Using this function to manually construct a unicode string:
|
||||
/// ```
|
||||
/// uint8_t buf[100];
|
||||
/// uint8_t *s = buf;
|
||||
/// utf8_push_terminated(&s, 0x3053);
|
||||
/// utf8_push_terminated(&s, 0x3093);
|
||||
/// utf8_push_terminated(&s, 0x306B);
|
||||
/// utf8_push_terminated(&s, 0x3061);
|
||||
/// utf8_push_terminated(&s, 0x306F);
|
||||
/// ```
|
||||
void utf8_push_terminated(uint8_t **sp, unicode_char c);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
33
src/std.c
33
src/std.c
|
|
@ -60,7 +60,7 @@ bool streq(const char *a, const char *b) {
|
|||
|
||||
bool strcontains(const char *s, unicode_char c) {
|
||||
while (1) {
|
||||
unicode_char cc = utf8_next_char_asserted((const uint8_t **)&s);
|
||||
unicode_char cc = utf8_next_asserted((const uint8_t **)&s);
|
||||
if (cc == 0) {
|
||||
return false;
|
||||
} else if (cc == c) {
|
||||
|
|
@ -111,7 +111,7 @@ void putln(void) {
|
|||
|
||||
void puts(const char *s) {
|
||||
unicode_char c;
|
||||
while ((c = utf8_next_char_asserted((const uint8_t **)&s)) != 0) {
|
||||
while ((c = utf8_next_asserted((const uint8_t **)&s)) != 0) {
|
||||
putc(c);
|
||||
}
|
||||
}
|
||||
|
|
@ -239,7 +239,7 @@ void printf(const char *format, ...) {
|
|||
const uint8_t *uformat = (const uint8_t *)format;
|
||||
|
||||
unicode_char c;
|
||||
while ((c = utf8_next_char_asserted(&uformat)) != 0) {
|
||||
while ((c = utf8_next_asserted(&uformat)) != 0) {
|
||||
if (c == '%') {
|
||||
bool do_number = false;
|
||||
int number_size_bits = 32;
|
||||
|
|
@ -247,25 +247,25 @@ void printf(const char *format, ...) {
|
|||
char base;
|
||||
bool sign;
|
||||
// Check for length modifiers:
|
||||
switch (utf8_next_char_peek_asserted(uformat)) {
|
||||
switch (utf8_read_asserted(uformat)) {
|
||||
case 'h':
|
||||
do_number = true;
|
||||
number_size_bits = 16; // 16-bit (promoted to int in varargs)
|
||||
utf8_next_char_asserted(&uformat);
|
||||
utf8_next_asserted(&uformat);
|
||||
break;
|
||||
case 'l':
|
||||
do_number = true;
|
||||
number_size_bits = 64; // 64-bit
|
||||
utf8_next_char_asserted(&uformat);
|
||||
utf8_next_asserted(&uformat);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (do_number && !strcontains("duxob", utf8_next_char_peek(uformat))) {
|
||||
if (do_number && !strcontains("duxob", utf8_read(uformat))) {
|
||||
PANIC("printf: expected number format specifier after %h or %l");
|
||||
}
|
||||
|
||||
c = utf8_next_char_asserted(&uformat);
|
||||
c = utf8_next_asserted(&uformat);
|
||||
switch (c) {
|
||||
case 'd':
|
||||
do_number = true;
|
||||
|
|
@ -307,14 +307,14 @@ void printf(const char *format, ...) {
|
|||
break;
|
||||
}
|
||||
case 'X': {
|
||||
unicode_char size1 = utf8_next_char_asserted(&uformat);
|
||||
unicode_char size1 = utf8_next_asserted(&uformat);
|
||||
ASSERT(size1 != 0);
|
||||
if (size1 == '8') {
|
||||
uint8_t val = (uint8_t)va_arg(args, int);
|
||||
putu8x(val);
|
||||
break;
|
||||
}
|
||||
unicode_char size2 = utf8_next_char_asserted(&uformat);
|
||||
unicode_char size2 = utf8_next_asserted(&uformat);
|
||||
ASSERT(size2 != 0);
|
||||
if (size1 == '1' && size2 == '6') {
|
||||
uint16_t val = (uint16_t)va_arg(args, int);
|
||||
|
|
@ -332,19 +332,6 @@ void printf(const char *format, ...) {
|
|||
PANIC("printf: invalid X size");
|
||||
}
|
||||
}
|
||||
// unicode
|
||||
// remember that you could use unicode in the format string directly,
|
||||
// these are just alternative ways
|
||||
// also remember that you can print custom unicode chars with %c
|
||||
// TODO more than 16 bit
|
||||
case 'U': {
|
||||
unicode_char spec1 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
||||
unicode_char spec2 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
||||
unicode_char spec3 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
||||
unicode_char spec4 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
||||
putc((spec1 << 12) | (spec2 << 8) | (spec3 << 4) | spec4);
|
||||
break;
|
||||
}
|
||||
case '%': {
|
||||
putc('%');
|
||||
break;
|
||||
|
|
|
|||
137
src/unicode.c
137
src/unicode.c
|
|
@ -1,94 +1,11 @@
|
|||
#include "unicode.h"
|
||||
#include "std.h"
|
||||
|
||||
// Encodes `input_len_ints` UTF-32 code points read from `input` as UTF-8 bytes
// written to `output`.
//
// The caller must supply an output buffer that is big enough; each code point
// produces at most 4 bytes. No terminator is appended.
//
// Returns `true` when every code point was encoded. Returns `false` as soon as
// a value above 0x10ffff is found; the bytes of all preceding code points have
// already been written to `output` at that point.
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output) {
    for (unsigned int i = 0; i < input_len_ints; i++) {
        uint32_t c = input[i];
        if (c <= 0x7f) {
            // 1 byte: 0xxxxxxx
            *output++ = (uint8_t)c;
        } else if (c <= 0x7ff) {
            // 2 bytes: 110xxxxx 10xxxxxx
            *output++ = 0xc0 | (uint8_t)(c >> 6);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else if (c <= 0xffff) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            *output++ = 0xe0 | (uint8_t)(c >> 12);
            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else if (c <= 0x10ffff) {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The lead byte marker is 0xf0; 0x0f would produce a byte that
            // looks like plain ASCII/control data instead of a 4-byte lead.
            *output++ = 0xf0 | (uint8_t)(c >> 18);
            *output++ = 0x80 | (uint8_t)((c >> 12) & 0x3f);
            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else {
            // Beyond the last Unicode code point.
            return false;
        }
    }
    return true;
}
|
||||
|
||||
// Leaves the enclosing bool-returning function with `false` when `cond` does
// not hold. Wrapped in do/while so it behaves as a single statement.
#define ASSERT_OR_RETURN(cond) do { if (!(cond)) { return false; } } while (0)

// Decodes `input_len_bytes` bytes of UTF-8 read from `input` into UTF-32 code
// points written to `output`.
//
// The caller must supply an output buffer that is big enough; one `uint32_t`
// per input byte is always sufficient. No terminator is appended.
//
// Returns `true` when the whole input was decoded. Returns `false` on a
// truncated sequence, a bad lead or continuation byte, an overlong encoding,
// or a value above 0x10ffff; code points decoded before the failure have
// already been written to `output`.
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output) {
    while (input_len_bytes > 0) {
        uint8_t b0 = input[0];
        uint32_t codepoint;
        if ((b0 & 0x80) == 0) {
            // 1 byte: 0xxxxxxx
            *output++ = (uint32_t)b0;
            input += 1;
            input_len_bytes -= 1;
        } else if ((b0 & 0xe0) == 0xc0) {
            // 2 bytes: 110xxxxx 10xxxxxx
            ASSERT_OR_RETURN(input_len_bytes >= 2);
            uint8_t b1 = input[1];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x1f) << 6)
                | (b1 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0x7f);
            *output++ = codepoint;
            input += 2;
            input_len_bytes -= 2;
        } else if ((b0 & 0xf0) == 0xe0) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            ASSERT_OR_RETURN(input_len_bytes >= 3);
            uint8_t b1 = input[1];
            uint8_t b2 = input[2];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x0f) << 12)
                | ((uint32_t)(b1 & 0x3f) << 6)
                | (b2 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0x7ff);
            *output++ = codepoint;
            input += 3;
            input_len_bytes -= 3;
        } else if ((b0 & 0xf8) == 0xf0) {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ASSERT_OR_RETURN(input_len_bytes >= 4);
            uint8_t b1 = input[1];
            uint8_t b2 = input[2];
            // The fourth byte lives at index 3; reading index 2 again would
            // skip its validation and corrupt the low six bits.
            uint8_t b3 = input[3];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b3 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x07) << 18)
                | ((uint32_t)(b1 & 0x3f) << 12)
                | ((uint32_t)(b2 & 0x3f) << 6)
                | (b3 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0xffff);
            ASSERT_OR_RETURN(codepoint <= 0x10ffff);
            *output++ = codepoint;
            input += 4;
            input_len_bytes -= 4;
        } else {
            // Stray continuation byte or a lead byte for 5+ byte sequences.
            return false;
        }
    }
    return true;
}
|
||||
// --- read utf-8 ---
|
||||
|
||||
#define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }
|
||||
|
||||
unicode_char utf8_next_char(const uint8_t **sp) {
|
||||
unicode_char utf8_next(const uint8_t **sp) {
|
||||
uint8_t b0, b1, b2, b3;
|
||||
unicode_char codepoint;
|
||||
|
||||
|
|
@ -136,25 +53,61 @@ unicode_char utf8_next_char(const uint8_t **sp) {
|
|||
// avoid overlong encoding
|
||||
ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
|
||||
ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
|
||||
*sp += 3;
|
||||
*sp += 4;
|
||||
return codepoint;
|
||||
} else {
|
||||
return UNICODE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
unicode_char utf8_next_char_asserted(const uint8_t **sp) {
|
||||
unicode_char c = utf8_next_char(sp);
|
||||
unicode_char utf8_next_asserted(const uint8_t **sp) {
|
||||
unicode_char c = utf8_next(sp);
|
||||
ASSERT(c != UNICODE_ERROR);
|
||||
return c;
|
||||
}
|
||||
|
||||
unicode_char utf8_next_char_peek(const uint8_t *s) {
|
||||
unicode_char utf8_read(const uint8_t *s) {
|
||||
const uint8_t *ptr_copy = s;
|
||||
return utf8_next_char(&ptr_copy);
|
||||
return utf8_next(&ptr_copy);
|
||||
}
|
||||
|
||||
unicode_char utf8_next_char_peek_asserted(const uint8_t *s) {
|
||||
unicode_char utf8_read_asserted(const uint8_t *s) {
|
||||
const uint8_t *ptr_copy = s;
|
||||
return utf8_next_char_asserted(&ptr_copy);
|
||||
return utf8_next_asserted(&ptr_copy);
|
||||
}
|
||||
|
||||
// --- write utf-8 ---
|
||||
|
||||
// Stores the UTF-8 encoding of `c` at `s` and returns how many bytes were
// stored (1 to 4). Nothing is written after the encoded bytes.
// A value above 0x10ffff reaches PANIC instead of being encoded.
unsigned int utf8_write(uint8_t *s, unicode_char c) {
    if (c > 0x10ffff) {
        PANIC("invalid unicode char");
    } else if (c > 0xffff) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[3] = 0x80 | (c & 0x3f);
        s[2] = 0x80 | ((c >> 6) & 0x3f);
        s[1] = 0x80 | ((c >> 12) & 0x3f);
        s[0] = 0xf0 | (c >> 18);
        return 4;
    } else if (c > 0x7ff) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[2] = 0x80 | (c & 0x3f);
        s[1] = 0x80 | ((c >> 6) & 0x3f);
        s[0] = 0xe0 | (c >> 12);
        return 3;
    } else if (c > 0x7f) {
        // 110xxxxx 10xxxxxx
        s[1] = 0x80 | (c & 0x3f);
        s[0] = 0xc0 | (c >> 6);
        return 2;
    } else {
        // 0xxxxxxx
        s[0] = (uint8_t) c;
        return 1;
    }
}
|
||||
|
||||
// Encodes `c` at `*sp` through `utf8_write` and moves `*sp` forward past the
// bytes that were written. A zero `c` trips the ASSERT.
void utf8_push(uint8_t **sp, unicode_char c) {
    ASSERT(c != 0);
    uint8_t *cursor = *sp;
    cursor += utf8_write(cursor, c);
    *sp = cursor;
}
|
||||
|
||||
// Pushes `c` with `utf8_push`, then stores a zero byte at the advanced `*sp`
// without moving the pointer past it.
void utf8_push_terminated(uint8_t **sp, unicode_char c) {
    utf8_push(sp, c);
    (*sp)[0] = 0;
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue