Compare commits

...

2 commits

3 changed files with 154 additions and 134 deletions

View file

@ -10,27 +10,107 @@ typedef uint32_t unicode_char;
#define UNICODE_ERROR (~(unicode_char)0)
// output buffer must be big enough (to be sure, `malloc(4*input_len_ints)`)
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output);
// output buffer must be big enough (to be sure, `malloc(4*input_len_bytes)`)
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output);
// --- read utf-8 ---
/// Returns next unicode char in the given string and advances the pointer.
/// Returns `0` if end of string is reached.
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
unicode_char utf8_next_char(const uint8_t **sp);
/// Returns next unicode char in the given string and advances the pointer.
/// Returns `0` if end of string is reached.
/// Panics if the string contains invalid unicode.
unicode_char utf8_next_char_asserted(const uint8_t **sp);
/// Returns the unicode char stored at the location pointed to by `s`.
///
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
unicode_char utf8_read(const uint8_t *s);
/// Returns next unicode char in the given string.
/// Like `utf8_read`, but panics on invalid unicode.
unicode_char utf8_read_asserted(const uint8_t *s);
/// Returns the next unicode char in the given string and advances the pointer.
///
/// Returns `0` if end of string is reached.
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
unicode_char utf8_next_char_peek(const uint8_t *s);
/// Returns next unicode char in the given string and advances the pointer.
/// Returns `0` if end of string is reached.
/// Panics if the string contains invalid unicode.
unicode_char utf8_next_char_peek_asserted(const uint8_t *s);
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
/// In both of these cases, the pointer position remains unchanged.
///
/// Using this function to move through a string `s`:
/// ```
/// unicode_char c;
/// while (1) {
/// c = utf8_next(&s);
/// if (c == 0) {
/// break;
/// } else if (c == UNICODE_ERROR) {
/// // handle error; `s` was not advanced, so `break` (or skip bytes) to avoid looping forever
/// }
/// // use character
/// printf("%c\n", c);
/// }
/// ```
unicode_char utf8_next(const uint8_t **sp);
/// Like `utf8_next`, but panics on invalid unicode.
///
/// Using this function to move through a string `s`:
/// ```
/// unicode_char c;
/// while ((c = utf8_next_asserted(&s)) != 0) {
/// // use character
/// printf("%c\n", c);
/// }
/// ```
unicode_char utf8_next_asserted(const uint8_t **sp);
// --- write utf-8 ---
/// Writes a single unicode character in utf-8 to the location pointed to by 's'.
///
/// Panics if the character `c` is not valid unicode. Null character is allowed.
///
/// Caller is responsible for providing enough memory to store `c` (at most 4 bytes).
/// Caller is responsible for terminating a string which is constructed using this function.
///
/// Returns the number of bytes written (1 to 4).
///
/// Using this function to manually construct a unicode string:
/// ```
/// uint8_t buf[100];
/// uint8_t *s = buf;
/// s += utf8_write(s, 0x3053);
/// s += utf8_write(s, 0x3093);
/// s += utf8_write(s, 0x306B);
/// s += utf8_write(s, 0x3061);
/// s += utf8_write(s, 0x306F);
/// utf8_write(s, 0);
/// ```
unsigned int utf8_write(uint8_t *s, unicode_char c);
/// Like `utf8_write`, but also advances the string pointer by number of bytes written.
///
/// Panics if the character `c` is the null byte.
/// This is to force the difference between writing and terminating to be more explicit.
/// Usually you also want the end-of-string pointer to point to the null byte, not past it.
///
/// Using this function to manually construct a unicode string:
/// ```
/// uint8_t buf[100];
/// uint8_t *s = buf;
/// utf8_push(&s, 0x3053);
/// utf8_push(&s, 0x3093);
/// utf8_push(&s, 0x306B);
/// utf8_push(&s, 0x3061);
/// utf8_push(&s, 0x306F);
/// utf8_write(s, 0);
/// ```
void utf8_push(uint8_t **sp, unicode_char c);
/// Like `utf8_push`, but also writes a null byte after the written character.
///
/// This is the safest and least-effort function when you want to construct a unicode string.
/// After calling this function, `*sp` always points to the terminating null byte.
///
/// Using this function to manually construct a unicode string:
/// ```
/// uint8_t buf[100];
/// uint8_t *s = buf;
/// utf8_push_terminated(&s, 0x3053);
/// utf8_push_terminated(&s, 0x3093);
/// utf8_push_terminated(&s, 0x306B);
/// utf8_push_terminated(&s, 0x3061);
/// utf8_push_terminated(&s, 0x306F);
/// ```
void utf8_push_terminated(uint8_t **sp, unicode_char c);
#endif

View file

@ -60,7 +60,7 @@ bool streq(const char *a, const char *b) {
bool strcontains(const char *s, unicode_char c) {
while (1) {
unicode_char cc = utf8_next_char_asserted((const uint8_t **)&s);
unicode_char cc = utf8_next_asserted((const uint8_t **)&s);
if (cc == 0) {
return false;
} else if (cc == c) {
@ -111,7 +111,7 @@ void putln(void) {
void puts(const char *s) {
unicode_char c;
while ((c = utf8_next_char_asserted((const uint8_t **)&s)) != 0) {
while ((c = utf8_next_asserted((const uint8_t **)&s)) != 0) {
putc(c);
}
}
@ -239,7 +239,7 @@ void printf(const char *format, ...) {
const uint8_t *uformat = (const uint8_t *)format;
unicode_char c;
while ((c = utf8_next_char_asserted(&uformat)) != 0) {
while ((c = utf8_next_asserted(&uformat)) != 0) {
if (c == '%') {
bool do_number = false;
int number_size_bits = 32;
@ -247,25 +247,25 @@ void printf(const char *format, ...) {
char base;
bool sign;
// Check for length modifiers:
switch (utf8_next_char_peek_asserted(uformat)) {
switch (utf8_read_asserted(uformat)) {
case 'h':
do_number = true;
number_size_bits = 16; // 16-bit (promoted to int in varargs)
utf8_next_char_asserted(&uformat);
utf8_next_asserted(&uformat);
break;
case 'l':
do_number = true;
number_size_bits = 64; // 64-bit
utf8_next_char_asserted(&uformat);
utf8_next_asserted(&uformat);
break;
default:
break;
}
if (do_number && !strcontains("duxob", utf8_next_char_peek(uformat))) {
if (do_number && !strcontains("duxob", utf8_read(uformat))) {
PANIC("printf: expected number format specifier after %h or %l");
}
c = utf8_next_char_asserted(&uformat);
c = utf8_next_asserted(&uformat);
switch (c) {
case 'd':
do_number = true;
@ -307,14 +307,14 @@ void printf(const char *format, ...) {
break;
}
case 'X': {
unicode_char size1 = utf8_next_char_asserted(&uformat);
unicode_char size1 = utf8_next_asserted(&uformat);
ASSERT(size1 != 0);
if (size1 == '8') {
uint8_t val = (uint8_t)va_arg(args, int);
putu8x(val);
break;
}
unicode_char size2 = utf8_next_char_asserted(&uformat);
unicode_char size2 = utf8_next_asserted(&uformat);
ASSERT(size2 != 0);
if (size1 == '1' && size2 == '6') {
uint16_t val = (uint16_t)va_arg(args, int);
@ -332,19 +332,6 @@ void printf(const char *format, ...) {
PANIC("printf: invalid X size");
}
}
// unicode
// remember that you could use unicode in the format string directly,
// these are just alternative ways
// also remember that you can print custom unicode chars with %c
// TODO more than 16 bit
case 'U': {
unicode_char spec1 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
unicode_char spec2 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
unicode_char spec3 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
unicode_char spec4 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
putc((spec1 << 12) | (spec2 << 8) | (spec3 << 4) | spec4);
break;
}
case '%': {
putc('%');
break;

View file

@ -1,94 +1,11 @@
#include "unicode.h"
#include "std.h"
/// Encodes `input_len_ints` UTF-32 code points from `input` as UTF-8 into `output`.
///
/// The output buffer must be big enough; `4 * input_len_ints` bytes always suffices.
/// No terminating null byte is appended.
///
/// Returns `true` on success. Returns `false` as soon as a value above 0x10ffff is
/// encountered; the bytes for all preceding code points have already been written.
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output) {
    for (unsigned int i = 0; i < input_len_ints; i++) {
        uint32_t c = input[i];
        if (c <= 0x7f) {
            // 0xxxxxxx
            *output++ = (uint8_t)c;
        } else if (c <= 0x7ff) {
            // 110xxxxx 10xxxxxx
            *output++ = (uint8_t)(0xc0 | (c >> 6));
            *output++ = (uint8_t)(0x80 | (c & 0x3f));
        } else if (c <= 0xffff) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            *output++ = (uint8_t)(0xe0 | (c >> 12));
            *output++ = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
            *output++ = (uint8_t)(0x80 | (c & 0x3f));
        } else if (c <= 0x10ffff) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The lead byte marker is 0xf0; using 0x0f here would emit an
            // invalid lead byte for every code point above U+FFFF.
            *output++ = (uint8_t)(0xf0 | (c >> 18));
            *output++ = (uint8_t)(0x80 | ((c >> 12) & 0x3f));
            *output++ = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
            *output++ = (uint8_t)(0x80 | (c & 0x3f));
        } else {
            // Beyond the Unicode code space.
            return false;
        }
    }
    return true;
}
// Bails out of the enclosing `bool` function with `false` when `cond` does not hold.
#define ASSERT_OR_RETURN(cond) if (!(cond)) { return false; }
/// Decodes `input_len_bytes` bytes of UTF-8 from `input` into UTF-32 code points in `output`.
///
/// The output buffer must be big enough; one `uint32_t` per input byte always suffices.
/// The input is processed by length, so embedded null bytes are decoded like any other
/// character and no terminator is appended to the output.
///
/// Returns `true` on success. Returns `false` on the first malformed sequence
/// (bad lead byte, bad or missing continuation byte, overlong encoding, or a value
/// above 0x10ffff); code points decoded before that point have already been written.
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output) {
    while (input_len_bytes > 0) {
        uint8_t b0 = input[0];
        uint32_t codepoint;
        unsigned int seq_len;

        if ((b0 & 0x80) == 0) {
            // 0xxxxxxx
            seq_len = 1;
            codepoint = (uint32_t)b0;
        } else if ((b0 & 0xe0) == 0xc0) {
            // 110xxxxx 10xxxxxx
            seq_len = 2;
            ASSERT_OR_RETURN(input_len_bytes >= seq_len);
            uint8_t b1 = input[1];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x1f) << 6)
                      | (uint32_t)(b1 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0x7f);
        } else if ((b0 & 0xf0) == 0xe0) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            seq_len = 3;
            ASSERT_OR_RETURN(input_len_bytes >= seq_len);
            uint8_t b1 = input[1];
            uint8_t b2 = input[2];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x0f) << 12)
                      | ((uint32_t)(b1 & 0x3f) << 6)
                      | (uint32_t)(b2 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0x7ff);
        } else if ((b0 & 0xf8) == 0xf0) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            seq_len = 4;
            ASSERT_OR_RETURN(input_len_bytes >= seq_len);
            uint8_t b1 = input[1];
            uint8_t b2 = input[2];
            // The fourth byte lives at index 3; reading index 2 again would skip
            // validation of the last byte and corrupt the decoded value.
            uint8_t b3 = input[3];
            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
            ASSERT_OR_RETURN((b3 & 0xc0) == 0x80);
            codepoint = ((uint32_t)(b0 & 0x07) << 18)
                      | ((uint32_t)(b1 & 0x3f) << 12)
                      | ((uint32_t)(b2 & 0x3f) << 6)
                      | (uint32_t)(b3 & 0x3f);
            // avoid overlong encoding
            ASSERT_OR_RETURN(codepoint > 0xffff);
            ASSERT_OR_RETURN(codepoint <= 0x10ffff);
        } else {
            // Stray continuation byte or a lead byte for a 5+ byte sequence.
            return false;
        }

        *output++ = codepoint;
        input += seq_len;
        input_len_bytes -= seq_len;
    }
    return true;
}
// --- read utf-8 ---
#define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }
unicode_char utf8_next_char(const uint8_t **sp) {
unicode_char utf8_next(const uint8_t **sp) {
uint8_t b0, b1, b2, b3;
unicode_char codepoint;
@ -136,25 +53,61 @@ unicode_char utf8_next_char(const uint8_t **sp) {
// avoid overlong encoding
ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
*sp += 3;
*sp += 4;
return codepoint;
} else {
return UNICODE_ERROR;
}
}
unicode_char utf8_next_char_asserted(const uint8_t **sp) {
unicode_char c = utf8_next_char(sp);
unicode_char utf8_next_asserted(const uint8_t **sp) {
unicode_char c = utf8_next(sp);
ASSERT(c != UNICODE_ERROR);
return c;
}
unicode_char utf8_next_char_peek(const uint8_t *s) {
unicode_char utf8_read(const uint8_t *s) {
const uint8_t *ptr_copy = s;
return utf8_next_char(&ptr_copy);
return utf8_next(&ptr_copy);
}
unicode_char utf8_next_char_peek_asserted(const uint8_t *s) {
unicode_char utf8_read_asserted(const uint8_t *s) {
const uint8_t *ptr_copy = s;
return utf8_next_char_asserted(&ptr_copy);
return utf8_next_asserted(&ptr_copy);
}
// --- write utf-8 ---
/// Stores the UTF-8 encoding of `c` at `s` and reports how many bytes were stored (1 to 4).
///
/// Nothing is written after the encoded character; terminating the string is up to the caller.
/// Values above 0x10ffff end up in `PANIC`.
unsigned int utf8_write(uint8_t *s, unicode_char c) {
    if (c < 0x80) {
        // 0xxxxxxx
        *s = (uint8_t)c;
        return 1;
    }
    if (c < 0x800) {
        // 110xxxxx 10xxxxxx
        s[0] = (uint8_t)(0xc0 | (c >> 6));
        s[1] = (uint8_t)(0x80 | (c & 0x3f));
        return 2;
    }
    if (c < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = (uint8_t)(0xe0 | (c >> 12));
        s[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
        s[2] = (uint8_t)(0x80 | (c & 0x3f));
        return 3;
    }
    if (c < 0x110000) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = (uint8_t)(0xf0 | (c >> 18));
        s[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3f));
        s[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
        s[3] = (uint8_t)(0x80 | (c & 0x3f));
        return 4;
    }
    PANIC("invalid unicode char");
}
/// Appends `c` in UTF-8 at `*sp` and moves `*sp` past the bytes just written.
///
/// The null character is rejected via `ASSERT`; no terminator is written.
void utf8_push(uint8_t **sp, unicode_char c) {
    ASSERT(c != 0);
    uint8_t *dst = *sp;
    unsigned int written = utf8_write(dst, c);
    *sp = dst + written;
}
/// Appends `c` via `utf8_push`, then stores a null byte at the new `*sp`.
///
/// `*sp` is left pointing at that null byte, so a following push overwrites it.
void utf8_push_terminated(uint8_t **sp, unicode_char c) {
    utf8_push(sp, c);
    uint8_t *end = *sp;
    end[0] = 0;
}