Compare commits
2 commits
734dcfbe55
...
541292b002
| Author | SHA1 | Date | |
|---|---|---|---|
| 541292b002 | |||
| bf1e2bdf38 |
3 changed files with 154 additions and 134 deletions
|
|
@ -10,27 +10,107 @@ typedef uint32_t unicode_char;
|
||||||
|
|
||||||
#define UNICODE_ERROR (~(unicode_char)0)
|
#define UNICODE_ERROR (~(unicode_char)0)
|
||||||
|
|
||||||
// output buffer must be big enough (to be sure, `malloc(4*input_len_ints)`)
|
// --- read utf-8 ---
|
||||||
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output);
|
|
||||||
// output buffer must be big enough (to be sure, `malloc(4*input_len_bytes)`)
|
|
||||||
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output);
|
|
||||||
|
|
||||||
/// Returns next unicode char in the given string and advances the pointer.
|
/// Returns the unicode char stored at the location pointed to by `s`.
|
||||||
/// Returns `0` if end of string is reached.
|
///
|
||||||
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
|
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
|
||||||
unicode_char utf8_next_char(const uint8_t **sp);
|
unicode_char utf8_read(const uint8_t *s);
|
||||||
/// Returns next unicode char in the given string and advances the pointer.
|
|
||||||
/// Returns `0` if end of string is reached.
|
|
||||||
/// Panics if the string contains invalid unicode.
|
|
||||||
unicode_char utf8_next_char_asserted(const uint8_t **sp);
|
|
||||||
|
|
||||||
/// Returns next unicode char in the given string.
|
/// Like `utf8_read`, but panics on invalid unicode.
|
||||||
|
unicode_char utf8_read_asserted(const uint8_t *s);
|
||||||
|
|
||||||
|
/// Returns the next unicode char in the given string and advances the pointer.
|
||||||
|
///
|
||||||
/// Returns `0` if end of string is reached.
|
/// Returns `0` if end of string is reached.
|
||||||
/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
|
/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
|
||||||
unicode_char utf8_next_char_peek(const uint8_t *s);
|
/// In both of these cases, the pointer position remains unchanged.
|
||||||
/// Returns next unicode char in the given string and advances the pointer.
|
///
|
||||||
/// Returns `0` if end of string is reached.
|
/// Using this function to move through a string `s`:
|
||||||
/// Panics if the string contains invalid unicode.
|
/// ```
|
||||||
unicode_char utf8_next_char_peek_asserted(const uint8_t *s);
|
/// unicode_char c;
|
||||||
|
/// while (1) {
|
||||||
|
/// c = utf8_next(&s);
|
||||||
|
/// if (c == 0) {
|
||||||
|
/// break;
|
||||||
|
/// } else if (c == UNICODE_ERROR) {
|
||||||
|
/// // handle error and leave the loop (the pointer is not advanced on error)
|
||||||
|
/// }
|
||||||
|
/// // use character
|
||||||
|
/// printf("%c\n", c);
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
unicode_char utf8_next(const uint8_t **sp);
|
||||||
|
|
||||||
|
/// Like `utf8_next`, but panics on invalid unicode.
|
||||||
|
///
|
||||||
|
/// Using this function to move through a string `s`:
|
||||||
|
/// ```
|
||||||
|
/// unicode_char c;
|
||||||
|
/// while ((c = utf8_next_asserted(&s)) != 0) {
|
||||||
|
/// // use character
|
||||||
|
/// printf("%c\n", c);
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
unicode_char utf8_next_asserted(const uint8_t **sp);
|
||||||
|
|
||||||
|
// --- write utf-8 ---
|
||||||
|
|
||||||
|
/// Writes a single unicode character in utf-8 to the location pointed to by `s`.
|
||||||
|
///
|
||||||
|
/// Panics if the character `c` is not valid unicode. Null character is allowed.
|
||||||
|
///
|
||||||
|
/// Caller is responsible for providing enough memory to store `c` (at most 4 bytes).
|
||||||
|
/// Caller is responsible for terminating a string which is constructed using this function.
|
||||||
|
///
|
||||||
|
/// Returns the number of bytes written (1 to 4).
|
||||||
|
///
|
||||||
|
/// Using this function to manually construct a unicode string:
|
||||||
|
/// ```
|
||||||
|
/// uint8_t buf[100];
|
||||||
|
/// uint8_t *s = buf;
|
||||||
|
/// s += utf8_write(s, 0x3053);
|
||||||
|
/// s += utf8_write(s, 0x3093);
|
||||||
|
/// s += utf8_write(s, 0x306B);
|
||||||
|
/// s += utf8_write(s, 0x3061);
|
||||||
|
/// s += utf8_write(s, 0x306F);
|
||||||
|
/// utf8_write(s, 0);
|
||||||
|
/// ```
|
||||||
|
unsigned int utf8_write(uint8_t *s, unicode_char c);
|
||||||
|
|
||||||
|
/// Like `utf8_write`, but also advances the string pointer by the number of bytes written.
|
||||||
|
///
|
||||||
|
/// Panics if the character `c` is the null byte.
|
||||||
|
/// This is to force the difference between writing and terminating to be more explicit.
|
||||||
|
/// Usually you also want the end-of-string pointer to point to the null byte, not past it.
|
||||||
|
///
|
||||||
|
/// Using this function to manually construct a unicode string:
|
||||||
|
/// ```
|
||||||
|
/// uint8_t buf[100];
|
||||||
|
/// uint8_t *s = buf;
|
||||||
|
/// utf8_push(&s, 0x3053);
|
||||||
|
/// utf8_push(&s, 0x3093);
|
||||||
|
/// utf8_push(&s, 0x306B);
|
||||||
|
/// utf8_push(&s, 0x3061);
|
||||||
|
/// utf8_push(&s, 0x306F);
|
||||||
|
/// utf8_write(s, 0);
|
||||||
|
/// ```
|
||||||
|
void utf8_push(uint8_t **sp, unicode_char c);
|
||||||
|
|
||||||
|
/// Like `utf8_push`, but also writes a null byte after the written character.
|
||||||
|
///
|
||||||
|
/// This is the safest and least-effort function when you want to construct a unicode string.
|
||||||
|
/// After calling this function, `*sp` always points to the terminating null byte.
|
||||||
|
///
|
||||||
|
/// Using this function to manually construct a unicode string:
|
||||||
|
/// ```
|
||||||
|
/// uint8_t buf[100];
|
||||||
|
/// uint8_t *s = buf;
|
||||||
|
/// utf8_push_terminated(&s, 0x3053);
|
||||||
|
/// utf8_push_terminated(&s, 0x3093);
|
||||||
|
/// utf8_push_terminated(&s, 0x306B);
|
||||||
|
/// utf8_push_terminated(&s, 0x3061);
|
||||||
|
/// utf8_push_terminated(&s, 0x306F);
|
||||||
|
/// ```
|
||||||
|
void utf8_push_terminated(uint8_t **sp, unicode_char c);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
33
src/std.c
33
src/std.c
|
|
@ -60,7 +60,7 @@ bool streq(const char *a, const char *b) {
|
||||||
|
|
||||||
bool strcontains(const char *s, unicode_char c) {
|
bool strcontains(const char *s, unicode_char c) {
|
||||||
while (1) {
|
while (1) {
|
||||||
unicode_char cc = utf8_next_char_asserted((const uint8_t **)&s);
|
unicode_char cc = utf8_next_asserted((const uint8_t **)&s);
|
||||||
if (cc == 0) {
|
if (cc == 0) {
|
||||||
return false;
|
return false;
|
||||||
} else if (cc == c) {
|
} else if (cc == c) {
|
||||||
|
|
@ -111,7 +111,7 @@ void putln(void) {
|
||||||
|
|
||||||
void puts(const char *s) {
|
void puts(const char *s) {
|
||||||
unicode_char c;
|
unicode_char c;
|
||||||
while ((c = utf8_next_char_asserted((const uint8_t **)&s)) != 0) {
|
while ((c = utf8_next_asserted((const uint8_t **)&s)) != 0) {
|
||||||
putc(c);
|
putc(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -239,7 +239,7 @@ void printf(const char *format, ...) {
|
||||||
const uint8_t *uformat = (const uint8_t *)format;
|
const uint8_t *uformat = (const uint8_t *)format;
|
||||||
|
|
||||||
unicode_char c;
|
unicode_char c;
|
||||||
while ((c = utf8_next_char_asserted(&uformat)) != 0) {
|
while ((c = utf8_next_asserted(&uformat)) != 0) {
|
||||||
if (c == '%') {
|
if (c == '%') {
|
||||||
bool do_number = false;
|
bool do_number = false;
|
||||||
int number_size_bits = 32;
|
int number_size_bits = 32;
|
||||||
|
|
@ -247,25 +247,25 @@ void printf(const char *format, ...) {
|
||||||
char base;
|
char base;
|
||||||
bool sign;
|
bool sign;
|
||||||
// Check for length modifiers:
|
// Check for length modifiers:
|
||||||
switch (utf8_next_char_peek_asserted(uformat)) {
|
switch (utf8_read_asserted(uformat)) {
|
||||||
case 'h':
|
case 'h':
|
||||||
do_number = true;
|
do_number = true;
|
||||||
number_size_bits = 16; // 16-bit (promoted to int in varargs)
|
number_size_bits = 16; // 16-bit (promoted to int in varargs)
|
||||||
utf8_next_char_asserted(&uformat);
|
utf8_next_asserted(&uformat);
|
||||||
break;
|
break;
|
||||||
case 'l':
|
case 'l':
|
||||||
do_number = true;
|
do_number = true;
|
||||||
number_size_bits = 64; // 64-bit
|
number_size_bits = 64; // 64-bit
|
||||||
utf8_next_char_asserted(&uformat);
|
utf8_next_asserted(&uformat);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (do_number && !strcontains("duxob", utf8_next_char_peek(uformat))) {
|
if (do_number && !strcontains("duxob", utf8_read(uformat))) {
|
||||||
PANIC("printf: expected number format specifier after %h or %l");
|
PANIC("printf: expected number format specifier after %h or %l");
|
||||||
}
|
}
|
||||||
|
|
||||||
c = utf8_next_char_asserted(&uformat);
|
c = utf8_next_asserted(&uformat);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'd':
|
case 'd':
|
||||||
do_number = true;
|
do_number = true;
|
||||||
|
|
@ -307,14 +307,14 @@ void printf(const char *format, ...) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 'X': {
|
case 'X': {
|
||||||
unicode_char size1 = utf8_next_char_asserted(&uformat);
|
unicode_char size1 = utf8_next_asserted(&uformat);
|
||||||
ASSERT(size1 != 0);
|
ASSERT(size1 != 0);
|
||||||
if (size1 == '8') {
|
if (size1 == '8') {
|
||||||
uint8_t val = (uint8_t)va_arg(args, int);
|
uint8_t val = (uint8_t)va_arg(args, int);
|
||||||
putu8x(val);
|
putu8x(val);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
unicode_char size2 = utf8_next_char_asserted(&uformat);
|
unicode_char size2 = utf8_next_asserted(&uformat);
|
||||||
ASSERT(size2 != 0);
|
ASSERT(size2 != 0);
|
||||||
if (size1 == '1' && size2 == '6') {
|
if (size1 == '1' && size2 == '6') {
|
||||||
uint16_t val = (uint16_t)va_arg(args, int);
|
uint16_t val = (uint16_t)va_arg(args, int);
|
||||||
|
|
@ -332,19 +332,6 @@ void printf(const char *format, ...) {
|
||||||
PANIC("printf: invalid X size");
|
PANIC("printf: invalid X size");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// unicode
|
|
||||||
// remember that you could use unicode in the format string directly,
|
|
||||||
// these are just alternative ways
|
|
||||||
// also remember that you can print custom unicode chars with %c
|
|
||||||
// TODO more than 16 bit
|
|
||||||
case 'U': {
|
|
||||||
unicode_char spec1 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
|
||||||
unicode_char spec2 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
|
||||||
unicode_char spec3 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
|
||||||
unicode_char spec4 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
|
|
||||||
putc((spec1 << 12) | (spec2 << 8) | (spec3 << 4) | spec4);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case '%': {
|
case '%': {
|
||||||
putc('%');
|
putc('%');
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
137
src/unicode.c
137
src/unicode.c
|
|
@ -1,94 +1,11 @@
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
#include "std.h"
|
#include "std.h"
|
||||||
|
|
||||||
bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output) {
|
// --- read utf-8 ---
|
||||||
for (unsigned int i = 0; i < input_len_ints; i++) {
|
|
||||||
uint32_t c = input[i];
|
|
||||||
if (c <= 0x7f) {
|
|
||||||
*output++ = (uint8_t)c;
|
|
||||||
} else if (c <= 0x7ff) {
|
|
||||||
*output++ = 0xc0 | (uint8_t)(c >> 6);
|
|
||||||
*output++ = 0x80 | (uint8_t)(c & 0x3f);
|
|
||||||
} else if (c <= 0xffff) {
|
|
||||||
*output++ = 0xe0 | (uint8_t)(c >> 12);
|
|
||||||
*output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
|
|
||||||
*output++ = 0x80 | (uint8_t)(c & 0x3f);
|
|
||||||
} else if (c <= 0x10ffff) {
|
|
||||||
*output++ = 0xf | (uint8_t)(c >> 18);
|
|
||||||
*output++ = 0x80 | (uint8_t)((c >> 12) & 0x3f);
|
|
||||||
*output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
|
|
||||||
*output++ = 0x80 | (uint8_t)(c & 0x3f);
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define ASSERT_OR_RETURN(cond) if (!(cond)) { return false; }
|
|
||||||
|
|
||||||
bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output) {
|
|
||||||
uint8_t b0, b1, b2, b3;
|
|
||||||
uint32_t codepoint;
|
|
||||||
while (input_len_bytes > 0) {
|
|
||||||
b0 = input[0];
|
|
||||||
if ((b0 & 0x80) == 0) {
|
|
||||||
*output++ = (uint32_t)b0;
|
|
||||||
input += 1;
|
|
||||||
input_len_bytes -= 1;
|
|
||||||
} else if ((b0& 0xe0) == 0xc0) {
|
|
||||||
ASSERT_OR_RETURN(input_len_bytes >= 2);
|
|
||||||
b1 = input[1];
|
|
||||||
ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
|
|
||||||
codepoint = ((uint32_t)(b0 & 0x1f) << 6)
|
|
||||||
| (b1 & 0x3f);
|
|
||||||
// avoid overlong encoding
|
|
||||||
ASSERT_OR_RETURN(codepoint > 0x7f);
|
|
||||||
*output++ = codepoint;
|
|
||||||
input += 2;
|
|
||||||
input_len_bytes -= 2;
|
|
||||||
} else if ((b0 & 0xf0) == 0xe0) {
|
|
||||||
ASSERT_OR_RETURN(input_len_bytes >= 3);
|
|
||||||
b1 = input[1];
|
|
||||||
b2 = input[2];
|
|
||||||
ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
|
|
||||||
ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
|
|
||||||
codepoint = ((uint32_t)(b0 & 0x0f) << 12)
|
|
||||||
| ((uint32_t)(b1 & 0x3f) << 6)
|
|
||||||
| (b2 & 0x3f);
|
|
||||||
// avoid overlong encoding
|
|
||||||
ASSERT_OR_RETURN(codepoint > 0x7ff);
|
|
||||||
*output++ = codepoint;
|
|
||||||
input += 3;
|
|
||||||
input_len_bytes -= 3;
|
|
||||||
} else if ((b0 & 0xf8) == 0xf0) {
|
|
||||||
ASSERT_OR_RETURN(input_len_bytes >= 4);
|
|
||||||
b1 = input[1];
|
|
||||||
b2 = input[2];
|
|
||||||
b3 = input[2];
|
|
||||||
ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
|
|
||||||
ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
|
|
||||||
ASSERT_OR_RETURN((b3 & 0xc0) == 0x80);
|
|
||||||
codepoint = ((uint32_t)(b0 & 0x07) << 18)
|
|
||||||
| ((uint32_t)(b1 & 0x3f) << 12)
|
|
||||||
| ((uint32_t)(b2 & 0x3f) << 6)
|
|
||||||
| (b3 & 0x3f);
|
|
||||||
// avoid overlong encoding
|
|
||||||
ASSERT_OR_RETURN(codepoint > 0xffff);
|
|
||||||
ASSERT_OR_RETURN(codepoint <= 0x10ffff);
|
|
||||||
*output++ = codepoint;
|
|
||||||
input += 4;
|
|
||||||
input_len_bytes -= 4;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }
|
#define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }
|
||||||
|
|
||||||
unicode_char utf8_next_char(const uint8_t **sp) {
|
unicode_char utf8_next(const uint8_t **sp) {
|
||||||
uint8_t b0, b1, b2, b3;
|
uint8_t b0, b1, b2, b3;
|
||||||
unicode_char codepoint;
|
unicode_char codepoint;
|
||||||
|
|
||||||
|
|
@ -136,25 +53,61 @@ unicode_char utf8_next_char(const uint8_t **sp) {
|
||||||
// avoid overlong encoding
|
// avoid overlong encoding
|
||||||
ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
|
ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
|
||||||
ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
|
ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
|
||||||
*sp += 3;
|
*sp += 4;
|
||||||
return codepoint;
|
return codepoint;
|
||||||
} else {
|
} else {
|
||||||
return UNICODE_ERROR;
|
return UNICODE_ERROR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unicode_char utf8_next_char_asserted(const uint8_t **sp) {
|
unicode_char utf8_next_asserted(const uint8_t **sp) {
|
||||||
unicode_char c = utf8_next_char(sp);
|
unicode_char c = utf8_next(sp);
|
||||||
ASSERT(c != UNICODE_ERROR);
|
ASSERT(c != UNICODE_ERROR);
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
unicode_char utf8_next_char_peek(const uint8_t *s) {
|
unicode_char utf8_read(const uint8_t *s) {
|
||||||
const uint8_t *ptr_copy = s;
|
const uint8_t *ptr_copy = s;
|
||||||
return utf8_next_char(&ptr_copy);
|
return utf8_next(&ptr_copy);
|
||||||
}
|
}
|
||||||
|
|
||||||
unicode_char utf8_next_char_peek_asserted(const uint8_t *s) {
|
unicode_char utf8_read_asserted(const uint8_t *s) {
|
||||||
const uint8_t *ptr_copy = s;
|
const uint8_t *ptr_copy = s;
|
||||||
return utf8_next_char_asserted(&ptr_copy);
|
return utf8_next_asserted(&ptr_copy);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- write utf-8 ---
|
||||||
|
|
||||||
|
unsigned int utf8_write(uint8_t *s, unicode_char c) {
|
||||||
|
if (c <= 0x7f) {
|
||||||
|
s[0] = (uint8_t) c;
|
||||||
|
return 1;
|
||||||
|
} else if (c <= 0x7ff) {
|
||||||
|
s[0] = 0xc0 | (c >> 6);
|
||||||
|
s[1] = 0x80 | (c & 0x3f);
|
||||||
|
return 2;
|
||||||
|
} else if (c <= 0xffff) {
|
||||||
|
s[0] = 0xe0 | (c >> 12);
|
||||||
|
s[1] = 0x80 | ((c >> 6) & 0x3f);
|
||||||
|
s[2] = 0x80 | (c & 0x3f);
|
||||||
|
return 3;
|
||||||
|
} else if (c <= 0x10ffff) {
|
||||||
|
s[0] = 0xf0 | (c >> 18);
|
||||||
|
s[1] = 0x80 | ((c >> 12) & 0x3f);
|
||||||
|
s[2] = 0x80 | ((c >> 6) & 0x3f);
|
||||||
|
s[3] = 0x80 | (c & 0x3f);
|
||||||
|
return 4;
|
||||||
|
} else {
|
||||||
|
PANIC("invalid unicode char");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void utf8_push(uint8_t **sp, unicode_char c) {
|
||||||
|
ASSERT(c != 0);
|
||||||
|
*sp += utf8_write(*sp, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
void utf8_push_terminated(uint8_t **sp, unicode_char c) {
|
||||||
|
utf8_push(sp, c);
|
||||||
|
**sp = 0;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue