document and improve unicode functions

remove %UXXXX unicode printing because it makes no sense to do it with %
2025-07-31 15:24:42 +02:00 · 2025-07-31 13:23:30 +02:00
3 changed files with 154 additions and 134 deletions
--- a/include/unicode.h
+++ b/include/unicode.h
@ -10,27 +10,107 @@ typedef uint32_t unicode_char;
 #define UNICODE_ERROR (~(unicode_char)0)
-// output buffer must be big enough (to be sure, `malloc(4*input_len_ints)`)
+// --- read utf-8 ---
 bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output);
 // output buffer must be big enough (to be sure, `malloc(4*input_len_bytes)`)
 bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output);
-/// Returns next unicode char in the given string and advances the pointer.
+/// Returns the unicode char stored at the location pointed to by `s`.
-/// Returns `0` if end of string is reached.
+///
-/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
+/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
-unicode_char utf8_next_char(const uint8_t **sp);
+unicode_char utf8_read(const uint8_t *s);
 /// Returns next unicode char in the given string and advances the pointer.
 /// Returns `0` if end of string is reached.
 /// Panics if the string contains invalid unicode.
 unicode_char utf8_next_char_asserted(const uint8_t **sp);
-/// Returns next unicode char in the given string.
+/// Like `utf8_read`, but panics on invalid unicode.
 unicode_char utf8_read_asserted(const uint8_t *s);
 /// Returns the next unicode char in the given string and advances the pointer.
 ///
 /// Returns `0` if end of string is reached.
-/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
+/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
-unicode_char utf8_next_char_peek(const uint8_t *s);
+/// In both of these cases, the pointer position remains unchanged.
-/// Returns next unicode char in the given string and advances the pointer.
+///
-/// Returns `0` if end of string is reached.
+/// Using this function to move through a string `s`:
-/// Panics if the string contains invalid unicode.
+/// ```
-unicode_char utf8_next_char_peek_asserted(const uint8_t *s);
+/// unicode_char c;
 /// while (1) {
 ///     unicode_char c = utf8_next(&s);
 ///     if (c == 0) {
 ///         break;
 ///     } else if (c == UNICODE_ERROR) {
 ///         // handle error
 ///     }
 ///     // use character
 ///     printf("%c\n", c);
 /// }
 /// ```
 unicode_char utf8_next(const uint8_t **sp);
 /// Like `utf8_next`, but panics on invalid unicode.
 ///
 /// Using this function to move through a string `s`:
 /// ```
 /// unicode_char c;
 /// while ((c = utf8_next_asserted(&s)) != 0) {
 ///     // use character
 ///     printf("%c\n", c);
 /// }
 /// ```
 unicode_char utf8_next_asserted(const uint8_t **sp);
 // --- write utf-8 ---
 /// Writes a single unicode character in utf-8 to the location pointed to by `s`.
 ///
 /// Panics if the character `c` is not valid unicode. Null character is allowed.
 ///
 /// Caller is responsible for providing enough memory to store `c` (at most 4 bytes).
 /// Caller is responsible for terminating a string which is constructed using this function.
 ///
 /// Returns the number of bytes written (1 to 4).
 ///
 /// Using this function to manually construct a unicode string:
 /// ```
 /// uint8_t buf[100];
 /// uint8_t *s = buf;
 /// s += utf8_write(s, 0x3053);
 /// s += utf8_write(s, 0x3093);
 /// s += utf8_write(s, 0x306B);
 /// s += utf8_write(s, 0x3061);
 /// s += utf8_write(s, 0x306F);
 /// utf8_write(s, 0);
 /// ```
 unsigned int utf8_write(uint8_t *s, unicode_char c);
 /// Like `utf8_write`, but also advances the string pointer by the number of bytes written.
 ///
 /// Panics if the character `c` is the null byte.
 /// This is to force the difference between writing and terminating to be more explicit.
 /// Usually you also want the end-of-string pointer to point to the null byte, not past it.
 ///
 /// Using this function to manually construct a unicode string:
 /// ```
 /// uint8_t buf[100];
 /// uint8_t *s = buf;
 /// utf8_push(&s, 0x3053);
 /// utf8_push(&s, 0x3093);
 /// utf8_push(&s, 0x306B);
 /// utf8_push(&s, 0x3061);
 /// utf8_push(&s, 0x306F);
 /// utf8_write(s, 0);
 /// ```
 void utf8_push(uint8_t **sp, unicode_char c);
 /// Like `utf8_push`, but also writes a null byte after the written character.
 ///
 /// This is the safest and least-effort function when you want to construct a unicode string.
 /// After calling this function, `*sp` always points to the terminating null byte.
 ///
 /// Using this function to manually construct a unicode string:
 /// ```
 /// uint8_t buf[100];
 /// uint8_t *s = buf;
 /// utf8_push_terminated(&s, 0x3053);
 /// utf8_push_terminated(&s, 0x3093);
 /// utf8_push_terminated(&s, 0x306B);
 /// utf8_push_terminated(&s, 0x3061);
 /// utf8_push_terminated(&s, 0x306F);
 /// ```
 void utf8_push_terminated(uint8_t **sp, unicode_char c);
 #endif
--- a/src/std.c
+++ b/src/std.c
@ -60,7 +60,7 @@ bool streq(const char *a, const char *b) {
 bool strcontains(const char *s, unicode_char c) {
    while (1) {
-        unicode_char cc = utf8_next_char_asserted((const uint8_t **)&s);
+        unicode_char cc = utf8_next_asserted((const uint8_t **)&s);
        if (cc == 0) {
            return false;
        } else if (cc == c) {
@ -111,7 +111,7 @@ void putln(void) {
 void puts(const char *s) {
    unicode_char c;
-    while ((c = utf8_next_char_asserted((const uint8_t **)&s)) != 0) {
+    while ((c = utf8_next_asserted((const uint8_t **)&s)) != 0) {
        putc(c);
    }
 }
@ -239,7 +239,7 @@ void printf(const char *format, ...) {
    const uint8_t *uformat = (const uint8_t *)format;
    unicode_char c;
-    while ((c = utf8_next_char_asserted(&uformat)) != 0) {
+    while ((c = utf8_next_asserted(&uformat)) != 0) {
        if (c == '%') {
            bool do_number = false;
            int number_size_bits = 32;
@ -247,25 +247,25 @@ void printf(const char *format, ...) {
            char base;
            bool sign;
            // Check for length modifiers:
-            switch (utf8_next_char_peek_asserted(uformat)) {
+            switch (utf8_read_asserted(uformat)) {
                case 'h':
                    do_number = true;
                    number_size_bits = 16; // 16-bit (promoted to int in varargs)
-                    utf8_next_char_asserted(&uformat);
+                    utf8_next_asserted(&uformat);
                    break;
                case 'l':
                    do_number = true;
                    number_size_bits = 64; // 64-bit
-                    utf8_next_char_asserted(&uformat);
+                    utf8_next_asserted(&uformat);
                    break;
                default:
                    break;
            }
-            if (do_number && !strcontains("duxob", utf8_next_char_peek(uformat))) {
+            if (do_number && !strcontains("duxob", utf8_read(uformat))) {
                PANIC("printf: expected number format specifier after %h or %l");
            }
-            c = utf8_next_char_asserted(&uformat);
+            c = utf8_next_asserted(&uformat);
            switch (c) {
                case 'd':
                    do_number = true;
@ -307,14 +307,14 @@ void printf(const char *format, ...) {
                    break;
                }
                case 'X': {
-                    unicode_char size1 = utf8_next_char_asserted(&uformat);
+                    unicode_char size1 = utf8_next_asserted(&uformat);
                    ASSERT(size1 != 0);
                    if (size1 == '8') {
                        uint8_t val = (uint8_t)va_arg(args, int);
                        putu8x(val);
                        break;
                    }
-                    unicode_char size2 = utf8_next_char_asserted(&uformat);
+                    unicode_char size2 = utf8_next_asserted(&uformat);
                    ASSERT(size2 != 0);
                    if (size1 == '1' && size2 == '6') {
                        uint16_t val = (uint16_t)va_arg(args, int);
@ -332,19 +332,6 @@ void printf(const char *format, ...) {
                        PANIC("printf: invalid X size");
                    }
                }
                // unicode
                // remember that you could use unicode in the format string directly,
                // these are just alternative ways
                // also remember that you can print custom unicode chars with %c
                // TODO more than 16 bit
                case 'U': {
                    unicode_char spec1 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
                    unicode_char spec2 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
                    unicode_char spec3 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
                    unicode_char spec4 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
                    putc((spec1 << 12) | (spec2 << 8) | (spec3 << 4) | spec4);
                    break;
                }
                case '%': {
                              putc('%');
                              break;
--- a/src/unicode.c
+++ b/src/unicode.c
@ -1,94 +1,11 @@
 #include "unicode.h"
 #include "std.h"
-bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output) {
+// --- read utf-8 ---
    for (unsigned int i = 0; i < input_len_ints; i++) {
        uint32_t c = input[i];
        if (c <= 0x7f) {
            *output++ = (uint8_t)c;
        } else if (c <= 0x7ff) {
            *output++ = 0xc0 | (uint8_t)(c >> 6);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else if (c <= 0xffff) {
            *output++ = 0xe0 | (uint8_t)(c >> 12);
            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else if (c <= 0x10ffff) {
            *output++ = 0xf | (uint8_t)(c >> 18);
            *output++ = 0x80 | (uint8_t)((c >> 12) & 0x3f);
            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
            *output++ = 0x80 | (uint8_t)(c & 0x3f);
        } else {
            return false;
        }
    }
    return true;
 }
 // Makes the enclosing function return `false` when `cond` does not hold.
 // Wrapped in do/while(0) so it behaves as a single statement.
 #define ASSERT_OR_RETURN(cond) do { if (!(cond)) { return false; } } while (0)
 // Decodes `input_len_bytes` bytes of utf-8 from `input` into utf-32 code
 // points stored consecutively in `output`.
 //
 // `output` must have room for one `uint32_t` per decoded character; one
 // element per input byte is always enough.
 //
 // Returns `true` if the whole input was decoded. Returns `false` on the
 // first invalid sequence: bad lead byte, truncated sequence, bad
 // continuation byte, overlong encoding, or a value above 0x10ffff.
 // Characters decoded before the error have already been written to `output`.
 bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output) {
     uint8_t b0, b1, b2, b3;
     uint32_t codepoint;
     while (input_len_bytes > 0) {
         b0 = input[0];
         if ((b0 & 0x80) == 0) {
             // 1 byte: 0xxxxxxx
             *output++ = (uint32_t)b0;
             input += 1;
             input_len_bytes -= 1;
         } else if ((b0 & 0xe0) == 0xc0) {
             // 2 bytes: 110xxxxx 10xxxxxx
             ASSERT_OR_RETURN(input_len_bytes >= 2);
             b1 = input[1];
             ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
             codepoint = ((uint32_t)(b0 & 0x1f) << 6)
                 | (b1 & 0x3f);
             // avoid overlong encoding
             ASSERT_OR_RETURN(codepoint > 0x7f);
             *output++ = codepoint;
             input += 2;
             input_len_bytes -= 2;
         } else if ((b0 & 0xf0) == 0xe0) {
             // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
             ASSERT_OR_RETURN(input_len_bytes >= 3);
             b1 = input[1];
             b2 = input[2];
             ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
             ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
             codepoint = ((uint32_t)(b0 & 0x0f) << 12)
                 | ((uint32_t)(b1 & 0x3f) << 6)
                 | (b2 & 0x3f);
             // avoid overlong encoding
             ASSERT_OR_RETURN(codepoint > 0x7ff);
             *output++ = codepoint;
             input += 3;
             input_len_bytes -= 3;
         } else if ((b0 & 0xf8) == 0xf0) {
             // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             ASSERT_OR_RETURN(input_len_bytes >= 4);
             b1 = input[1];
             b2 = input[2];
             // the fourth byte lives at index 3 (index 2 is the third byte)
             b3 = input[3];
             ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
             ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
             ASSERT_OR_RETURN((b3 & 0xc0) == 0x80);
             codepoint = ((uint32_t)(b0 & 0x07) << 18)
                 | ((uint32_t)(b1 & 0x3f) << 12)
                 | ((uint32_t)(b2 & 0x3f) << 6)
                 | (b3 & 0x3f);
             // avoid overlong encoding
             ASSERT_OR_RETURN(codepoint > 0xffff);
             ASSERT_OR_RETURN(codepoint <= 0x10ffff);
             *output++ = codepoint;
             input += 4;
             input_len_bytes -= 4;
         } else {
             // stray continuation byte or lead byte of a 5+ byte sequence
             return false;
         }
     }
     return true;
 }
 #define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }
-unicode_char utf8_next_char(const uint8_t **sp) {
+unicode_char utf8_next(const uint8_t **sp) {
    uint8_t b0, b1, b2, b3;
    unicode_char codepoint;
@ -136,25 +53,61 @@ unicode_char utf8_next_char(const uint8_t **sp) {
        // avoid overlong encoding
        ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
        ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
-        *sp += 3;
+        *sp += 4;
        return codepoint;
    } else {
        return UNICODE_ERROR;
    }
 }
-unicode_char utf8_next_char_asserted(const uint8_t **sp) {
+unicode_char utf8_next_asserted(const uint8_t **sp) {
-    unicode_char c = utf8_next_char(sp);
+    unicode_char c = utf8_next(sp);
    ASSERT(c != UNICODE_ERROR);
    return c;
 }
-unicode_char utf8_next_char_peek(const uint8_t *s) {
+unicode_char utf8_read(const uint8_t *s) {
    const uint8_t *ptr_copy = s;
-    return utf8_next_char(&ptr_copy);
+    return utf8_next(&ptr_copy);
 }
-unicode_char utf8_next_char_peek_asserted(const uint8_t *s) {
+unicode_char utf8_read_asserted(const uint8_t *s) {
    const uint8_t *ptr_copy = s;
-    return utf8_next_char_asserted(&ptr_copy);
+    return utf8_next_asserted(&ptr_copy);
 }
 // --- write utf-8 ---
 // Encodes the code point `c` as utf-8 starting at `s` and returns how many
 // bytes were stored (1 to 4). No terminator is written.
 // Panics when `c` is greater than 0x10ffff.
 unsigned int utf8_write(uint8_t *s, unicode_char c) {
     if (c < 0x80) {
         // 1 byte: 0xxxxxxx
         *s = (uint8_t)c;
         return 1;
     }
     if (c < 0x800) {
         // 2 bytes: 110xxxxx 10xxxxxx
         s[0] = (uint8_t)(0xc0 | (c >> 6));
         s[1] = (uint8_t)(0x80 | (c & 0x3f));
         return 2;
     }
     if (c < 0x10000) {
         // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
         s[0] = (uint8_t)(0xe0 | (c >> 12));
         s[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
         s[2] = (uint8_t)(0x80 | (c & 0x3f));
         return 3;
     }
     if (c < 0x110000) {
         // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         s[0] = (uint8_t)(0xf0 | (c >> 18));
         s[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3f));
         s[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
         s[3] = (uint8_t)(0x80 | (c & 0x3f));
         return 4;
     }
     PANIC("invalid unicode char");
 }
 // Writes the non-null character `c` at `*sp` via `utf8_write` and moves `*sp`
 // forward by the number of bytes written. Asserts that `c` is not 0.
 void utf8_push(uint8_t **sp, unicode_char c) {
     ASSERT(c != 0);
     uint8_t *dst = *sp;
     *sp = dst + utf8_write(dst, c);
 }
 // Pushes `c` with `utf8_push`, then stores a null byte at the new `*sp`
 // without advancing past it.
 void utf8_push_terminated(uint8_t **sp, unicode_char c) {
     utf8_push(sp, c);
     uint8_t *end = *sp;
     *end = 0;
 }
Author	SHA1	Message	Date
uosfz	541292b002	document and improve unicode functions	2025-07-31 15:24:42 +02:00
uosfz	bf1e2bdf38	remove %UXXXX unicode printing because it makes no sense to do it with %	2025-07-31 13:23:30 +02:00