document and improve unicode functions

remove %UXXXX unicode printing because it makes no sense to do it with %
2025-07-31 15:24:42 +02:00 · 2025-07-31 13:23:30 +02:00
3 changed files with 154 additions and 134 deletions
--- a/include/unicode.h
+++ b/include/unicode.h
@ -10,27 +10,107 @@ typedef uint32_t unicode_char;

 #define UNICODE_ERROR (~(unicode_char)0)

-// output buffer must be big enough (to be sure, `malloc(4*input_len_ints)`)
-bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output);
-// output buffer must be big enough (to be sure, `malloc(4*input_len_bytes)`)
-bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output);
+// --- read utf-8 ---

-/// Returns next unicode char in the given string and advances the pointer.
-/// Returns `0` if end of string is reached.
-/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
-unicode_char utf8_next_char(const uint8_t **sp);
-/// Returns next unicode char in the given string and advances the pointer.
-/// Returns `0` if end of string is reached.
-/// Panics if the string contains invalid unicode.
-unicode_char utf8_next_char_asserted(const uint8_t **sp);
+/// Returns the unicode char stored at the location pointed to by `s`.
+///
+/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
+unicode_char utf8_read(const uint8_t *s);

-/// Returns next unicode char in the given string.
+/// Like `utf8_read`, but panics on invalid unicode.
+unicode_char utf8_read_asserted(const uint8_t *s);
+
+/// Returns the next unicode char in the given string and advances the pointer.
+///
 /// Returns `0` if end of string is reached.
-/// Returns `UNICODE_ERROR` if the string contains invalid unicode.
-unicode_char utf8_next_char_peek(const uint8_t *s);
-/// Returns next unicode char in the given string and advances the pointer.
-/// Returns `0` if end of string is reached.
-/// Panics if the string contains invalid unicode.
-unicode_char utf8_next_char_peek_asserted(const uint8_t *s);
+/// Returns `UNICODE_ERROR` if invalid unicode is encountered.
+/// In both of these cases, the pointer position remains unchanged.
+///
+/// Using this function to move through a string `s`:
+/// ```
+/// unicode_char c;
+/// while (1) {
+///     c = utf8_next(&s);
+///     if (c == 0) {
+///         break;
+///     } else if (c == UNICODE_ERROR) {
+///         // handle error; leave the loop here, since the pointer was not advanced
+///     }
+///     // use character
+///     printf("%c\n", c);
+/// }
+/// ```
+unicode_char utf8_next(const uint8_t **sp);
+
+/// Like `utf8_next`, but panics on invalid unicode.
+///
+/// Using this function to move through a string `s`:
+/// ```
+/// unicode_char c;
+/// while ((c = utf8_next_asserted(&s)) != 0) {
+///     // use character
+///     printf("%c\n", c);
+/// }
+/// ```
+unicode_char utf8_next_asserted(const uint8_t **sp);
+
+// --- write utf-8 ---
+
+/// Writes a single unicode character in utf-8 to the location pointed to by `s`.
+///
+/// Panics if the character `c` is not valid unicode. Null character is allowed.
+///
+/// Caller is responsible for providing enough memory to store `c` (at most 4 bytes).
+/// Caller is responsible for terminating a string which is constructed using this function.
+///
+/// Returns the number of bytes written (1 to 4).
+///
+/// Using this function to manually construct a unicode string:
+/// ```
+/// uint8_t buf[100];
+/// uint8_t *s = buf;
+/// s += utf8_write(s, 0x3053);
+/// s += utf8_write(s, 0x3093);
+/// s += utf8_write(s, 0x306B);
+/// s += utf8_write(s, 0x3061);
+/// s += utf8_write(s, 0x306F);
+/// utf8_write(s, 0);
+/// ```
+unsigned int utf8_write(uint8_t *s, unicode_char c);
+
+/// Like `utf8_write`, but also advances the string pointer by number of bytes written.
+///
+/// Panics if the character `c` is the null byte.
+/// This is to force the difference between writing and terminating to be more explicit.
+/// Usually you also want the end-of-string pointer to point to the null byte, not past it.
+///
+/// Using this function to manually construct a unicode string:
+/// ```
+/// uint8_t buf[100];
+/// uint8_t *s = buf;
+/// utf8_push(&s, 0x3053);
+/// utf8_push(&s, 0x3093);
+/// utf8_push(&s, 0x306B);
+/// utf8_push(&s, 0x3061);
+/// utf8_push(&s, 0x306F);
+/// utf8_write(s, 0);
+/// ```
+void utf8_push(uint8_t **sp, unicode_char c);
+
+/// Like `utf8_push`, but also writes a null byte after the written character.
+///
+/// This is the safest and least-effort function when you want to construct a unicode string.
+/// After calling this function, `*sp` always points to the terminating null byte.
+///
+/// Using this function to manually construct a unicode string:
+/// ```
+/// uint8_t buf[100];
+/// uint8_t *s = buf;
+/// utf8_push_terminated(&s, 0x3053);
+/// utf8_push_terminated(&s, 0x3093);
+/// utf8_push_terminated(&s, 0x306B);
+/// utf8_push_terminated(&s, 0x3061);
+/// utf8_push_terminated(&s, 0x306F);
+/// ```
+void utf8_push_terminated(uint8_t **sp, unicode_char c);

 #endif
--- a/src/std.c
+++ b/src/std.c
@ -60,7 +60,7 @@ bool streq(const char *a, const char *b) {

 bool strcontains(const char *s, unicode_char c) {
    while (1) {
-        unicode_char cc = utf8_next_char_asserted((const uint8_t **)&s);
+        unicode_char cc = utf8_next_asserted((const uint8_t **)&s);
        if (cc == 0) {
            return false;
        } else if (cc == c) {
@ -111,7 +111,7 @@ void putln(void) {

 void puts(const char *s) {
    unicode_char c;
-    while ((c = utf8_next_char_asserted((const uint8_t **)&s)) != 0) {
+    while ((c = utf8_next_asserted((const uint8_t **)&s)) != 0) {
        putc(c);
    }
 }
@ -239,7 +239,7 @@ void printf(const char *format, ...) {
    const uint8_t *uformat = (const uint8_t *)format;

    unicode_char c;
-    while ((c = utf8_next_char_asserted(&uformat)) != 0) {
+    while ((c = utf8_next_asserted(&uformat)) != 0) {
        if (c == '%') {
            bool do_number = false;
            int number_size_bits = 32;
@ -247,25 +247,25 @@ void printf(const char *format, ...) {
            char base;
            bool sign;
            // Check for length modifiers:
-            switch (utf8_next_char_peek_asserted(uformat)) {
+            switch (utf8_read_asserted(uformat)) {
                case 'h':
                    do_number = true;
                    number_size_bits = 16; // 16-bit (promoted to int in varargs)
-                    utf8_next_char_asserted(&uformat);
+                    utf8_next_asserted(&uformat);
                    break;
                case 'l':
                    do_number = true;
                    number_size_bits = 64; // 64-bit
-                    utf8_next_char_asserted(&uformat);
+                    utf8_next_asserted(&uformat);
                    break;
                default:
                    break;
            }
-            if (do_number && !strcontains("duxob", utf8_next_char_peek(uformat))) {
+            if (do_number && !strcontains("duxob", utf8_read(uformat))) {
                PANIC("printf: expected number format specifier after %h or %l");
            }

-            c = utf8_next_char_asserted(&uformat);
+            c = utf8_next_asserted(&uformat);
            switch (c) {
                case 'd':
                    do_number = true;
@ -307,14 +307,14 @@ void printf(const char *format, ...) {
                    break;
                }
                case 'X': {
-                    unicode_char size1 = utf8_next_char_asserted(&uformat);
+                    unicode_char size1 = utf8_next_asserted(&uformat);
                    ASSERT(size1 != 0);
                    if (size1 == '8') {
                        uint8_t val = (uint8_t)va_arg(args, int);
                        putu8x(val);
                        break;
                    }
-                    unicode_char size2 = utf8_next_char_asserted(&uformat);
+                    unicode_char size2 = utf8_next_asserted(&uformat);
                    ASSERT(size2 != 0);
                    if (size1 == '1' && size2 == '6') {
                        uint16_t val = (uint16_t)va_arg(args, int);
@ -332,19 +332,6 @@ void printf(const char *format, ...) {
                        PANIC("printf: invalid X size");
                    }
                }
-                // unicode
-                // remember that you could use unicode in the format string directly,
-                // these are just alternative ways
-                // also remember that you can print custom unicode chars with %c
-                // TODO more than 16 bit
-                case 'U': {
-                    unicode_char spec1 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
-                    unicode_char spec2 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
-                    unicode_char spec3 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
-                    unicode_char spec4 = hexdigit_to_number(utf8_next_char_asserted(&uformat));
-                    putc((spec1 << 12) | (spec2 << 8) | (spec3 << 4) | spec4);
-                    break;
-                }
                case '%': {
                              putc('%');
                              break;
--- a/src/unicode.c
+++ b/src/unicode.c
@ -1,94 +1,11 @@
 #include "unicode.h"
 #include "std.h"

-bool utf32_to_utf8(uint32_t *input, unsigned int input_len_ints, uint8_t *output) {
-    for (unsigned int i = 0; i < input_len_ints; i++) {
-        uint32_t c = input[i];
-        if (c <= 0x7f) {
-            *output++ = (uint8_t)c;
-        } else if (c <= 0x7ff) {
-            *output++ = 0xc0 | (uint8_t)(c >> 6);
-            *output++ = 0x80 | (uint8_t)(c & 0x3f);
-        } else if (c <= 0xffff) {
-            *output++ = 0xe0 | (uint8_t)(c >> 12);
-            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
-            *output++ = 0x80 | (uint8_t)(c & 0x3f);
-        } else if (c <= 0x10ffff) {
-            *output++ = 0xf | (uint8_t)(c >> 18);
-            *output++ = 0x80 | (uint8_t)((c >> 12) & 0x3f);
-            *output++ = 0x80 | (uint8_t)((c >> 6) & 0x3f);
-            *output++ = 0x80 | (uint8_t)(c & 0x3f);
-        } else {
-            return false;
-        }
-    }
-    return true;
-}
-
-#define ASSERT_OR_RETURN(cond) if (!(cond)) { return false; }
-
-bool utf8_to_utf32(uint8_t *input, unsigned int input_len_bytes, uint32_t *output) {
-    uint8_t b0, b1, b2, b3;
-    uint32_t codepoint;
-    while (input_len_bytes > 0) {
-        b0 = input[0];
-        if ((b0 & 0x80) == 0) {
-            *output++ = (uint32_t)b0;
-            input += 1;
-            input_len_bytes -= 1;
-        } else if ((b0& 0xe0) == 0xc0) {
-            ASSERT_OR_RETURN(input_len_bytes >= 2);
-            b1 = input[1];
-            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
-            codepoint = ((uint32_t)(b0 & 0x1f) << 6)
-                | (b1 & 0x3f);
-            // avoid overlong encoding
-            ASSERT_OR_RETURN(codepoint > 0x7f);
-            *output++ = codepoint;
-            input += 2;
-            input_len_bytes -= 2;
-        } else if ((b0 & 0xf0) == 0xe0) {
-            ASSERT_OR_RETURN(input_len_bytes >= 3);
-            b1 = input[1];
-            b2 = input[2];
-            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
-            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
-            codepoint = ((uint32_t)(b0 & 0x0f) << 12)
-                | ((uint32_t)(b1 & 0x3f) << 6)
-                | (b2 & 0x3f);
-            // avoid overlong encoding
-            ASSERT_OR_RETURN(codepoint > 0x7ff);
-            *output++ = codepoint;
-            input += 3;
-            input_len_bytes -= 3;
-        } else if ((b0 & 0xf8) == 0xf0) {
-            ASSERT_OR_RETURN(input_len_bytes >= 4);
-            b1 = input[1];
-            b2 = input[2];
-            b3 = input[2];
-            ASSERT_OR_RETURN((b1 & 0xc0) == 0x80);
-            ASSERT_OR_RETURN((b2 & 0xc0) == 0x80);
-            ASSERT_OR_RETURN((b3 & 0xc0) == 0x80);
-            codepoint = ((uint32_t)(b0 & 0x07) << 18)
-                | ((uint32_t)(b1 & 0x3f) << 12)
-                | ((uint32_t)(b2 & 0x3f) << 6)
-                | (b3 & 0x3f);
-            // avoid overlong encoding
-            ASSERT_OR_RETURN(codepoint > 0xffff);
-            ASSERT_OR_RETURN(codepoint <= 0x10ffff);
-            *output++ = codepoint;
-            input += 4;
-            input_len_bytes -= 4;
-        } else {
-            return false;
-        }
-    }
-    return true;
-}
+// --- read utf-8 ---

 #define ASSERT_OR_RETURN_ERR(cond) if (!(cond)) { return UNICODE_ERROR; }

-unicode_char utf8_next_char(const uint8_t **sp) {
+unicode_char utf8_next(const uint8_t **sp) {
    uint8_t b0, b1, b2, b3;
    unicode_char codepoint;

@ -136,25 +53,61 @@ unicode_char utf8_next_char(const uint8_t **sp) {
        // avoid overlong encoding
        ASSERT_OR_RETURN_ERR(codepoint > 0xffff);
        ASSERT_OR_RETURN_ERR(codepoint <= 0x10ffff);
-        *sp += 3;
+        *sp += 4;
        return codepoint;
    } else {
        return UNICODE_ERROR;
    }
 }

-unicode_char utf8_next_char_asserted(const uint8_t **sp) {
-    unicode_char c = utf8_next_char(sp);
+unicode_char utf8_next_asserted(const uint8_t **sp) { // like utf8_next, but asserts that the result is not UNICODE_ERROR
+    unicode_char c = utf8_next(sp); // decoding and pointer handling are delegated to utf8_next
    ASSERT(c != UNICODE_ERROR);
    return c;
 }

-unicode_char utf8_next_char_peek(const uint8_t *s) {
+unicode_char utf8_read(const uint8_t *s) { // returns what utf8_next yields at s, without moving the caller's pointer
    const uint8_t *ptr_copy = s;
-    return utf8_next_char(&ptr_copy);
+    return utf8_next(&ptr_copy); // only the local copy can be advanced; s itself is passed by value
 }

-unicode_char utf8_next_char_peek_asserted(const uint8_t *s) {
+unicode_char utf8_read_asserted(const uint8_t *s) { // like utf8_read, but goes through utf8_next_asserted
    const uint8_t *ptr_copy = s;
-    return utf8_next_char_asserted(&ptr_copy);
+    return utf8_next_asserted(&ptr_copy); // only the local copy can be advanced; the assertion lives in utf8_next_asserted
+}
+
+// --- write utf-8 ---
+
+unsigned int utf8_write(uint8_t *s, unicode_char c) { // encodes c as utf-8 into s[0..]; returns the number of bytes stored (1 to 4)
+    if (c <= 0x7f) { // 1 byte: 0xxxxxxx (includes c == 0)
+        s[0] = (uint8_t) c;
+        return 1;
+    } else if (c <= 0x7ff) { // 2 bytes: 110xxxxx 10xxxxxx
+        s[0] = 0xc0 | (c >> 6);
+        s[1] = 0x80 | (c & 0x3f);
+        return 2;
+    } else if (c <= 0xffff) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx; NOTE(review): surrogates U+D800..U+DFFF fall in this range and are encoded without a panic - confirm intended
+        s[0] = 0xe0 | (c >> 12);
+        s[1] = 0x80 | ((c >> 6) & 0x3f);
+        s[2] = 0x80 | (c & 0x3f);
+        return 3;
+    } else if (c <= 0x10ffff) { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        s[0] = 0xf0 | (c >> 18);
+        s[1] = 0x80 | ((c >> 12) & 0x3f);
+        s[2] = 0x80 | ((c >> 6) & 0x3f);
+        s[3] = 0x80 | (c & 0x3f);
+        return 4;
+    } else { // c > 0x10ffff
+        PANIC("invalid unicode char"); // NOTE(review): no return follows; assumes PANIC never returns - confirm
+    }
+}
+
+void utf8_push(uint8_t **sp, unicode_char c) { // writes c at *sp via utf8_write and moves *sp past the written bytes
+    ASSERT(c != 0); // the null character is rejected here; terminate explicitly instead
+    *sp += utf8_write(*sp, c); // utf8_write returns the byte count, which is the advance distance
+}
+
+void utf8_push_terminated(uint8_t **sp, unicode_char c) { // pushes c, then stores a null byte right after it
+    utf8_push(sp, c); // asserts c != 0 and advances *sp past the written character
+    **sp = 0; // *sp is not advanced here, so it stays on the terminating null byte
 }
Author	SHA1	Message	Date
uosfz	541292b002	document and improve unicode functions	2025-07-31 15:24:42 +02:00
uosfz	bf1e2bdf38	remove %UXXXX unicode printing because it makes no sense to do it with %	2025-07-31 13:23:30 +02:00