parser: handle graphemes

I'm not positive this is the final approach. We fill in the `text` field if there was multi-codepoint text generated from the key_press. Signed-off-by: Tim Culverhouse <tim@timculverhouse.com>
2024-01-21 17:54:44 -06:00 · 2024-01-21 17:54:44 -06:00 · 40525eb038
commit 40525eb038
parent 0f12881c17
6 changed files with 137 additions and 18 deletions
--- a/src/GraphemeCache.zig
+++ b/src/GraphemeCache.zig
@ -0,0 +1,78 @@
+const std = @import("std");
+const testing = std.testing;
+
+const GraphemeCache = @This();
+
+/// The underlying byte storage for graphemes. `put` appends the bytes of each
+/// new grapheme back to back; the contents are undefined until written.
+buf: [1024 * 4]u8 = undefined,
+
+// Offset into `buf` at which the next grapheme's bytes will be written.
+idx: usize = 0,
+
+/// The table of cached graphemes. It holds up to 1024 entries, and `buf`
+/// provides 4096 bytes in total, i.e. an average of 4 bytes per grapheme.
+/// Entries at or beyond `g_idx` are undefined.
+grapheme_buf: [1024]Grapheme = undefined,
+
+// Index of the next free slot in `grapheme_buf`; also the number of graphemes
+// stored so far.
+g_idx: u21 = 0,
+
+/// 0x110000, one past the largest valid Unicode codepoint (0x10FFFF). Cached
+/// graphemes are assigned codepoints above this value.
+pub const UNICODE_MAX = 1_114_112;
+
+/// A cached grapheme: the synthetic codepoint handed out for it and the
+/// location of its bytes inside `buf`.
+const Grapheme = struct {
+    // the codepoint assigned by `put` (slot index + UNICODE_MAX + 1); it is
+    // not itself an index into the storage
+    codepoint: u21,
+    // `buf[start..end]` holds the grapheme's bytes
+    start: usize,
+    end: usize,
+};
+
+/// Store `bytes` in the cache as a grapheme and return the codepoint that
+/// identifies it. Bytes that were stored before are not stored again; the
+/// codepoint assigned the first time is returned instead.
+///
+/// Returns error.OutOfGraphemeBufferMemory when `buf` has no room left for
+/// `bytes`, and error.OutOfGraphemeMemory when `grapheme_buf` is full.
+pub fn put(self: *GraphemeCache, bytes: []const u8) !u21 {
+    // See if we already have these bytes. It's a likely case that if we get one
+    // grapheme, we'll get it again, so the linear scan saves a lot of storage.
+    // Only the first `g_idx` entries have been written: the rest of
+    // `grapheme_buf` is undefined and must never be read.
+    for (self.grapheme_buf[0..self.g_idx]) |grapheme| {
+        const g_bytes = self.buf[grapheme.start..grapheme.end];
+        if (std.mem.eql(u8, g_bytes, bytes)) {
+            return grapheme.codepoint;
+        }
+    }
+    if (self.idx + bytes.len > self.buf.len) return error.OutOfGraphemeBufferMemory;
+    if (self.g_idx + 1 > self.grapheme_buf.len) return error.OutOfGraphemeMemory;
+
+    // copy the grapheme to our storage
+    @memcpy(self.buf[self.idx .. self.idx + bytes.len], bytes);
+
+    const g = Grapheme{
+        // assign a codepoint that is always outside of valid unicode
+        .codepoint = self.g_idx + UNICODE_MAX + 1,
+        .start = self.idx,
+        .end = self.idx + bytes.len,
+    };
+    // record the entry, then advance both cursors past it
+    self.grapheme_buf[self.g_idx] = g;
+    self.g_idx += 1;
+    self.idx += bytes.len;
+
+    return g.codepoint;
+}
+
+/// Get the bytes of the grapheme identified by `cp`, a codepoint previously
+/// returned by `put`. The returned slice points into the cache's own storage.
+///
+/// Returns error.InvalidGraphemeIndex when `cp` does not refer to a stored
+/// grapheme.
+pub fn get(self: *GraphemeCache, cp: u21) ![]const u8 {
+    if (cp < (UNICODE_MAX + 1)) return error.InvalidGraphemeIndex;
+    const idx: usize = cp - UNICODE_MAX - 1;
+    // Slots at or beyond `g_idx` have not been written yet, so `idx == g_idx`
+    // must be rejected as well.
+    if (idx >= self.g_idx) return error.InvalidGraphemeIndex;
+    const g = self.grapheme_buf[idx];
+    return self.buf[g.start..g.end];
+}
+
+test "GraphemeCache: roundtrip" {
+    var cache: GraphemeCache = .{};
+
+    // bytes stored with put come back unchanged from get
+    const first = try cache.put("abc");
+    const stored = try cache.get(first);
+    try testing.expectEqualStrings("abc", stored);
+
+    // storing the same bytes again yields the same codepoint
+    const repeated = try cache.put("abc");
+    try testing.expectEqual(first, repeated);
+
+    // different bytes are assigned the next codepoint
+    const second = try cache.put("def");
+    try testing.expectEqual(first + 1, second);
+}
--- a/src/Key.zig
+++ b/src/Key.zig
@ -11,8 +11,7 @@ pub const Modifiers = packed struct(u8) {
    num_lock: bool = false,
 };

-/// the unicode codepoint of the key event. This can be greater than the maximum
-/// allowable unicode codepoint for special keys
+/// the unicode codepoint of the key event.
 codepoint: u21,

 /// the text generated from the key event, if any
--- a/src/Tty.zig
+++ b/src/Tty.zig
@ -143,7 +143,12 @@ pub fn run(
            switch (event) {
                .key_press => |key| {
                    if (@hasField(EventType, "key_press")) {
-                        vx.postEvent(.{ .key_press = key });
+                        // HACK: yuck. there has to be a better way
+                        var mut_key = key;
+                        if (key.text) |text| {
+                            mut_key.codepoint = try vx.g_cache.put(text);
+                        }
+                        vx.postEvent(.{ .key_press = mut_key });
                    }
                },
                .focus_in => {
--- a/src/main.zig
+++ b/src/main.zig
@ -13,6 +13,7 @@ pub fn init(comptime EventType: type, opts: Options) !Vaxis(EventType) {
 }

 test {
+    _ = @import("GraphemeCache.zig");
    _ = @import("Key.zig");
    _ = @import("Options.zig");
    _ = @import("Screen.zig");
--- a/src/parser.zig
+++ b/src/parser.zig
@ -4,6 +4,7 @@ const Event = @import("event.zig").Event;
 const Key = @import("Key.zig");
 const CodePointIterator = @import("ziglyph").CodePointIterator;
 const graphemeBreak = @import("ziglyph").graphemeBreak;
+const UNICODE_MAX = @import("GraphemeCache.zig").UNICODE_MAX;

 const log = std.log.scoped(.parser);

@ -82,23 +83,31 @@ pub fn parse(input: []const u8) !Result {
                    // 0x20...0x7E => .{ .codepoint = b },
                    0x7F => .{ .codepoint = Key.backspace },
                    else => blk: {
-                        // TODO: iterate codepoints to find a complete grapheme.
-                        // For now we are just taking the first codepoint and
-                        // throwing a warning. I think we'll end up mapping a
-                        // u21 to a look-aside table of graphemes, I just need
-                        // to implement that table somewhere and give access to
-                        // it here.
                        var iter: CodePointIterator = .{ .bytes = input[i..] };
                        // return null if we don't have a valid codepoint
-                        const cp = iter.next() orelse return .{ .event = null, .n = 0 };
-                        if (iter.next()) |next_cp| {
-                            var break_state: u3 = 0;
-                            if (!graphemeBreak(cp.code, next_cp.code, &break_state)) {
-                                log.warn("grapheme support not implemented yet", .{});
+                        var cp = iter.next() orelse return .{ .event = null, .n = 0 };
+
+                        var code = cp.code;
+                        const g_start = i;
+                        i += cp.len - 1; // subtract one for the loop iter
+                        var g_state: u3 = 0;
+                        while (iter.next()) |next_cp| {
+                            if (graphemeBreak(cp.code, next_cp.code, &g_state)) {
+                                break;
                            }
+                            code = UNICODE_MAX + 1;
+                            i += next_cp.len;
+                            cp = next_cp;
                        }
-                        i += cp.len - 1;
-                        break :blk .{ .codepoint = cp.code };
+                        const text: ?[]const u8 = multi: {
+                            if (code > UNICODE_MAX) {
+                                break :multi input[g_start .. i + 1];
+                            } else {
+                                break :multi null;
+                            }
+                        };
+
+                        break :blk .{ .codepoint = code, .text = text };
                    },
                };
                return .{
@ -562,10 +571,27 @@ test "parse: multiple codepoint grapheme" {
    const input = "👩‍🚀";
    const result = try parse(input);
    const expected_key: Key = .{
-        .codepoint = 0x1F469,
+        .codepoint = UNICODE_MAX + 1,
+        .text = input,
    };
    const expected_event: Event = .{ .key_press = expected_key };

-    try testing.expectEqual(4, result.n);
+    try testing.expectEqual(input.len, result.n);
    try testing.expectEqual(expected_event, result.event);
 }
+
+test "parse: multiple codepoint grapheme with more after" {
+    // The input is one multi-codepoint grapheme followed by plain ASCII. A
+    // single parse call should report only the grapheme: `n` equals its byte
+    // length and `text` holds exactly its bytes.
+    const grapheme: []const u8 = "👩‍🚀";
+    const input = "👩‍🚀abc";
+    const expected_codepoint: u21 = UNICODE_MAX + 1;
+
+    const result = try parse(input);
+
+    try testing.expectEqual(grapheme.len, result.n);
+    const key = result.event.?.key_press;
+    try testing.expectEqualStrings(grapheme, key.text.?);
+    try testing.expectEqual(expected_codepoint, key.codepoint);
+}
--- a/src/vaxis.zig
+++ b/src/vaxis.zig
@ -9,6 +9,7 @@ const Screen = @import("Screen.zig");
 const Window = @import("Window.zig");
 const Options = @import("Options.zig");
 const Style = @import("cell.zig").Style;
+const GraphemeCache = @import("GraphemeCache.zig");

 /// Vaxis is the entrypoint for a Vaxis application. The provided type T should
 /// be a tagged union which contains all of the events the application will
@ -46,6 +47,9 @@ pub fn Vaxis(comptime T: type) type {
        renders: usize = 0,
        render_dur: i128 = 0,

+        // grapheme cache
+        g_cache: GraphemeCache = .{},
+
        /// Initialize Vaxis with runtime options
        pub fn init(_: Options) !Self {
            return Self{
@ -78,6 +82,12 @@ pub fn Vaxis(comptime T: type) type {
                const tpr = @divTrunc(self.render_dur, self.renders);
                log.info("total renders = {d}", .{self.renders});
                log.info("microseconds per render = {d}", .{tpr});
+                log.info("cached graphemes n = {d} / {d}, bytes = {d} / {d}", .{
+                    self.g_cache.g_idx,
+                    self.g_cache.grapheme_buf.len,
+                    self.g_cache.idx,
+                    self.g_cache.buf.len,
+                });
            }
        }