From 40525eb0387021a7eef50b736d43325fffef5228 Mon Sep 17 00:00:00 2001
From: Tim Culverhouse
Date: Sun, 21 Jan 2024 17:54:44 -0600
Subject: [PATCH] parser: handle graphemes

I'm not positive this is the final approach. We fill in the `text` field
if there was multi-codepoint text generated from the key_press.

Signed-off-by: Tim Culverhouse
---
 src/GraphemeCache.zig | 91 +++++++++++++++++++++++++++++++++++++++++++
 src/Key.zig           |  3 +-
 src/Tty.zig           |  7 +++-
 src/main.zig          |  1 +
 src/parser.zig        | 56 ++++++++++++++++++++++---------
 src/vaxis.zig         | 10 ++++++
 6 files changed, 150 insertions(+), 18 deletions(-)
 create mode 100644 src/GraphemeCache.zig

diff --git a/src/GraphemeCache.zig b/src/GraphemeCache.zig
new file mode 100644
index 0000000..779646e
--- /dev/null
+++ b/src/GraphemeCache.zig
@@ -0,0 +1,91 @@
+const std = @import("std");
+const testing = std.testing;
+
+const GraphemeCache = @This();
+
+/// the underlying storage for graphemes
+buf: [1024 * 4]u8 = undefined,
+
+// the start index of the next grapheme
+idx: usize = 0,
+
+/// the cache of graphemes. This allows up to 1024 graphemes; they share the
+/// 4096 bytes of `buf`, so 4 bytes each on average
+grapheme_buf: [1024]Grapheme = undefined,
+
+// index of our next grapheme. Only grapheme_buf[0..g_idx] has been written
+g_idx: u21 = 0,
+
+pub const UNICODE_MAX = 1_114_112;
+
+const Grapheme = struct {
+    // codepoint is an index into the internal storage
+    codepoint: u21,
+    start: usize,
+    end: usize,
+};
+
+/// put a slice of bytes in the cache as a grapheme. Returns the codepoint
+/// assigned to the bytes, which is always greater than UNICODE_MAX. Putting
+/// bytes that are already cached returns their existing codepoint. Returns
+/// error.OutOfGraphemeBufferMemory when the bytes don't fit in `buf` and
+/// error.OutOfGraphemeMemory when `grapheme_buf` has no free slot
+pub fn put(self: *GraphemeCache, bytes: []const u8) !u21 {
+    // See if we already have these bytes. It's a likely case that if we get one
+    // grapheme, we'll get it again. So this will save a lot of storage and is
+    // most likely worth the cost as it's pretty rare
+    //
+    // Only scan the slots we have written: everything from g_idx onward is
+    // still undefined and must not be read
+    for (self.grapheme_buf[0..self.g_idx]) |grapheme| {
+        const g_bytes = self.buf[grapheme.start..grapheme.end];
+        if (std.mem.eql(u8, g_bytes, bytes)) {
+            return grapheme.codepoint;
+        }
+    }
+    if (self.idx + bytes.len > self.buf.len) return error.OutOfGraphemeBufferMemory;
+    if (self.g_idx + 1 > self.grapheme_buf.len) return error.OutOfGraphemeMemory;
+
+    // copy the grapheme to our storage
+    @memcpy(self.buf[self.idx .. self.idx + bytes.len], bytes);
+
+    const g = Grapheme{
+        // assign a codepoint that is always outside of valid unicode
+        .codepoint = self.g_idx + UNICODE_MAX + 1,
+        .start = self.idx,
+        .end = self.idx + bytes.len,
+    };
+    self.grapheme_buf[self.g_idx] = g;
+    self.g_idx += 1;
+    self.idx += bytes.len;
+
+    return g.codepoint;
+}
+
+/// get the slice of bytes for a given grapheme. `cp` must be a codepoint
+/// previously returned by `put`; anything else returns
+/// error.InvalidGraphemeIndex. The returned slice points into `buf`
+pub fn get(self: *GraphemeCache, cp: u21) ![]const u8 {
+    if (cp < (UNICODE_MAX + 1)) return error.InvalidGraphemeIndex;
+    const idx: usize = cp - UNICODE_MAX - 1;
+    // g_idx is the next free slot, so idx == g_idx has not been written yet
+    if (idx >= self.g_idx) return error.InvalidGraphemeIndex;
+    const g = self.grapheme_buf[idx];
+    return self.buf[g.start..g.end];
+}
+
+test "GraphemeCache: roundtrip" {
+    var cache: GraphemeCache = .{};
+    const cp = try cache.put("abc");
+    const bytes = try cache.get(cp);
+    try testing.expectEqualStrings("abc", bytes);
+
+    const cp_2 = try cache.put("abc");
+    try testing.expectEqual(cp, cp_2);
+
+    const cp_3 = try cache.put("def");
+    try testing.expectEqual(cp + 1, cp_3);
+
+    // a codepoint that was never handed out must be rejected
+    try testing.expectError(error.InvalidGraphemeIndex, cache.get(cp_3 + 1));
+}
diff --git a/src/Key.zig b/src/Key.zig
index 7f5eaf9..d0486a5 100644
--- a/src/Key.zig
+++ b/src/Key.zig
@@ -11,8 +11,7 @@ pub const Modifiers = packed struct(u8) {
     num_lock: bool = false,
 };
 
-/// the unicode codepoint of the key event. This can be greater than the maximum
-/// allowable unicode codepoint for special keys
+/// the unicode codepoint of the key event.
codepoint: u21, /// the text generated from the key event, if any diff --git a/src/Tty.zig b/src/Tty.zig index 6244b33..6c9b562 100644 --- a/src/Tty.zig +++ b/src/Tty.zig @@ -143,7 +143,12 @@ pub fn run( switch (event) { .key_press => |key| { if (@hasField(EventType, "key_press")) { - vx.postEvent(.{ .key_press = key }); + // HACK: yuck. there has to be a better way + var mut_key = key; + if (key.text) |text| { + mut_key.codepoint = try vx.g_cache.put(text); + } + vx.postEvent(.{ .key_press = mut_key }); } }, .focus_in => { diff --git a/src/main.zig b/src/main.zig index 5f53a2b..bac994d 100644 --- a/src/main.zig +++ b/src/main.zig @@ -13,6 +13,7 @@ pub fn init(comptime EventType: type, opts: Options) !Vaxis(EventType) { } test { + _ = @import("GraphemeCache.zig"); _ = @import("Key.zig"); _ = @import("Options.zig"); _ = @import("Screen.zig"); diff --git a/src/parser.zig b/src/parser.zig index cb8bcc2..ca5d191 100644 --- a/src/parser.zig +++ b/src/parser.zig @@ -4,6 +4,7 @@ const Event = @import("event.zig").Event; const Key = @import("Key.zig"); const CodePointIterator = @import("ziglyph").CodePointIterator; const graphemeBreak = @import("ziglyph").graphemeBreak; +const UNICODE_MAX = @import("GraphemeCache.zig").UNICODE_MAX; const log = std.log.scoped(.parser); @@ -82,23 +83,31 @@ pub fn parse(input: []const u8) !Result { // 0x20...0x7E => .{ .codepoint = b }, 0x7F => .{ .codepoint = Key.backspace }, else => blk: { - // TODO: iterate codepoints to find a complete grapheme. - // For now we are just taking the first codepoint and - // throwing a warning. I think we'll end up mapping a - // u21 to a look-aside table of graphemes, I just need - // to implement that table somewhere and give access to - // it here. var iter: CodePointIterator = .{ .bytes = input[i..] 
}; // return null if we don't have a valid codepoint - const cp = iter.next() orelse return .{ .event = null, .n = 0 }; - if (iter.next()) |next_cp| { - var break_state: u3 = 0; - if (!graphemeBreak(cp.code, next_cp.code, &break_state)) { - log.warn("grapheme support not implemented yet", .{}); + var cp = iter.next() orelse return .{ .event = null, .n = 0 }; + + var code = cp.code; + const g_start = i; + i += cp.len - 1; // subtract one for the loop iter + var g_state: u3 = 0; + while (iter.next()) |next_cp| { + if (graphemeBreak(cp.code, next_cp.code, &g_state)) { + break; } + code = UNICODE_MAX + 1; + i += next_cp.len; + cp = next_cp; } - i += cp.len - 1; - break :blk .{ .codepoint = cp.code }; + const text: ?[]const u8 = multi: { + if (code > UNICODE_MAX) { + break :multi input[g_start .. i + 1]; + } else { + break :multi null; + } + }; + + break :blk .{ .codepoint = code, .text = text }; }, }; return .{ @@ -562,10 +571,27 @@ test "parse: multiple codepoint grapheme" { const input = "👩‍🚀"; const result = try parse(input); const expected_key: Key = .{ - .codepoint = 0x1F469, + .codepoint = UNICODE_MAX + 1, + .text = input, }; const expected_event: Event = .{ .key_press = expected_key }; - try testing.expectEqual(4, result.n); + try testing.expectEqual(input.len, result.n); try testing.expectEqual(expected_event, result.event); } + +test "parse: multiple codepoint grapheme with more after" { + // TODO: this test is passing but throws a warning. 
Not sure how we'll + // handle graphemes yet + const input = "👩‍🚀abc"; + const result = try parse(input); + const expected_key: Key = .{ + .codepoint = UNICODE_MAX + 1, + .text = "👩‍🚀", + }; + + try testing.expectEqual(expected_key.text.?.len, result.n); + const actual = result.event.?.key_press; + try testing.expectEqualStrings(expected_key.text.?, actual.text.?); + try testing.expectEqual(expected_key.codepoint, actual.codepoint); +} diff --git a/src/vaxis.zig b/src/vaxis.zig index 253f173..7f231d1 100644 --- a/src/vaxis.zig +++ b/src/vaxis.zig @@ -9,6 +9,7 @@ const Screen = @import("Screen.zig"); const Window = @import("Window.zig"); const Options = @import("Options.zig"); const Style = @import("cell.zig").Style; +const GraphemeCache = @import("GraphemeCache.zig"); /// Vaxis is the entrypoint for a Vaxis application. The provided type T should /// be a tagged union which contains all of the events the application will @@ -46,6 +47,9 @@ pub fn Vaxis(comptime T: type) type { renders: usize = 0, render_dur: i128 = 0, + // grapheme cache + g_cache: GraphemeCache = .{}, + /// Initialize Vaxis with runtime options pub fn init(_: Options) !Self { return Self{ @@ -78,6 +82,12 @@ pub fn Vaxis(comptime T: type) type { const tpr = @divTrunc(self.render_dur, self.renders); log.info("total renders = {d}", .{self.renders}); log.info("microseconds per render = {d}", .{tpr}); + log.info("cached graphemes n = {d} / {d}, bytes = {d} / {d}", .{ + self.g_cache.g_idx, + self.g_cache.grapheme_buf.len, + self.g_cache.idx, + self.g_cache.buf.len, + }); } }