parser: handle graphemes
I'm not positive this is the final approach. We fill in the `text` field if there was multi-codepoint text generated from the key_press. Signed-off-by: Tim Culverhouse <tim@timculverhouse.com>
This commit is contained in:
parent
0f12881c17
commit
40525eb038
6 changed files with 137 additions and 18 deletions
78
src/GraphemeCache.zig
Normal file
78
src/GraphemeCache.zig
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
const std = @import("std");
|
||||||
|
const testing = std.testing;
|
||||||
|
|
||||||
|
const GraphemeCache = @This();

/// Backing storage for the bytes of every cached grapheme. Entries are written
/// back to back; `idx` marks the end of the used region.
buf: [1024 * 4]u8 = undefined,

/// Offset into `buf` at which the next grapheme's bytes will be written.
idx: usize = 0,

/// Table of cached graphemes. Only the first `g_idx` entries are initialized;
/// everything after that is undefined and must not be read. There is room for
/// 1024 graphemes which share the 4096 bytes of `buf` (an average of 4 bytes
/// each).
grapheme_buf: [1024]Grapheme = undefined,

/// Number of graphemes stored so far. This is also the index of the next free
/// slot in `grapheme_buf`.
g_idx: u21 = 0,

/// Every codepoint handed out by `put` is strictly greater than this value, so
/// cached graphemes can be told apart from ordinary codepoints.
pub const UNICODE_MAX = 1_114_112;

const Grapheme = struct {
    /// synthetic codepoint identifying this entry: its index in `grapheme_buf`
    /// plus `UNICODE_MAX + 1`
    codepoint: u21,
    /// offset of the first byte of the grapheme in `buf`
    start: usize,
    /// offset one past the last byte of the grapheme in `buf`
    end: usize,
};

/// Store `bytes` in the cache as a grapheme and return the synthetic codepoint
/// that identifies it. Storing bytes that are already cached returns the
/// codepoint assigned the first time, without consuming any more storage.
///
/// Returns `error.OutOfGraphemeBufferMemory` when `buf` cannot hold `bytes`,
/// and `error.OutOfGraphemeMemory` when every slot of `grapheme_buf` is taken.
pub fn put(self: *GraphemeCache, bytes: []const u8) !u21 {
    // See if we already have these bytes. It's likely that if we get a
    // grapheme once we'll get it again, so the linear scan pays for itself in
    // saved storage. Only the first `g_idx` entries have been written: the
    // remainder of `grapheme_buf` is undefined and must not be inspected.
    for (self.grapheme_buf[0..self.g_idx]) |grapheme| {
        const g_bytes = self.buf[grapheme.start..grapheme.end];
        if (std.mem.eql(u8, g_bytes, bytes)) return grapheme.codepoint;
    }

    if (self.idx + bytes.len > self.buf.len) return error.OutOfGraphemeBufferMemory;
    if (self.g_idx >= self.grapheme_buf.len) return error.OutOfGraphemeMemory;

    // copy the grapheme to our storage
    const end = self.idx + bytes.len;
    @memcpy(self.buf[self.idx..end], bytes);

    const g: Grapheme = .{
        // assign a codepoint that is always outside of valid unicode
        .codepoint = self.g_idx + UNICODE_MAX + 1,
        .start = self.idx,
        .end = end,
    };
    self.grapheme_buf[self.g_idx] = g;
    self.g_idx += 1;
    self.idx = end;

    return g.codepoint;
}

/// Look up the bytes of the grapheme identified by `cp`, a codepoint previously
/// returned by `put`. The returned slice points into `buf`.
///
/// Returns `error.InvalidGraphemeIndex` when `cp` is not greater than
/// `UNICODE_MAX` or does not refer to a grapheme that has been stored.
pub fn get(self: *GraphemeCache, cp: u21) ![]const u8 {
    if (cp <= UNICODE_MAX) return error.InvalidGraphemeIndex;
    const idx: usize = cp - UNICODE_MAX - 1;
    // `g_idx` is the first slot that has NOT been written yet, so it is out of
    // range as well
    if (idx >= self.g_idx) return error.InvalidGraphemeIndex;
    const g = self.grapheme_buf[idx];
    return self.buf[g.start..g.end];
}

test "GraphemeCache: roundtrip" {
    var cache: GraphemeCache = .{};
    const cp = try cache.put("abc");
    const bytes = try cache.get(cp);
    try testing.expectEqualStrings("abc", bytes);

    // the same bytes map to the same codepoint
    const cp_2 = try cache.put("abc");
    try testing.expectEqual(cp, cp_2);

    // new bytes get the next codepoint
    const cp_3 = try cache.put("def");
    try testing.expectEqual(cp + 1, cp_3);
}

test "GraphemeCache: codepoints that were never handed out are rejected" {
    var cache: GraphemeCache = .{};

    // nothing stored yet, so even the first synthetic codepoint is invalid
    try testing.expectError(error.InvalidGraphemeIndex, cache.get(UNICODE_MAX + 1));
    // ordinary codepoints never refer to a cached grapheme
    try testing.expectError(error.InvalidGraphemeIndex, cache.get('a'));
    try testing.expectError(error.InvalidGraphemeIndex, cache.get(UNICODE_MAX));

    const cp = try cache.put("abc");
    try testing.expectEqualStrings("abc", try cache.get(cp));
    // one past the last stored grapheme
    try testing.expectError(error.InvalidGraphemeIndex, cache.get(cp + 1));
}
|
|
@ -11,8 +11,7 @@ pub const Modifiers = packed struct(u8) {
|
||||||
num_lock: bool = false,
|
num_lock: bool = false,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// the unicode codepoint of the key event. This can be greater than the maximum
|
/// the unicode codepoint of the key event.
|
||||||
/// allowable unicode codepoint for special keys
|
|
||||||
codepoint: u21,
|
codepoint: u21,
|
||||||
|
|
||||||
/// the text generated from the key event, if any
|
/// the text generated from the key event, if any
|
||||||
|
|
|
@ -143,7 +143,12 @@ pub fn run(
|
||||||
switch (event) {
|
switch (event) {
|
||||||
.key_press => |key| {
|
.key_press => |key| {
|
||||||
if (@hasField(EventType, "key_press")) {
|
if (@hasField(EventType, "key_press")) {
|
||||||
vx.postEvent(.{ .key_press = key });
|
// HACK: yuck. there has to be a better way
|
||||||
|
var mut_key = key;
|
||||||
|
if (key.text) |text| {
|
||||||
|
mut_key.codepoint = try vx.g_cache.put(text);
|
||||||
|
}
|
||||||
|
vx.postEvent(.{ .key_press = mut_key });
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
.focus_in => {
|
.focus_in => {
|
||||||
|
|
|
@ -13,6 +13,7 @@ pub fn init(comptime EventType: type, opts: Options) !Vaxis(EventType) {
|
||||||
}
|
}
|
||||||
|
|
||||||
test {
|
test {
|
||||||
|
_ = @import("GraphemeCache.zig");
|
||||||
_ = @import("Key.zig");
|
_ = @import("Key.zig");
|
||||||
_ = @import("Options.zig");
|
_ = @import("Options.zig");
|
||||||
_ = @import("Screen.zig");
|
_ = @import("Screen.zig");
|
||||||
|
|
|
@ -4,6 +4,7 @@ const Event = @import("event.zig").Event;
|
||||||
const Key = @import("Key.zig");
|
const Key = @import("Key.zig");
|
||||||
const CodePointIterator = @import("ziglyph").CodePointIterator;
|
const CodePointIterator = @import("ziglyph").CodePointIterator;
|
||||||
const graphemeBreak = @import("ziglyph").graphemeBreak;
|
const graphemeBreak = @import("ziglyph").graphemeBreak;
|
||||||
|
const UNICODE_MAX = @import("GraphemeCache.zig").UNICODE_MAX;
|
||||||
|
|
||||||
const log = std.log.scoped(.parser);
|
const log = std.log.scoped(.parser);
|
||||||
|
|
||||||
|
@ -82,23 +83,31 @@ pub fn parse(input: []const u8) !Result {
|
||||||
// 0x20...0x7E => .{ .codepoint = b },
|
// 0x20...0x7E => .{ .codepoint = b },
|
||||||
0x7F => .{ .codepoint = Key.backspace },
|
0x7F => .{ .codepoint = Key.backspace },
|
||||||
else => blk: {
|
else => blk: {
|
||||||
// TODO: iterate codepoints to find a complete grapheme.
|
|
||||||
// For now we are just taking the first codepoint and
|
|
||||||
// throwing a warning. I think we'll end up mapping a
|
|
||||||
// u21 to a look-aside table of graphemes, I just need
|
|
||||||
// to implement that table somewhere and give access to
|
|
||||||
// it here.
|
|
||||||
var iter: CodePointIterator = .{ .bytes = input[i..] };
|
var iter: CodePointIterator = .{ .bytes = input[i..] };
|
||||||
// return null if we don't have a valid codepoint
|
// return null if we don't have a valid codepoint
|
||||||
const cp = iter.next() orelse return .{ .event = null, .n = 0 };
|
var cp = iter.next() orelse return .{ .event = null, .n = 0 };
|
||||||
if (iter.next()) |next_cp| {
|
|
||||||
var break_state: u3 = 0;
|
var code = cp.code;
|
||||||
if (!graphemeBreak(cp.code, next_cp.code, &break_state)) {
|
const g_start = i;
|
||||||
log.warn("grapheme support not implemented yet", .{});
|
i += cp.len - 1; // subtract one for the loop iter
|
||||||
|
var g_state: u3 = 0;
|
||||||
|
while (iter.next()) |next_cp| {
|
||||||
|
if (graphemeBreak(cp.code, next_cp.code, &g_state)) {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
code = UNICODE_MAX + 1;
|
||||||
|
i += next_cp.len;
|
||||||
|
cp = next_cp;
|
||||||
}
|
}
|
||||||
i += cp.len - 1;
|
const text: ?[]const u8 = multi: {
|
||||||
break :blk .{ .codepoint = cp.code };
|
if (code > UNICODE_MAX) {
|
||||||
|
break :multi input[g_start .. i + 1];
|
||||||
|
} else {
|
||||||
|
break :multi null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
break :blk .{ .codepoint = code, .text = text };
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
return .{
|
return .{
|
||||||
|
@ -562,10 +571,27 @@ test "parse: multiple codepoint grapheme" {
|
||||||
const input = "👩🚀";
|
const input = "👩🚀";
|
||||||
const result = try parse(input);
|
const result = try parse(input);
|
||||||
const expected_key: Key = .{
|
const expected_key: Key = .{
|
||||||
.codepoint = 0x1F469,
|
.codepoint = UNICODE_MAX + 1,
|
||||||
|
.text = input,
|
||||||
};
|
};
|
||||||
const expected_event: Event = .{ .key_press = expected_key };
|
const expected_event: Event = .{ .key_press = expected_key };
|
||||||
|
|
||||||
try testing.expectEqual(4, result.n);
|
try testing.expectEqual(input.len, result.n);
|
||||||
try testing.expectEqual(expected_event, result.event);
|
try testing.expectEqual(expected_event, result.event);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test "parse: multiple codepoint grapheme with more after" {
    // WOMAN + ZERO WIDTH JOINER + ROCKET form a single grapheme. The joiner is
    // invisible, so every codepoint is spelled as an escape to keep an editor
    // or renderer from silently dropping it.
    const grapheme = "\u{1F469}\u{200D}\u{1F680}";
    const input = grapheme ++ "abc";
    const result = try parse(input);
    const expected_key: Key = .{
        .codepoint = UNICODE_MAX + 1,
        .text = grapheme,
    };

    // only the grapheme is consumed; the trailing "abc" is not part of it
    try testing.expectEqual(expected_key.text.?.len, result.n);
    const actual = result.event.?.key_press;
    try testing.expectEqualStrings(expected_key.text.?, actual.text.?);
    try testing.expectEqual(expected_key.codepoint, actual.codepoint);
}
|
||||||
|
|
|
@ -9,6 +9,7 @@ const Screen = @import("Screen.zig");
|
||||||
const Window = @import("Window.zig");
|
const Window = @import("Window.zig");
|
||||||
const Options = @import("Options.zig");
|
const Options = @import("Options.zig");
|
||||||
const Style = @import("cell.zig").Style;
|
const Style = @import("cell.zig").Style;
|
||||||
|
const GraphemeCache = @import("GraphemeCache.zig");
|
||||||
|
|
||||||
/// Vaxis is the entrypoint for a Vaxis application. The provided type T should
|
/// Vaxis is the entrypoint for a Vaxis application. The provided type T should
|
||||||
/// be a tagged union which contains all of the events the application will
|
/// be a tagged union which contains all of the events the application will
|
||||||
|
@ -46,6 +47,9 @@ pub fn Vaxis(comptime T: type) type {
|
||||||
renders: usize = 0,
|
renders: usize = 0,
|
||||||
render_dur: i128 = 0,
|
render_dur: i128 = 0,
|
||||||
|
|
||||||
|
// grapheme cache
|
||||||
|
g_cache: GraphemeCache = .{},
|
||||||
|
|
||||||
/// Initialize Vaxis with runtime options
|
/// Initialize Vaxis with runtime options
|
||||||
pub fn init(_: Options) !Self {
|
pub fn init(_: Options) !Self {
|
||||||
return Self{
|
return Self{
|
||||||
|
@ -78,6 +82,12 @@ pub fn Vaxis(comptime T: type) type {
|
||||||
const tpr = @divTrunc(self.render_dur, self.renders);
|
const tpr = @divTrunc(self.render_dur, self.renders);
|
||||||
log.info("total renders = {d}", .{self.renders});
|
log.info("total renders = {d}", .{self.renders});
|
||||||
log.info("microseconds per render = {d}", .{tpr});
|
log.info("microseconds per render = {d}", .{tpr});
|
||||||
|
log.info("cached graphemes n = {d} / {d}, bytes = {d} / {d}", .{
|
||||||
|
self.g_cache.g_idx,
|
||||||
|
self.g_cache.grapheme_buf.len,
|
||||||
|
self.g_cache.idx,
|
||||||
|
self.g_cache.buf.len,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue