libvaxis/src/GraphemeCache.zig

79 lines
2.4 KiB
Zig
Raw Normal View History

const std = @import("std");
const testing = std.testing;
const GraphemeCache = @This();
/// the underlying storage for graphemes
buf: [1024 * 4]u8 = undefined,
// the start index of the next grapheme
idx: usize = 0,
/// the cache of graphemes. This allows up to 1024 graphemes with 4 codepoints
/// each
grapheme_buf: [1024]Grapheme = undefined,
// index of our next grapheme
g_idx: u21 = 0,
pub const UNICODE_MAX = 1_114_112;
const Grapheme = struct {
// codepoint is an index into the internal storage
codepoint: u21,
start: usize,
end: usize,
};
/// put a slice of bytes in the cache as a grapheme
pub fn put(self: *GraphemeCache, bytes: []const u8) !u21 {
// See if we already have these bytes. It's a likely case that if we get one
// grapheme, we'll get it again. So this will save a lot of storage and is
// most likely worth the cost as it's pretty rare
for (self.grapheme_buf) |grapheme| {
const g_bytes = self.buf[grapheme.start..grapheme.end];
if (std.mem.eql(u8, g_bytes, bytes)) {
return grapheme.codepoint;
}
}
if (self.idx + bytes.len > self.buf.len) return error.OutOfGraphemeBufferMemory;
if (self.g_idx + 1 > self.grapheme_buf.len) return error.OutOfGraphemeMemory;
// copy the grapheme to our storage
@memcpy(self.buf[self.idx .. self.idx + bytes.len], bytes);
const g = Grapheme{
// assign a codepoint that is always outside of valid unicode
.codepoint = self.g_idx + UNICODE_MAX + 1,
.start = self.idx,
.end = self.idx + bytes.len,
};
self.grapheme_buf[self.g_idx] = g;
self.g_idx += 1;
self.idx += bytes.len;
return g.codepoint;
}
/// get the slice of bytes for a given grapheme
pub fn get(self: *GraphemeCache, cp: u21) ![]const u8 {
if (cp < (UNICODE_MAX + 1)) return error.InvalidGraphemeIndex;
const idx: usize = cp - UNICODE_MAX - 1;
if (idx > self.g_idx) return error.InvalidGraphemeIndex;
const g = self.grapheme_buf[idx];
return self.buf[g.start..g.end];
}
test "GraphemeCache: roundtrip" {
var cache: GraphemeCache = .{};
const cp = try cache.put("abc");
const bytes = try cache.get(cp);
try testing.expectEqualStrings("abc", bytes);
const cp_2 = try cache.put("abc");
try testing.expectEqual(cp, cp_2);
const cp_3 = try cache.put("def");
try testing.expectEqual(cp + 1, cp_3);
}