Fix Unicode (#135)
* init * wip * wip * fix unicode break * fix unicode break * Update helix-core/src/transaction.rs Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu> * clippy * fix * add changes * added test * wip * wip * wip * wip * fix * fix view * fix #88 Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
This commit is contained in:
parent
8f1eb7b2b0
commit
b873fb9897
7 changed files with 94 additions and 26 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -265,6 +265,7 @@ dependencies = [
|
||||||
"tendril",
|
"tendril",
|
||||||
"toml",
|
"toml",
|
||||||
"tree-sitter",
|
"tree-sitter",
|
||||||
|
"unicode-general-category",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
"unicode-width",
|
"unicode-width",
|
||||||
]
|
]
|
||||||
|
@ -969,6 +970,12 @@ dependencies = [
|
||||||
"matches",
|
"matches",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-general-category"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "07547e3ee45e28326cc23faac56d44f58f16ab23e413db526debce3b0bfd2742"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-normalization"
|
name = "unicode-normalization"
|
||||||
version = "0.1.19"
|
version = "0.1.19"
|
||||||
|
|
|
@ -19,6 +19,7 @@ smallvec = "1.4"
|
||||||
tendril = "0.4.2"
|
tendril = "0.4.2"
|
||||||
unicode-segmentation = "1.6"
|
unicode-segmentation = "1.6"
|
||||||
unicode-width = "0.1"
|
unicode-width = "0.1"
|
||||||
|
unicode-general-category = "0.4.0"
|
||||||
# slab = "0.4.2"
|
# slab = "0.4.2"
|
||||||
tree-sitter = "0.19"
|
tree-sitter = "0.19"
|
||||||
once_cell = "1.4"
|
once_cell = "1.4"
|
||||||
|
|
|
@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
|
||||||
|
|
||||||
if is_word(ch) {
|
if is_word(ch) {
|
||||||
skip_over_next(slice, &mut end, is_word);
|
skip_over_next(slice, &mut end, is_word);
|
||||||
} else if ch.is_ascii_punctuation() {
|
} else if is_punctuation(ch) {
|
||||||
skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
|
skip_over_next(slice, &mut end, is_punctuation);
|
||||||
}
|
}
|
||||||
|
|
||||||
skip_over_next(slice, &mut end, is_horiz_blank);
|
skip_over_next(slice, &mut end, char::is_whitespace);
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(Range::new(begin, end - 1))
|
Some(Range::new(begin, end - 1))
|
||||||
|
@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
|
||||||
|
|
||||||
end = begin;
|
end = begin;
|
||||||
|
|
||||||
with_end = skip_over_prev(slice, &mut end, is_horiz_blank);
|
with_end = skip_over_prev(slice, &mut end, char::is_whitespace);
|
||||||
|
|
||||||
// refetch
|
// refetch
|
||||||
let ch = slice.char(end);
|
let ch = slice.char(end);
|
||||||
|
|
||||||
if is_word(ch) {
|
if is_word(ch) {
|
||||||
with_end = skip_over_prev(slice, &mut end, is_word);
|
with_end = skip_over_prev(slice, &mut end, is_word);
|
||||||
} else if ch.is_ascii_punctuation() {
|
} else if is_punctuation(ch) {
|
||||||
with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation());
|
with_end = skip_over_prev(slice, &mut end, is_punctuation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
|
||||||
|
|
||||||
end = begin;
|
end = begin;
|
||||||
|
|
||||||
skip_over_next(slice, &mut end, is_horiz_blank);
|
skip_over_next(slice, &mut end, char::is_whitespace);
|
||||||
|
|
||||||
// refetch
|
// refetch
|
||||||
let ch = slice.char(end);
|
let ch = slice.char(end);
|
||||||
|
|
||||||
if is_word(ch) {
|
if is_word(ch) {
|
||||||
skip_over_next(slice, &mut end, is_word);
|
skip_over_next(slice, &mut end, is_word);
|
||||||
} else if ch.is_ascii_punctuation() {
|
} else if is_punctuation(ch) {
|
||||||
skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
|
skip_over_next(slice, &mut end, is_punctuation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
|
||||||
|
|
||||||
// used for by-word movement
|
// used for by-word movement
|
||||||
|
|
||||||
|
#[inline]
|
||||||
pub(crate) fn is_word(ch: char) -> bool {
|
pub(crate) fn is_word(ch: char) -> bool {
|
||||||
ch.is_alphanumeric() || ch == '_'
|
ch.is_alphanumeric() || ch == '_'
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn is_horiz_blank(ch: char) -> bool {
|
#[inline]
|
||||||
matches!(ch, ' ' | '\t')
|
pub(crate) fn is_punctuation(ch: char) -> bool {
|
||||||
|
use unicode_general_category::{get_general_category, GeneralCategory};
|
||||||
|
|
||||||
|
matches!(
|
||||||
|
get_general_category(ch),
|
||||||
|
GeneralCategory::OtherPunctuation
|
||||||
|
| GeneralCategory::OpenPunctuation
|
||||||
|
| GeneralCategory::ClosePunctuation
|
||||||
|
| GeneralCategory::InitialPunctuation
|
||||||
|
| GeneralCategory::FinalPunctuation
|
||||||
|
| GeneralCategory::ConnectorPunctuation
|
||||||
|
| GeneralCategory::DashPunctuation
|
||||||
|
| GeneralCategory::MathSymbol
|
||||||
|
| GeneralCategory::CurrencySymbol
|
||||||
|
| GeneralCategory::ModifierSymbol
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq)]
|
#[derive(Debug, Eq, PartialEq)]
|
||||||
|
@ -191,14 +207,15 @@ pub(crate) enum Category {
|
||||||
Unknown,
|
Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
pub(crate) fn categorize(ch: char) -> Category {
|
pub(crate) fn categorize(ch: char) -> Category {
|
||||||
if ch == '\n' {
|
if ch == '\n' {
|
||||||
Category::Eol
|
Category::Eol
|
||||||
} else if ch.is_ascii_whitespace() {
|
} else if ch.is_whitespace() {
|
||||||
Category::Whitespace
|
Category::Whitespace
|
||||||
} else if is_word(ch) {
|
} else if is_word(ch) {
|
||||||
Category::Word
|
Category::Word
|
||||||
} else if ch.is_ascii_punctuation() {
|
} else if is_punctuation(ch) {
|
||||||
Category::Punctuation
|
Category::Punctuation
|
||||||
} else {
|
} else {
|
||||||
Category::Unknown
|
Category::Unknown
|
||||||
|
@ -213,6 +230,7 @@ where
|
||||||
{
|
{
|
||||||
let mut chars = slice.chars_at(*pos);
|
let mut chars = slice.chars_at(*pos);
|
||||||
|
|
||||||
|
#[allow(clippy::while_let_on_iterator)]
|
||||||
while let Some(ch) = chars.next() {
|
while let Some(ch) = chars.next() {
|
||||||
if !fun(ch) {
|
if !fun(ch) {
|
||||||
break;
|
break;
|
||||||
|
@ -231,6 +249,7 @@ where
|
||||||
// need to +1 so that prev() includes current char
|
// need to +1 so that prev() includes current char
|
||||||
let mut chars = slice.chars_at(*pos + 1);
|
let mut chars = slice.chars_at(*pos + 1);
|
||||||
|
|
||||||
|
#[allow(clippy::while_let_on_iterator)]
|
||||||
while let Some(ch) = chars.prev() {
|
while let Some(ch) = chars.prev() {
|
||||||
if !fun(ch) {
|
if !fun(ch) {
|
||||||
break;
|
break;
|
||||||
|
@ -259,4 +278,44 @@ mod test {
|
||||||
(1, 2).into()
|
(1, 2).into()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_categorize() {
|
||||||
|
const WORD_TEST_CASE: &'static str =
|
||||||
|
"_hello_world_あいうえおー12345678901234567890";
|
||||||
|
const PUNCTUATION_TEST_CASE: &'static str = "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
|
||||||
|
const WHITESPACE_TEST_CASE: &'static str = " ";
|
||||||
|
|
||||||
|
assert_eq!(Category::Eol, categorize('\n'));
|
||||||
|
|
||||||
|
for ch in WHITESPACE_TEST_CASE.chars() {
|
||||||
|
assert_eq!(
|
||||||
|
Category::Whitespace,
|
||||||
|
categorize(ch),
|
||||||
|
"Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
|
||||||
|
ch,
|
||||||
|
categorize(ch)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ch in WORD_TEST_CASE.chars() {
|
||||||
|
assert_eq!(
|
||||||
|
Category::Word,
|
||||||
|
categorize(ch),
|
||||||
|
"Testing '{}', but got `{:?}` instead of `Category::Word`",
|
||||||
|
ch,
|
||||||
|
categorize(ch)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ch in PUNCTUATION_TEST_CASE.chars() {
|
||||||
|
assert_eq!(
|
||||||
|
Category::Punctuation,
|
||||||
|
categorize(ch),
|
||||||
|
"Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
|
||||||
|
ch,
|
||||||
|
categorize(ch)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -758,7 +758,7 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn combine_with_utf8() {
|
fn combine_with_utf8() {
|
||||||
const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです!";
|
const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです!";
|
||||||
|
|
||||||
let empty = Rope::from("");
|
let empty = Rope::from("");
|
||||||
let mut a = ChangeSet::new(&empty);
|
let mut a = ChangeSet::new(&empty);
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev};
|
use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev};
|
||||||
use ropey::RopeSlice;
|
use ropey::RopeSlice;
|
||||||
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
|
@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz
|
||||||
// return if not skip while?
|
// return if not skip while?
|
||||||
skip_over_prev(slice, &mut char_idx, |ch| ch == '\n');
|
skip_over_prev(slice, &mut char_idx, |ch| ch == '\n');
|
||||||
|
|
||||||
with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank);
|
with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace);
|
||||||
|
|
||||||
// refetch
|
// refetch
|
||||||
let ch = slice.char(char_idx);
|
let ch = slice.char(char_idx);
|
||||||
|
|
||||||
if is_word(ch) {
|
if is_word(ch) {
|
||||||
with_end = skip_over_prev(slice, &mut char_idx, is_word);
|
with_end = skip_over_prev(slice, &mut char_idx, is_word);
|
||||||
} else if ch.is_ascii_punctuation() {
|
} else if is_punctuation(ch) {
|
||||||
with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation());
|
with_end = skip_over_prev(slice, &mut char_idx, is_punctuation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,11 +47,11 @@ fn different_prev_word_boundary() {
|
||||||
t("hello, world", "hello, ");
|
t("hello, world", "hello, ");
|
||||||
t("hello, ", "hello");
|
t("hello, ", "hello");
|
||||||
t("hello", "");
|
t("hello", "");
|
||||||
t("こんにちは、世界!", "こんにちは、世界!"); // TODO: punctuation
|
t("こんにちは、世界!", "こんにちは、世界");
|
||||||
t("こんにちは、世界", "こんにちは、");
|
t("こんにちは、世界", "こんにちは、");
|
||||||
t("こんにちは、", "こんにちは、"); // what?
|
t("こんにちは、", "こんにちは");
|
||||||
t("こんにちは", "");
|
t("こんにちは", "");
|
||||||
t("この世界。", "この世界。"); // what?
|
t("この世界。", "この世界");
|
||||||
t("この世界", "");
|
t("この世界", "");
|
||||||
t("お前はもう死んでいる", "");
|
t("お前はもう死んでいる", "");
|
||||||
t("その300円です", ""); // TODO: should stop at 300
|
t("その300円です", ""); // TODO: should stop at 300
|
||||||
|
|
|
@ -654,9 +654,10 @@ pub fn split_selection_on_newline(cx: &mut Context) {
|
||||||
fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) {
|
fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) {
|
||||||
let text = doc.text();
|
let text = doc.text();
|
||||||
let selection = doc.selection(view.id);
|
let selection = doc.selection(view.id);
|
||||||
let start = selection.cursor();
|
let start = text.char_to_byte(selection.cursor());
|
||||||
|
|
||||||
// use find_at to find the next match after the cursor, loop around the end
|
// use find_at to find the next match after the cursor, loop around the end
|
||||||
|
// Careful, `Regex` uses `bytes` as offsets, not character indices!
|
||||||
let mat = regex
|
let mat = regex
|
||||||
.find_at(contents, start)
|
.find_at(contents, start)
|
||||||
.or_else(|| regex.find(contents));
|
.or_else(|| regex.find(contents));
|
||||||
|
@ -670,7 +671,7 @@ fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, e
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let head = end;
|
let head = end - 1;
|
||||||
|
|
||||||
let selection = if extend {
|
let selection = if extend {
|
||||||
selection.clone().push(Range::new(start, head))
|
selection.clone().push(Range::new(start, head))
|
||||||
|
@ -1027,7 +1028,7 @@ pub fn command_mode(cx: &mut Context) {
|
||||||
let mut prompt = Prompt::new(
|
let mut prompt = Prompt::new(
|
||||||
":".to_owned(),
|
":".to_owned(),
|
||||||
|input: &str| {
|
|input: &str| {
|
||||||
// we use .this over split_ascii_whitespace() because we care about empty segments
|
// we use .this over split_whitespace() because we care about empty segments
|
||||||
let parts = input.split(' ').collect::<Vec<&str>>();
|
let parts = input.split(' ').collect::<Vec<&str>>();
|
||||||
|
|
||||||
// simple heuristic: if there's no just one part, complete command name.
|
// simple heuristic: if there's no just one part, complete command name.
|
||||||
|
@ -1069,7 +1070,7 @@ pub fn command_mode(cx: &mut Context) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let parts = input.split_ascii_whitespace().collect::<Vec<&str>>();
|
let parts = input.split_whitespace().collect::<Vec<&str>>();
|
||||||
if parts.is_empty() {
|
if parts.is_empty() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,7 +106,7 @@ impl View {
|
||||||
/// Calculates the last visible line on screen
|
/// Calculates the last visible line on screen
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn last_line(&self, doc: &Document) -> usize {
|
pub fn last_line(&self, doc: &Document) -> usize {
|
||||||
let height = self.area.height.saturating_sub(2); // - 2 for statusline
|
let height = self.area.height.saturating_sub(1); // - 1 for statusline
|
||||||
std::cmp::min(
|
std::cmp::min(
|
||||||
self.first_line + height as usize,
|
self.first_line + height as usize,
|
||||||
doc.text().len_lines() - 1,
|
doc.text().len_lines() - 1,
|
||||||
|
|
Loading…
Add table
Reference in a new issue