Merge remote-tracking branch 'upstream/word-completion'
Commit abc39e381a

15 changed files with 958 additions and 5 deletions, across book/src, helix-core/src, helix-stdx/src, helix-term/src, and helix-view/src.
@@ -38,6 +38,8 @@
| `gutters` | Gutters to display: available are `diagnostics`, `diff`, `line-numbers` and `spacer`. Note that `diagnostics` also includes other features like breakpoints; 1-width padding will be inserted if gutters is non-empty | `["diagnostics", "spacer", "line-numbers", "spacer", "diff"]` |
| `auto-completion` | Enable automatic popup of auto-completion | `true` |
| `path-completion` | Enable filepath completion. Show files and directories if an existing path at the cursor was recognized, either absolute or relative to the current opened document or current working directory (if the buffer is not yet saved). Defaults to `true`. | `true` |
| `word-completion` | Enable completion of words from open buffers. | `true` |
| `word-completion-trigger-length` | Minimum number of characters required to automatically trigger word completion. | `7` |
| `auto-format` | Enable automatic formatting on save | `true` |
| `idle-timeout` | Time in milliseconds since last keypress before idle timers trigger. | `250` |
| `completion-timeout` | Time in milliseconds after typing a word character before completions are shown; set to 5 for instant. | `250` |
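These keys map onto the completion fields this commit adds to the editor's `Config` struct (see the helix-view changes below). A pared-down sketch of the resulting defaults, for illustration only; the struct here is hypothetical, while the field names and default values come from this diff:

```rust
use std::num::NonZeroU8;

// Illustrative subset of the completion-related editor config.
struct CompletionConfig {
    auto_completion: bool,
    path_completion: bool,
    word_completion: bool,
    // Words shorter than this never trigger automatic word completion.
    word_completion_trigger_length: NonZeroU8,
}

impl Default for CompletionConfig {
    fn default() -> Self {
        Self {
            auto_completion: true,
            path_completion: true,
            word_completion: true,
            word_completion_trigger_length: NonZeroU8::new(7).unwrap(),
        }
    }
}

fn main() {
    let cfg = CompletionConfig::default();
    assert!(cfg.auto_completion && cfg.path_completion && cfg.word_completion);
    assert_eq!(cfg.word_completion_trigger_length.get(), 7);
}
```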
@@ -71,6 +71,7 @@ These configuration keys are available:
| `text-width` | Maximum line length. Used for the `:reflow` command and soft-wrapping if `soft-wrap.wrap-at-text-width` is set; defaults to `editor.text-width` |
| `rulers` | Overrides the `editor.rulers` config key for the language. |
| `path-completion` | Overrides the `editor.path-completion` config key for the language. |
| `word-completion` | Overrides the `editor.word-completion` config key for the language. |
| `workspace-lsp-roots` | Directories relative to the workspace root that are treated as LSP roots. Should only be set in `.helix/config.toml`. Overwrites the setting of the same name in `config.toml` if set. |
| `persistent-diagnostic-sources` | An array of LSP diagnostic sources assumed unchanged when the language server resends the same set of diagnostics. Helix can track the position for these diagnostics internally instead. Useful for diagnostics that are recomputed on save. |
@@ -16,6 +16,7 @@ pub struct CompletionItem {
pub enum CompletionProvider {
    Lsp(LanguageServerId),
    Path,
    Word,
}

impl From<LanguageServerId> for CompletionProvider {
@@ -24,6 +24,7 @@ use std::{
    fmt::{self, Display, Write},
    hash::{Hash, Hasher},
    mem::replace,
    num::NonZeroU8,
    path::{Path, PathBuf},
    str::FromStr,
    sync::Arc,
@@ -127,6 +128,10 @@ pub struct LanguageConfiguration {
    /// If set, overrides `editor.path-completion`.
    pub path_completion: Option<bool>,
    /// If set, overrides `editor.word-completion`.
    pub word_completion: Option<bool>,
    /// If set, overrides `editor.word-completion-trigger-length`.
    pub word_completion_trigger_length: Option<NonZeroU8>,

    #[serde(default)]
    pub diagnostic_severity: Severity,
@@ -19,6 +19,17 @@ pub enum Operation {
    Insert(Tendril),
}

impl Operation {
    /// The number of characters affected by the operation.
    #[allow(clippy::len_without_is_empty)]
    pub fn len(&self) -> usize {
        match self {
            Self::Retain(n) | Self::Delete(n) => *n,
            Self::Insert(s) => s.chars().count(),
        }
    }
}

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Assoc {
    Before,
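A quick sanity check of the `len` semantics added above, as a self-contained sketch that uses a plain `String` in place of `Tendril`. The point to notice is that insert length is counted in chars, not bytes, to match rope positions:

```rust
enum Operation {
    Retain(usize),
    Delete(usize),
    Insert(String), // `Tendril` in the real code
}

impl Operation {
    /// Number of characters the operation covers.
    fn len(&self) -> usize {
        match self {
            Self::Retain(n) | Self::Delete(n) => *n,
            Self::Insert(s) => s.chars().count(),
        }
    }
}

fn main() {
    assert_eq!(Operation::Retain(5).len(), 5);
    // "héllo" is 6 bytes but 5 chars; `len` reports 5.
    assert_eq!(Operation::Insert("héllo".to_string()).len(), 5);
}
```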
@@ -5,5 +5,4 @@ pub mod range;
pub mod rope;
pub mod str;
pub mod time;

pub use range::Range;
@@ -16,3 +16,291 @@ macro_rules! concat {
        buf
    }}
}

// Utilities for working with strings and specialized string types.

use std::{
    alloc,
    borrow::{Borrow, Cow},
    fmt, hash,
    mem::{size_of, ManuallyDrop},
    ptr::{self, NonNull},
    slice, str,
};

/// A very very small owned string type.
///
/// This type is like a `Box<str>` and is similarly two `usize`s large. It can only fit strings
/// with a byte length smaller than 256. On 64-bit machines this type stores up to 15 bytes inline
/// (7 bytes on 32-bit machines). One byte is used to store the length. For strings short enough
/// to be stored inline, the remaining 15 (or 7) bytes store the content inline. Otherwise the
/// second `usize` of memory is a thin pointer to the string content.
///
/// Unlike `Box<str>` this type is not null-pointer optimized.
#[repr(C)]
pub struct TinyBoxedStr {
    len: u8,
    prefix: [u8; Self::PREFIX_LEN],
    trailing: TinyBoxedStrTrailing,
}

#[repr(C)]
union TinyBoxedStrTrailing {
    suffix: [u8; TinyBoxedStr::SUFFIX_LEN],
    ptr: ManuallyDrop<NonNull<u8>>,
}

impl TinyBoxedStr {
    // 1 usize minus the byte to store the length.
    const PREFIX_LEN: usize = size_of::<usize>() - size_of::<u8>();
    // The other `usize` is a pointer or the end parts of an inline string.
    const SUFFIX_LEN: usize = size_of::<usize>();
    // ... for a grand total of 15 bytes for 64-bit machines or 7 for 32-bit.
    const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8;

    pub const MAX_LEN: usize = u8::MAX as usize;

    #[inline]
    pub fn len(&self) -> usize {
        self.len as usize
    }

    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    pub fn as_bytes(&self) -> &[u8] {
        let ptr = if self.len <= Self::INLINE_LEN {
            let ptr = ptr::from_ref(self);
            unsafe { ptr::addr_of!((*ptr).prefix) }.cast()
        } else {
            unsafe { self.trailing.ptr }.as_ptr()
        };
        unsafe { slice::from_raw_parts(ptr, self.len()) }
    }

    #[inline]
    pub fn as_str(&self) -> &str {
        unsafe { str::from_utf8_unchecked(self.as_bytes()) }
    }

    /// Exposes the bytes as a mutable slice.
    ///
    /// When a string is short enough to be inline, this slice points to the `prefix` and `suffix`
    /// parts of the struct. Otherwise the slice wraps the pointer to the allocation.
    ///
    /// SAFETY: As such, if the string is allocated then it is the caller's responsibility to
    /// ensure that any modifications made to `&s.as_bytes_mut()[..Self::PREFIX_LEN]` are written
    /// to `s.prefix` as well.
    ///
    /// SAFETY: It is also the caller's responsibility to ensure that edits to the bytes do not
    /// make the bytes invalid UTF-8.
    unsafe fn as_bytes_mut(&mut self) -> &mut [u8] {
        let ptr = if self.len <= Self::INLINE_LEN {
            let ptr = ptr::from_mut(self);
            unsafe { ptr::addr_of_mut!((*ptr).prefix) }.cast()
        } else {
            unsafe { self.trailing.ptr }.as_ptr()
        };
        unsafe { slice::from_raw_parts_mut(ptr, self.len()) }
    }

    fn layout(len: u8) -> alloc::Layout {
        alloc::Layout::array::<u8>(len as usize)
            .expect("a valid layout for an array")
            .pad_to_align()
    }

    /// Creates a new `TinyBoxedStr` of the given length with all bytes zeroed.
    ///
    /// While this is used to create uninitialized strings which are later filled, note that the
    /// zero byte is valid UTF-8 so the zeroed representation is always valid.
    fn zeroed(len: u8) -> Self {
        let trailing = if len <= Self::INLINE_LEN {
            TinyBoxedStrTrailing {
                suffix: [0; Self::SUFFIX_LEN],
            }
        } else {
            let layout = Self::layout(len);
            let nullable = unsafe { alloc::alloc_zeroed(layout) };
            let Some(ptr) = NonNull::new(nullable) else {
                alloc::handle_alloc_error(layout);
            };
            TinyBoxedStrTrailing {
                ptr: ManuallyDrop::new(ptr),
            }
        };
        Self {
            len,
            prefix: [0; Self::PREFIX_LEN],
            trailing,
        }
    }
}

#[derive(Debug)]
pub struct TooLongError;

impl fmt::Display for TooLongError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("string was too long to be stored as a `TinyBoxedStr` (max 255 bytes)")
    }
}

impl std::error::Error for TooLongError {}

impl TryFrom<&str> for TinyBoxedStr {
    type Error = TooLongError;

    fn try_from(s: &str) -> Result<Self, Self::Error> {
        if s.len() > Self::MAX_LEN {
            return Err(TooLongError);
        }

        let mut this = Self::zeroed(s.len() as u8);
        // SAFETY: if `s` is valid UTF-8, `this`'s bytes will be valid UTF-8.
        unsafe { this.as_bytes_mut() }.copy_from_slice(s.as_bytes());
        if this.len > Self::INLINE_LEN {
            this.prefix
                .copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]);
        }
        Ok(this)
    }
}

// NOTE: converting from a `String` to a `TinyBoxedStr` is cheap when the string's length is equal
// to its capacity.
impl TryFrom<String> for TinyBoxedStr {
    type Error = TooLongError;

    fn try_from(s: String) -> Result<Self, Self::Error> {
        // Inline strings must be cloned. It's a constant number of bytes to copy though.
        if s.len() <= Self::INLINE_LEN as usize {
            return s.as_str().try_into();
        }

        // Otherwise we can sometimes steal the `String`'s allocation if the string is allocated
        // exactly (i.e. `s.len() == s.capacity()`). A `Box<str>` is defined as being allocated
        // exactly so we first convert to `Box<str>` (which will reallocate if the capacity is not
        // the same as the length) and then steal its pointer.

        if s.len() > Self::MAX_LEN {
            return Err(TooLongError);
        }

        let len = s.len() as u8;
        let mut prefix = [0; Self::PREFIX_LEN];
        prefix.copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]);
        let ptr = Box::into_raw(s.into_boxed_str()).cast::<u8>();
        // SAFETY: `Box::into_raw` docs guarantee non-null.
        let ptr = ManuallyDrop::new(unsafe { NonNull::new_unchecked(ptr) });
        let trailing = TinyBoxedStrTrailing { ptr };

        Ok(Self {
            len,
            prefix,
            trailing,
        })
    }
}

impl TryFrom<Cow<'_, str>> for TinyBoxedStr {
    type Error = TooLongError;

    fn try_from(s: Cow<'_, str>) -> Result<Self, Self::Error> {
        match s {
            Cow::Borrowed(s) => s.try_into(),
            Cow::Owned(s) => s.try_into(),
        }
    }
}

impl TryFrom<ropey::RopeSlice<'_>> for TinyBoxedStr {
    type Error = TooLongError;

    fn try_from(slice: ropey::RopeSlice<'_>) -> Result<Self, Self::Error> {
        // `impl From<RopeSlice> for String` uses `String::with_capacity` so we can reuse its
        // allocation whenever it allocates `slice.len_bytes()`.
        let s: Cow<str> = slice.into();
        s.try_into()
    }
}

impl Drop for TinyBoxedStr {
    fn drop(&mut self) {
        if self.len > Self::INLINE_LEN {
            let ptr = unsafe { self.trailing.ptr }.as_ptr();
            let layout = Self::layout(self.len);
            unsafe { alloc::dealloc(ptr, layout) }
        }
    }
}

impl Clone for TinyBoxedStr {
    fn clone(&self) -> Self {
        let mut this = Self::zeroed(self.len);
        // SAFETY: if `self` is valid UTF-8 then `this` will be too.
        unsafe { this.as_bytes_mut() }.copy_from_slice(self.as_bytes());
        if this.len > Self::INLINE_LEN {
            this.prefix
                .copy_from_slice(&self.as_bytes()[..Self::PREFIX_LEN]);
        }
        this
    }
}

impl Default for TinyBoxedStr {
    fn default() -> Self {
        Self::zeroed(0)
    }
}

impl AsRef<str> for TinyBoxedStr {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

impl Borrow<str> for TinyBoxedStr {
    fn borrow(&self) -> &str {
        self.as_str()
    }
}

// NOTE: this could be specialized to optimize the number of comparison operations. We could cast
// the first `usize` of memory together to do a single comparison (and same for the suffixes).
// This optimization would only matter if we compared these strings very frequently however.
impl PartialEq for TinyBoxedStr {
    fn eq(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl Eq for TinyBoxedStr {}

impl PartialEq<str> for TinyBoxedStr {
    fn eq(&self, other: &str) -> bool {
        self.as_str() == other
    }
}

impl hash::Hash for TinyBoxedStr {
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        self.as_str().hash(state)
    }
}

impl fmt::Debug for TinyBoxedStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.as_str().fmt(f)
    }
}

impl fmt::Display for TinyBoxedStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.as_str().fmt(f)
    }
}

unsafe impl Send for TinyBoxedStr {}
unsafe impl Sync for TinyBoxedStr {}
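A brief usage sketch of the type above, for illustration only; it assumes `helix_stdx::str::{TinyBoxedStr, TooLongError}` are in scope and the demo function itself is hypothetical:

```rust
fn tiny_str_demo() -> Result<(), TooLongError> {
    // 5 bytes: stored inline in the struct itself, no allocation.
    let short = TinyBoxedStr::try_from("hello")?;
    assert_eq!(short.as_str(), "hello");

    // 64 bytes: longer than the 15 inline bytes on 64-bit, so heap-allocated.
    let long = TinyBoxedStr::try_from("a".repeat(64))?;
    assert_eq!(long.len(), 64);

    // Anything longer than 255 bytes is rejected with `TooLongError`.
    assert!(TinyBoxedStr::try_from("x".repeat(300)).is_err());
    Ok(())
}
```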
@@ -9,7 +9,7 @@ use crate::handlers::auto_save::AutoSaveHandler;
use crate::handlers::diagnostics::PullDiagnosticsHandler;
use crate::handlers::signature_help::SignatureHelpHandler;

-pub use helix_view::handlers::Handlers;
+pub use helix_view::handlers::{word_index, Handlers};

use self::blame::BlameHandler;
use self::document_colors::DocumentColorsHandler;
@@ -31,6 +31,7 @@ pub fn setup(config: Arc<ArcSwap<Config>>) -> Handlers {
    let document_colors = DocumentColorsHandler::default().spawn();
    let blame = BlameHandler::default().spawn();
    let pull_diagnostics = PullDiagnosticsHandler::new().spawn();
    let word_index = word_index::Handler::spawn();

    let handlers = Handlers {
        completions: helix_view::handlers::completion::CompletionHandler::new(event_tx),
@@ -39,6 +40,7 @@ pub fn setup(config: Arc<ArcSwap<Config>>) -> Handlers {
        document_colors,
        blame,
        pull_diagnostics,
        word_index,
    };

    helix_view::handlers::register_hooks(&handlers);
@@ -30,6 +30,7 @@ mod item;
mod path;
mod request;
mod resolve;
mod word;

async fn handle_response(
    requests: &mut JoinSet<CompletionResponse>,
@@ -82,7 +83,7 @@ async fn replace_completions(
fn show_completion(
    editor: &mut Editor,
    compositor: &mut Compositor,
-   items: Vec<CompletionItem>,
+   mut items: Vec<CompletionItem>,
    context: HashMap<CompletionProvider, ResponseContext>,
    trigger: Trigger,
) {
@@ -101,6 +102,7 @@ fn show_completion(
    if ui.completion.is_some() {
        return;
    }
    word::retain_valid_completions(trigger, doc, view.id, &mut items);
    editor.handlers.completions.active_completions = context;

    let completion_area = ui.set_completion(editor, items, trigger.pos, size);
@@ -28,6 +28,8 @@ use crate::job::{dispatch, dispatch_blocking};
use crate::ui;
use crate::ui::editor::InsertEvent;

use super::word;

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum TriggerKind {
    Auto,
@@ -242,10 +244,15 @@ fn request_completions(
        doc.selection(view.id).clone(),
        doc,
        handle.clone(),
-       savepoint,
+       savepoint.clone(),
    ) {
        requests.spawn_blocking(path_completion_request);
    }
    if let Some(word_completion_request) =
        word::completion(editor, trigger, handle.clone(), savepoint)
    {
        requests.spawn_blocking(word_completion_request);
    }

    let ui = compositor.find::<ui::EditorView>().unwrap();
    ui.last_insert.1.push(InsertEvent::RequestCompletion);
helix-term/src/handlers/completion/word.rs (new file, 131 lines)
@@ -0,0 +1,131 @@
use std::{borrow::Cow, sync::Arc};

use helix_core::{
    self as core, chars::char_is_word, completion::CompletionProvider, movement, Transaction,
};
use helix_event::TaskHandle;
use helix_stdx::rope::RopeSliceExt as _;
use helix_view::{
    document::SavePoint, handlers::completion::ResponseContext, Document, Editor, ViewId,
};

use super::{request::TriggerKind, CompletionItem, CompletionItems, CompletionResponse, Trigger};

const COMPLETION_KIND: &str = "word";

pub(super) fn completion(
    editor: &Editor,
    trigger: Trigger,
    handle: TaskHandle,
    savepoint: Arc<SavePoint>,
) -> Option<impl FnOnce() -> CompletionResponse> {
    let config = editor.config();
    if !config.word_completion {
        return None;
    }
    let trigger_length = doc!(editor)
        .language_config()
        .and_then(|config| config.word_completion_trigger_length)
        .unwrap_or(config.word_completion_trigger_length)
        .get() as usize;
    let (view, doc) = current_ref!(editor);
    let rope = doc.text().clone();
    let word_index = editor.handlers.word_index().clone();
    let text = doc.text().slice(..);
    let selection = doc.selection(view.id).clone();
    let pos = selection.primary().cursor(text);

    let cursor = movement::move_prev_word_start(text, core::Range::point(pos), 1);
    if cursor.head == pos {
        return None;
    }
    if trigger.kind != TriggerKind::Manual
        && text
            .slice(cursor.head..)
            .graphemes()
            .take(trigger_length)
            .take_while(|g| g.chars().all(char_is_word))
            .count()
            != trigger_length
    {
        return None;
    }

    let typed_word_range = cursor.head..pos;
    let typed_word = text.slice(typed_word_range.clone());
    let edit_diff = if typed_word
        .char(typed_word.len_chars().saturating_sub(1))
        .is_whitespace()
    {
        0
    } else {
        typed_word.len_chars()
    };

    if handle.is_canceled() {
        return None;
    }

    let future = move || {
        let text = rope.slice(..);
        let typed_word: Cow<_> = text.slice(typed_word_range).into();
        let items = word_index
            .matches(&typed_word)
            .into_iter()
            .filter(|word| word.as_str() != typed_word.as_ref())
            .map(|word| {
                let transaction = Transaction::change_by_selection(&rope, &selection, |range| {
                    let cursor = range.cursor(text);
                    (cursor - edit_diff, cursor, Some((&word).into()))
                });
                CompletionItem::Other(core::CompletionItem {
                    transaction,
                    label: word.into(),
                    kind: Cow::Borrowed(COMPLETION_KIND),
                    documentation: None,
                    provider: CompletionProvider::Word,
                })
            })
            .collect();

        CompletionResponse {
            items: CompletionItems::Other(items),
            provider: CompletionProvider::Word,
            context: ResponseContext {
                is_incomplete: false,
                priority: 0,
                savepoint,
            },
        }
    };

    Some(future)
}

pub(super) fn retain_valid_completions(
    trigger: Trigger,
    doc: &Document,
    view_id: ViewId,
    items: &mut Vec<CompletionItem>,
) {
    if trigger.kind == TriggerKind::Manual {
        return;
    }

    let text = doc.text().slice(..);
    let cursor = doc.selection(view_id).primary().cursor(text);
    if text
        .get_char(cursor.saturating_sub(1))
        .is_some_and(|ch| ch.is_whitespace())
    {
        items.retain(|item| {
            !matches!(
                item,
                CompletionItem::Other(core::CompletionItem {
                    provider: CompletionProvider::Word,
                    ..
                })
            )
        });
    }
}
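The grapheme-counting guard in `completion` above is the heart of the trigger-length setting. A simplified, self-contained model of it (this hypothetical sketch counts `char`s in a `&str`; the real code counts grapheme clusters on a rope slice):

```rust
// Automatic word completion fires only once the word being typed has at
// least `trigger_length` word characters.
fn should_trigger(current_word: &str, trigger_length: usize) -> bool {
    current_word
        .chars()
        .take(trigger_length)
        .take_while(|c| c.is_alphanumeric() || *c == '_')
        .count()
        == trigger_length
}

fn main() {
    assert!(!should_trigger("combin", 7)); // six chars: below the default of 7
    assert!(should_trigger("combina", 7)); // seven word chars: triggers
}
```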
@@ -1878,6 +1878,12 @@ impl Document {
        self.version
    }

    pub fn word_completion_enabled(&self) -> bool {
        self.language_config()
            .and_then(|lang_config| lang_config.word_completion)
            .unwrap_or_else(|| self.config.load().word_completion)
    }

    pub fn path_completion_enabled(&self) -> bool {
        self.language_config()
            .and_then(|lang_config| lang_config.path_completion)
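The pattern in `word_completion_enabled` resolves the per-language override against the editor-wide default. The same precedence in isolation (illustrative helper, not part of the diff):

```rust
// A language-specific Option<bool> wins over the global editor setting.
fn resolve(language_override: Option<bool>, editor_default: bool) -> bool {
    language_override.unwrap_or(editor_default)
}

fn main() {
    assert!(resolve(None, true));         // no override: editor config applies
    assert!(!resolve(Some(false), true)); // language config disables the feature
}
```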
@@ -29,7 +29,7 @@ use std::{
    collections::{BTreeMap, HashMap, HashSet},
    fs,
    io::{self, stdin},
-   num::NonZeroUsize,
+   num::{NonZeroU8, NonZeroUsize},
    path::{Path, PathBuf},
    pin::Pin,
    sync::Arc,
@@ -321,6 +321,11 @@
    /// either absolute or relative to the current opened document or current working directory (if the buffer is not yet saved).
    /// Defaults to true.
    pub path_completion: bool,
    /// Enable completion of words from open buffers. Defaults to true.
    pub word_completion: bool,
    /// Minimum number of characters required to automatically trigger word completion, if
    /// enabled. Defaults to `7`.
    pub word_completion_trigger_length: NonZeroU8,
    /// Automatic formatting on save. Defaults to true.
    pub auto_format: bool,
    /// Default register used for yank/paste. Defaults to '"'
@@ -1028,6 +1033,8 @@ impl Default for Config {
            auto_pairs: AutoPairConfig::default(),
            auto_completion: true,
            path_completion: true,
            word_completion: true,
            word_completion_trigger_length: unsafe { NonZeroU8::new_unchecked(7) },
            auto_format: true,
            default_yank_register: '"',
            auto_save: AutoSave::default(),
@@ -9,6 +9,7 @@ pub mod completion;
pub mod dap;
pub mod diagnostics;
pub mod lsp;
pub mod word_index;

#[derive(Debug)]
pub enum AutoSaveEvent {
@@ -35,6 +36,7 @@ pub struct Handlers {
    pub document_colors: Sender<lsp::DocumentColorsEvent>,
    pub blame: Sender<BlameEvent>,
    pub pull_diagnostics: Sender<lsp::PullDiagnosticsEvent>,
    pub word_index: word_index::Handler,
}

impl Handlers {
@@ -59,8 +61,13 @@ impl Handlers {
        };
        send_blocking(&self.signature_hints, event)
    }

    pub fn word_index(&self) -> &word_index::WordIndex {
        &self.word_index.index
    }
}

pub fn register_hooks(handlers: &Handlers) {
    lsp::register_hooks(handlers);
    word_index::register_hooks(handlers);
}
helix-view/src/handlers/word_index.rs (new file, 484 lines)
@@ -0,0 +1,484 @@
//! Indexing of words from open buffers.
//!
//! This provides an eventually consistent set of words used in any open buffers. This set is
//! later used for lexical completion.

use std::{borrow::Cow, collections::HashMap, iter, mem, sync::Arc, time::Duration};

use helix_core::{
    chars::char_is_word, fuzzy::fuzzy_match, movement, ChangeSet, Range, Rope, RopeSlice,
};
use helix_event::{register_hook, AsyncHook};
use helix_stdx::rope::RopeSliceExt as _;
use parking_lot::RwLock;
use tokio::{sync::mpsc, time::Instant};

use crate::{
    events::{DocumentDidChange, DocumentDidClose, DocumentDidOpen},
    DocumentId,
};

use super::Handlers;

#[derive(Debug)]
struct Change {
    old_text: Rope,
    text: Rope,
    changes: ChangeSet,
}

#[derive(Debug)]
enum Event {
    Insert(Rope),
    Update(DocumentId, Change),
    Delete(DocumentId, Rope),
}

#[derive(Debug)]
pub struct Handler {
    pub(super) index: WordIndex,
    /// A sender into an async hook which debounces updates to the index.
    hook: mpsc::Sender<Event>,
    /// A sender to a tokio task which coordinates the indexing of documents.
    ///
    /// See [WordIndex::run]. A supervisor-like task is in charge of spawning tasks to update the
    /// index. This ensures that consecutive edits to a document trigger the correct order of
    /// insertions and deletions into the word set.
    coordinator: mpsc::UnboundedSender<Event>,
}

impl Handler {
    pub fn spawn() -> Self {
        let index = WordIndex::default();
        let (tx, rx) = mpsc::unbounded_channel();
        tokio::spawn(index.clone().run(rx));
        Self {
            hook: Hook {
                changes: HashMap::default(),
                coordinator: tx.clone(),
            }
            .spawn(),
            index,
            coordinator: tx,
        }
    }
}

#[derive(Debug)]
struct Hook {
    changes: HashMap<DocumentId, Change>,
    coordinator: mpsc::UnboundedSender<Event>,
}

const DEBOUNCE: Duration = Duration::from_secs(1);

impl AsyncHook for Hook {
    type Event = Event;

    fn handle_event(&mut self, event: Self::Event, timeout: Option<Instant>) -> Option<Instant> {
        match event {
            Event::Insert(_) => unreachable!("inserts are sent to the worker directly"),
            Event::Update(doc, change) => {
                if let Some(pending_change) = self.changes.get_mut(&doc) {
                    // If there is already a change waiting for this document, merge the two
                    // changes together by composing the changesets and saving the new `text`.
                    pending_change.changes =
                        mem::take(&mut pending_change.changes).compose(change.changes);
                    pending_change.text = change.text;
                    Some(Instant::now() + DEBOUNCE)
                } else if !is_changeset_significant(&change.changes) {
                    // If the changeset is fairly small, debounce before updating the index.
                    self.changes.insert(doc, change);
                    Some(Instant::now() + DEBOUNCE)
                } else {
                    // Otherwise, for a significant change, queue the update to the index
                    // immediately.
                    self.coordinator.send(Event::Update(doc, change)).unwrap();
                    timeout
                }
            }
            Event::Delete(doc, text) => {
                // If there are pending changes that haven't been indexed since the last debounce,
                // forget them and delete the old text.
                if let Some(change) = self.changes.remove(&doc) {
                    self.coordinator
                        .send(Event::Delete(doc, change.old_text))
                        .unwrap();
                } else {
                    self.coordinator.send(Event::Delete(doc, text)).unwrap();
                }
                timeout
            }
        }
    }

    fn finish_debounce(&mut self) {
        for (doc, change) in self.changes.drain() {
            self.coordinator.send(Event::Update(doc, change)).unwrap();
        }
    }
}

/// Minimum number of grapheme clusters required to include a word in the index.
const MIN_WORD_GRAPHEMES: usize = 3;
/// Maximum word length allowed (in chars).
const MAX_WORD_LEN: usize = 50;

type Word = helix_stdx::str::TinyBoxedStr;

#[derive(Debug, Default)]
struct WordIndexInner {
    /// Reference counted storage for words.
    ///
    /// Words are very likely to be reused many times. Instead of storing duplicates we keep a
    /// reference count of times a word is used. When the reference count drops to zero the word
    /// is removed from the index.
    words: HashMap<Word, u32>,
}

impl WordIndexInner {
    fn words(&self) -> impl Iterator<Item = &Word> {
        self.words.keys()
    }

    fn insert(&mut self, word: RopeSlice) {
        assert!(word.len_chars() <= MAX_WORD_LEN);
        // The word must be shorter than `TinyBoxedStr::MAX_LEN` because it is fewer than 50
        // characters and characters take at most four bytes.
        assert!(word.len_bytes() < Word::MAX_LEN);

        let word: Cow<str> = word.into();
        if let Some(rc) = self.words.get_mut(word.as_ref()) {
            *rc = rc.saturating_add(1);
        } else {
            self.words.insert(word.try_into().unwrap(), 1);
        }
    }

    fn remove(&mut self, word: RopeSlice) {
        let word: Cow<str> = word.into();
        match self.words.get_mut(word.as_ref()) {
            Some(1) => {
                self.words.remove(word.as_ref());
            }
            Some(n) => *n -= 1,
            None => (),
        }
    }
}
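// Illustration (editor's addition, not part of this commit): the
// reference-counting contract of `WordIndexInner` above, on a plain
// `HashMap<String, u32>`. A word leaves the set only when its last
// occurrence is removed:
//
//     let mut words: HashMap<String, u32> = HashMap::new();
//     *words.entry("foo".into()).or_insert(0) += 1; // rc = 1 (first sighting)
//     *words.entry("foo".into()).or_insert(0) += 1; // rc = 2 (duplicate)
//     // remove once: rc drops to 1, "foo" stays indexed
//     // remove again: rc would hit 0, so the entry is deleted entirely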
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    inner: Arc<RwLock<WordIndexInner>>,
}

impl WordIndex {
    pub fn matches(&self, pattern: &str) -> Vec<String> {
        let inner = self.inner.read();
        let mut matches = fuzzy_match(pattern, inner.words(), false);
        matches.sort_unstable_by_key(|(_, score)| *score);
        matches
            .into_iter()
            .map(|(word, _)| word.to_string())
            .collect()
    }

    fn add_document(&self, text: &Rope) {
        let words: Vec<_> = words(text.slice(..)).collect();
        let mut inner = self.inner.write();
        for word in words {
            inner.insert(word);
        }
    }

    fn update_document(&self, old_text: &Rope, text: &Rope, changes: &ChangeSet) {
        let mut inserted = Vec::new();
        let mut removed = Vec::new();
        for (old_window, new_window) in changed_windows(old_text.slice(..), text.slice(..), changes)
        {
            inserted.extend(words(new_window));
            removed.extend(words(old_window));
        }

        let mut inner = self.inner.write();
        for word in inserted {
            inner.insert(word);
        }
        for word in removed {
            inner.remove(word);
        }
    }

    fn remove_document(&self, text: &Rope) {
        let words: Vec<_> = words(text.slice(..)).collect();
        let mut inner = self.inner.write();
        for word in words {
            inner.remove(word);
        }
    }

    /// Coordinate the indexing of documents.
    ///
    /// This task wraps an MPSC queue and spawns blocking tasks which update the index. Updates
    /// are applied one-by-one to ensure that changes to the index are **serialized**:
    /// updates to each document must be applied in-order.
    async fn run(self, mut events: mpsc::UnboundedReceiver<Event>) {
        while let Some(event) = events.recv().await {
            let this = self.clone();
            tokio::task::spawn_blocking(move || match event {
                Event::Insert(text) => {
                    this.add_document(&text);
                }
                Event::Update(
                    _doc,
                    Change {
                        old_text,
                        text,
                        changes,
                        ..
                    },
                ) => {
                    this.update_document(&old_text, &text, &changes);
                }
                Event::Delete(_doc, text) => {
                    this.remove_document(&text);
                }
            })
            .await
            .unwrap();
        }
    }
}
fn words(text: RopeSlice) -> impl Iterator<Item = RopeSlice> {
    let mut cursor = Range::point(0);
    if text
        .get_char(cursor.anchor)
        .is_some_and(|ch| !ch.is_whitespace())
    {
        let cursor_word_end = movement::move_next_word_end(text, cursor, 1);
        if cursor_word_end.anchor == 0 {
            cursor = cursor_word_end;
        }
    }

    iter::from_fn(move || {
        while cursor.head <= text.len_chars() {
            let mut word = None;
            if text
                .slice(..cursor.head)
                .graphemes_rev()
                .take(MIN_WORD_GRAPHEMES)
                .take_while(|g| g.chars().all(char_is_word))
                .count()
                == MIN_WORD_GRAPHEMES
            {
                cursor.anchor += text
                    .chars_at(cursor.anchor)
                    .take_while(|&c| !char_is_word(c))
                    .count();
                let slice = cursor.slice(text);
                if slice.len_chars() <= MAX_WORD_LEN {
                    word = Some(slice);
                }
            }
            let head = cursor.head;
            cursor = movement::move_next_word_end(text, cursor, 1);
            if cursor.head == head {
                cursor.head = usize::MAX;
            }
            if word.is_some() {
                return word;
            }
        }
        None
    })
}

/// Finds areas of the old and new texts around each operation in `changes`.
///
/// The window is larger than the changed area and can encompass multiple insert/delete operations
/// if they are grouped closely together.
///
/// The ranges of the old and new text will usually be of different sizes. For example a
/// deletion of "foo" surrounded by large retain sections would give a longer window into the
/// `old_text` and a shorter window into `new_text`, and vice-versa for an insertion. A full
/// replacement of a word, though, would give two slices of the same size.
fn changed_windows<'a>(
    old_text: RopeSlice<'a>,
    new_text: RopeSlice<'a>,
    changes: &'a ChangeSet,
) -> impl Iterator<Item = (RopeSlice<'a>, RopeSlice<'a>)> {
    use helix_core::Operation::*;

    let mut operations = changes.changes().iter().peekable();
    let mut old_pos = 0;
    let mut new_pos = 0;
    iter::from_fn(move || loop {
        let operation = operations.next()?;
        let old_start = old_pos;
        let new_start = new_pos;
        let len = operation.len();
        match operation {
            Retain(_) => {
                old_pos += len;
                new_pos += len;
                continue;
            }
            Insert(_) => new_pos += len,
            Delete(_) => old_pos += len,
        }

        // Scan ahead until a `Retain` is found which would end a window.
        while let Some(o) = operations.next_if(|op| !matches!(op, Retain(n) if *n > MAX_WORD_LEN)) {
            let len = o.len();
            match o {
                Retain(_) => {
                    old_pos += len;
                    new_pos += len;
                }
                Delete(_) => old_pos += len,
                Insert(_) => new_pos += len,
            }
        }

        let old_window = old_start.saturating_sub(MAX_WORD_LEN)
            ..(old_pos + MAX_WORD_LEN).min(old_text.len_chars());
        let new_window = new_start.saturating_sub(MAX_WORD_LEN)
            ..(new_pos + MAX_WORD_LEN).min(new_text.len_chars());

        return Some((old_text.slice(old_window), new_text.slice(new_window)));
    })
}
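// Worked example (editor's addition, not part of this commit), assuming
// MAX_WORD_LEN == 50: deleting "foo" at old positions 10..13 between long
// retains yields
//
//     old window: 10.saturating_sub(50) .. (13 + 50).min(old_len) == 0..63
//     new window: 10.saturating_sub(50) .. (10 + 50).min(new_len) == 0..60
//
// The old slice is three chars longer, exactly the deleted "foo", matching
// the doc comment on `changed_windows`.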
/// Estimates whether a changeset is significant or small.
fn is_changeset_significant(changes: &ChangeSet) -> bool {
    use helix_core::Operation::*;

    let mut diff = 0;
    for operation in changes.changes() {
        match operation {
            Retain(_) => continue,
            Delete(_) | Insert(_) => diff += operation.len(),
        }
    }

    // This is arbitrary and could be tuned further:
    diff > 1_000
}

pub(crate) fn register_hooks(handlers: &Handlers) {
    let coordinator = handlers.word_index.coordinator.clone();
    register_hook!(move |event: &mut DocumentDidOpen<'_>| {
        let doc = doc!(event.editor, &event.doc);
        if doc.word_completion_enabled() {
            coordinator.send(Event::Insert(doc.text().clone())).unwrap();
        }
        Ok(())
    });

    let tx = handlers.word_index.hook.clone();
    register_hook!(move |event: &mut DocumentDidChange<'_>| {
        if !event.ghost_transaction && event.doc.word_completion_enabled() {
            helix_event::send_blocking(
                &tx,
                Event::Update(
                    event.doc.id(),
                    Change {
                        old_text: event.old_text.clone(),
                        text: event.doc.text().clone(),
                        changes: event.changes.clone(),
                    },
                ),
            );
        }
        Ok(())
    });

    let tx = handlers.word_index.hook.clone();
    register_hook!(move |event: &mut DocumentDidClose<'_>| {
        if event.doc.word_completion_enabled() {
            helix_event::send_blocking(
                &tx,
                Event::Delete(event.doc.id(), event.doc.text().clone()),
            );
        }
        Ok(())
    });
}

#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use helix_core::diff::compare_ropes;

    impl WordIndex {
        fn words(&self) -> HashSet<String> {
            let inner = self.inner.read();
            inner.words().map(|w| w.to_string()).collect()
        }
    }

    #[track_caller]
    fn assert_words<I: ToString, T: IntoIterator<Item = I>>(text: &str, expected: T) {
        let text = Rope::from_str(text);
        let index = WordIndex::default();
        index.add_document(&text);
        let actual = index.words();
        let expected: HashSet<_> = expected.into_iter().map(|i| i.to_string()).collect();
        assert_eq!(expected, actual);
    }

    #[test]
    fn parse() {
        assert_words("one two three", ["one", "two", "three"]);
        assert_words("a foo c", ["foo"]);
    }

    #[track_caller]
    fn assert_diff<S, R, I>(before: &str, after: &str, expect_removed: R, expect_inserted: I)
    where
        S: ToString,
        R: IntoIterator<Item = S>,
        I: IntoIterator<Item = S>,
    {
        let before = Rope::from_str(before);
        let after = Rope::from_str(after);
        let diff = compare_ropes(&before, &after);
        let expect_removed: HashSet<_> =
            expect_removed.into_iter().map(|i| i.to_string()).collect();
        let expect_inserted: HashSet<_> =
            expect_inserted.into_iter().map(|i| i.to_string()).collect();

        let index = WordIndex::default();
        index.add_document(&before);
        let words_before = index.words();
        index.update_document(&before, &after, diff.changes());
        let words_after = index.words();

        let actual_removed = words_before.difference(&words_after).cloned().collect();
        let actual_inserted = words_after.difference(&words_before).cloned().collect();

        eprintln!("\"{before}\" {words_before:?} => \"{after}\" {words_after:?}");
        assert_eq!(
            expect_removed, actual_removed,
            "expected {expect_removed:?} to be removed, instead {actual_removed:?} was"
        );
        assert_eq!(
            expect_inserted, actual_inserted,
            "expected {expect_inserted:?} to be inserted, instead {actual_inserted:?} was"
        );
    }

    #[test]
    fn diff() {
        assert_diff("one two three", "one five three", ["two"], ["five"]);
        assert_diff("one two three", "one to three", ["two"], []);
        assert_diff("one two three", "one three", ["two"], []);
        assert_diff("one two three", "one t{o three", ["two"], []);
        assert_diff("one foo three", "one fooo three", ["foo"], ["fooo"]);

        // TODO: further testing. Consider setting the max word size smaller in tests.
    }
}
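To tie the pieces together, a usage sketch of the index (written as if inside this module, since `add_document` is crate-internal; the demo function itself is hypothetical):

```rust
fn demo() {
    let index = WordIndex::default();
    // Index a small "buffer"; only words of at least three graphemes are kept.
    let text = Rope::from_str("configure configuration reconfigure");
    index.add_document(&text);
    // Fuzzy-match a typed prefix against every indexed word.
    let hits = index.matches("config");
    assert!(hits.iter().any(|w| w == "configure"));
}
```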