diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-16 04:38:32 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-16 04:38:32 +0200 |
| commit | 5e0cf1ede41b2887db98ca61c8100cbe1da61170 (patch) | |
| tree | d3d172643fb18d8e9b03af591125909babb9c0d8 | |
| parent | 409cec495ae619fa874e0e827ac620b881f84941 (diff) | |
Fix byte vs UTF-16 indexing in LSP position handling
Adds a utf16OffsetToByteOffset helper that converts LSP character
positions (UTF-16 code units) to Go string byte offsets. Fixes trigger
detection, the prefix heuristic, and completion text slicing for lines
containing multi-byte characters.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
| -rw-r--r-- | internal/lsp/handlers.go | 11 | ||||
| -rw-r--r-- | internal/lsp/handlers_completion.go | 14 | ||||
| -rw-r--r-- | internal/lsp/handlers_utils.go | 28 | ||||
| -rw-r--r-- | internal/lsp/utf16_offset_test.go | 59 |
4 files changed, 99 insertions, 13 deletions
diff --git a/internal/lsp/handlers.go b/internal/lsp/handlers.go index ebdefc1..ad2f98d 100644 --- a/internal/lsp/handlers.go +++ b/internal/lsp/handlers.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "strings" + "unicode/utf8" ) func (s *Server) handle(req Request) { @@ -274,16 +275,18 @@ func (s *Server) isTriggerEvent(p CompletionParams, current string) bool { } // For TriggerForIncomplete (3), require manual char check below } - // 2) Fallback: check the character immediately prior to cursor - idx := p.Position.Character - if idx <= 0 || idx > len(current) { + // 2) Fallback: check the character immediately prior to cursor. + // Convert UTF-16 offset to byte offset for correct multi-byte handling. + byteIdx := utf16OffsetToByteOffset(current, p.Position.Character) + if byteIdx <= 0 || byteIdx > len(current) { return false } // Bare double-open should not trigger via fallback char either (only when configured) if containsAny(current, doubleSeqs) && !hasDoubleOpenTrigger(current, open, openChar, closeChar) { return false } - ch := string(current[idx-1]) + r, _ := utf8.DecodeLastRuneInString(current[:byteIdx]) + ch := string(r) for _, c := range triggerChars { if c == ch { return true diff --git a/internal/lsp/handlers_completion.go b/internal/lsp/handlers_completion.go index aca830b..8ef67ab 100644 --- a/internal/lsp/handlers_completion.go +++ b/internal/lsp/handlers_completion.go @@ -318,7 +318,8 @@ func (s *Server) executeChatCompletion(ctx context.Context, plan completionPlan, _ = stats.Update(ctx, client.Name(), modelUsed, sentSize, len(text)) s.logLLMStats(modelUsed) trimmed := strings.TrimSpace(text) - cleaned := s.postProcessCompletion(trimmed, plan.current[:plan.params.Position.Character], plan.current) + cursorByte := utf16OffsetToByteOffset(plan.current, plan.params.Position.Character) + cleaned := s.postProcessCompletion(trimmed, plan.current[:cursorByte], plan.current) if cleaned == "" { return nil, false } @@ -374,9 +375,9 @@ func (s *Server) 
shouldSuppressForChatTriggerEOL(current string, p CompletionPar // prefixHeuristicAllows applies minimal prefix rules unless inlinePrompt or structural triggers apply. func (s *Server) prefixHeuristicAllows(inlinePrompt bool, current string, p CompletionParams, manualInvoke bool) bool { - // Determine the effective cursor index within current line, clamped, and - // skip over trailing spaces/tabs to support cases like "type Matrix| ". - idx := p.Position.Character + // Convert UTF-16 offset to byte offset for correct multi-byte handling, + // then clamp to the line length. + idx := utf16OffsetToByteOffset(current, p.Position.Character) if idx > len(current) { idx = len(current) } @@ -454,11 +455,12 @@ func (s *Server) tryProviderNativeCompletion(ctx context.Context, plan completio if cleaned == "" { return nil, false } - cleaned = stripDuplicateAssignmentPrefix(current[:p.Position.Character], cleaned) + cByte := utf16OffsetToByteOffset(current, p.Position.Character) + cleaned = stripDuplicateAssignmentPrefix(current[:cByte], cleaned) if cleaned == "" { return nil, false } - cleaned = stripDuplicateGeneralPrefix(current[:p.Position.Character], cleaned) + cleaned = stripDuplicateGeneralPrefix(current[:cByte], cleaned) if cleaned == "" { return nil, false } diff --git a/internal/lsp/handlers_utils.go b/internal/lsp/handlers_utils.go index 408fdb1..bede7a0 100644 --- a/internal/lsp/handlers_utils.go +++ b/internal/lsp/handlers_utils.go @@ -6,6 +6,7 @@ import ( "fmt" "strings" "time" + "unicode/utf8" "codeberg.org/snonux/hexai/internal/appconfig" "codeberg.org/snonux/hexai/internal/llm" @@ -211,9 +212,11 @@ func computeTextEditAndFilter(cleaned string, inParams bool, current string, p C return te, filter } } - startChar := computeWordStart(current, p.Position.Character) - te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startChar}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned} - filter := 
strings.TrimLeft(current[startChar:p.Position.Character], " \t") + cursorByte := utf16OffsetToByteOffset(current, p.Position.Character) + startByte := computeWordStart(current, cursorByte) + // TextEdit ranges use UTF-16 offsets; for ASCII identifiers byte == UTF-16. + te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startByte}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned} + filter := strings.TrimLeft(current[startByte:cursorByte], " \t") return te, filter } @@ -733,3 +736,22 @@ func collectSemicolonMarkers(line string, lineNum int, openStr string, open, clo } return edits } + +// utf16OffsetToByteOffset converts an LSP UTF-16 code-unit offset to a byte +// offset within a Go (UTF-8) string. BMP characters (most code) are 1 UTF-16 +// unit, while supplementary characters (e.g. emoji) are 2. Returns len(s) +// if the offset exceeds the string length. +func utf16OffsetToByteOffset(s string, utf16Offset int) int { + byteIdx := 0 + units := 0 + for byteIdx < len(s) && units < utf16Offset { + r, size := utf8.DecodeRuneInString(s[byteIdx:]) + byteIdx += size + if r >= 0x10000 { + units += 2 // surrogate pair in UTF-16 + } else { + units++ + } + } + return byteIdx +} diff --git a/internal/lsp/utf16_offset_test.go b/internal/lsp/utf16_offset_test.go new file mode 100644 index 0000000..49a0fa0 --- /dev/null +++ b/internal/lsp/utf16_offset_test.go @@ -0,0 +1,59 @@ +package lsp + +import "testing" + +func TestUTF16OffsetToByteOffset_ASCII(t *testing.T) { + s := "hello world" + if got := utf16OffsetToByteOffset(s, 5); got != 5 { + t.Fatalf("expected 5, got %d", got) + } +} + +func TestUTF16OffsetToByteOffset_MultiByte(t *testing.T) { + // "aé" — 'a' is 1 byte/1 UTF-16 unit, 'é' is 2 bytes/1 UTF-16 unit + s := "aé" + // UTF-16 offset 1 → byte 1 (after 'a') + if got := utf16OffsetToByteOffset(s, 1); got != 1 { + t.Fatalf("expected 1 after 'a', got %d", got) + } + // UTF-16 offset 2 → byte 3 (after 'é' 
which is 2 UTF-8 bytes) + if got := utf16OffsetToByteOffset(s, 2); got != 3 { + t.Fatalf("expected 3 after 'é', got %d", got) + } +} + +func TestUTF16OffsetToByteOffset_Emoji(t *testing.T) { + // "a🎉b" — 'a' is 1/1, '🎉' is 4 bytes / 2 UTF-16 units, 'b' is 1/1 + s := "a🎉b" + // UTF-16 offset 1 → byte 1 (after 'a') + if got := utf16OffsetToByteOffset(s, 1); got != 1 { + t.Fatalf("expected 1, got %d", got) + } + // UTF-16 offset 3 → byte 5 (after '🎉' which is 4 bytes, 2 UTF-16 units) + if got := utf16OffsetToByteOffset(s, 3); got != 5 { + t.Fatalf("expected 5 after emoji, got %d", got) + } + // UTF-16 offset 4 → byte 6 (after 'b') + if got := utf16OffsetToByteOffset(s, 4); got != 6 { + t.Fatalf("expected 6, got %d", got) + } +} + +func TestUTF16OffsetToByteOffset_BeyondEnd(t *testing.T) { + s := "abc" + if got := utf16OffsetToByteOffset(s, 10); got != 3 { + t.Fatalf("expected len(s)=3 for offset beyond end, got %d", got) + } +} + +func TestUTF16OffsetToByteOffset_Empty(t *testing.T) { + if got := utf16OffsetToByteOffset("", 0); got != 0 { + t.Fatalf("expected 0 for empty string, got %d", got) + } +} + +func TestUTF16OffsetToByteOffset_Zero(t *testing.T) { + if got := utf16OffsetToByteOffset("hello", 0); got != 0 { + t.Fatalf("expected 0 for offset 0, got %d", got) + } +} |
