Fix byte vs UTF-16 indexing in LSP position handling

Adds utf16OffsetToByteOffset helper to correctly convert LSP character positions (UTF-16 code units) to Go string byte offsets. Fixes trigger detection, prefix heuristic, and completion text slicing for files containing multi-byte characters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-16 04:38:32 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-16 04:38:32 +0200
commit: 5e0cf1ede41b2887db98ca61c8100cbe1da61170 (patch)
tree: d3d172643fb18d8e9b03af591125909babb9c0d8
parent: 409cec495ae619fa874e0e827ac620b881f84941 (diff)
4 files changed, 99 insertions, 13 deletions
diff --git a/internal/lsp/handlers.go b/internal/lsp/handlers.go
index ebdefc1..ad2f98d 100644
--- a/internal/lsp/handlers.go
+++ b/internal/lsp/handlers.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"unicode/utf8"
 )
 
 func (s *Server) handle(req Request) {
@@ -274,16 +275,18 @@ func (s *Server) isTriggerEvent(p CompletionParams, current string) bool {
 		}
 		// For TriggerForIncomplete (3), require manual char check below
 	}
-	// 2) Fallback: check the character immediately prior to cursor
-	idx := p.Position.Character
-	if idx <= 0 || idx > len(current) {
+	// 2) Fallback: check the character immediately prior to cursor.
+	// Convert UTF-16 offset to byte offset for correct multi-byte handling.
+	byteIdx := utf16OffsetToByteOffset(current, p.Position.Character)
+	if byteIdx <= 0 || byteIdx > len(current) {
 		return false
 	}
 	// Bare double-open should not trigger via fallback char either (only when configured)
 	if containsAny(current, doubleSeqs) && !hasDoubleOpenTrigger(current, open, openChar, closeChar) {
 		return false
 	}
-	ch := string(current[idx-1])
+	r, _ := utf8.DecodeLastRuneInString(current[:byteIdx])
+	ch := string(r)
 	for _, c := range triggerChars {
 		if c == ch {
 			return true
diff --git a/internal/lsp/handlers_completion.go b/internal/lsp/handlers_completion.go
index aca830b..8ef67ab 100644
--- a/internal/lsp/handlers_completion.go
+++ b/internal/lsp/handlers_completion.go
@@ -318,7 +318,8 @@ func (s *Server) executeChatCompletion(ctx context.Context, plan completionPlan,
 	_ = stats.Update(ctx, client.Name(), modelUsed, sentSize, len(text))
 	s.logLLMStats(modelUsed)
 	trimmed := strings.TrimSpace(text)
-	cleaned := s.postProcessCompletion(trimmed, plan.current[:plan.params.Position.Character], plan.current)
+	cursorByte := utf16OffsetToByteOffset(plan.current, plan.params.Position.Character)
+	cleaned := s.postProcessCompletion(trimmed, plan.current[:cursorByte], plan.current)
 	if cleaned == "" {
 		return nil, false
 	}
@@ -374,9 +375,9 @@ func (s *Server) shouldSuppressForChatTriggerEOL(current string, p CompletionPar
 
 // prefixHeuristicAllows applies minimal prefix rules unless inlinePrompt or structural triggers apply.
 func (s *Server) prefixHeuristicAllows(inlinePrompt bool, current string, p CompletionParams, manualInvoke bool) bool {
-	// Determine the effective cursor index within current line, clamped, and
-	// skip over trailing spaces/tabs to support cases like "type Matrix| ".
-	idx := p.Position.Character
+	// Convert UTF-16 offset to byte offset for correct multi-byte handling,
+	// then clamp to the line length.
+	idx := utf16OffsetToByteOffset(current, p.Position.Character)
 	if idx > len(current) {
 		idx = len(current)
 	}
@@ -454,11 +455,12 @@ func (s *Server) tryProviderNativeCompletion(ctx context.Context, plan completio
 	if cleaned == "" {
 		return nil, false
 	}
-	cleaned = stripDuplicateAssignmentPrefix(current[:p.Position.Character], cleaned)
+	cByte := utf16OffsetToByteOffset(current, p.Position.Character)
+	cleaned = stripDuplicateAssignmentPrefix(current[:cByte], cleaned)
 	if cleaned == "" {
 		return nil, false
 	}
-	cleaned = stripDuplicateGeneralPrefix(current[:p.Position.Character], cleaned)
+	cleaned = stripDuplicateGeneralPrefix(current[:cByte], cleaned)
 	if cleaned == "" {
 		return nil, false
 	}
diff --git a/internal/lsp/handlers_utils.go b/internal/lsp/handlers_utils.go
index 408fdb1..bede7a0 100644
--- a/internal/lsp/handlers_utils.go
+++ b/internal/lsp/handlers_utils.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	"codeberg.org/snonux/hexai/internal/appconfig"
 	"codeberg.org/snonux/hexai/internal/llm"
@@ -211,9 +212,11 @@ func computeTextEditAndFilter(cleaned string, inParams bool, current string, p C
 			return te, filter
 		}
 	}
-	startChar := computeWordStart(current, p.Position.Character)
-	te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startChar}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned}
-	filter := strings.TrimLeft(current[startChar:p.Position.Character], " \t")
+	cursorByte := utf16OffsetToByteOffset(current, p.Position.Character)
+	startByte := computeWordStart(current, cursorByte)
+	// NOTE(review): TextEdit ranges use UTF-16 offsets, but startByte is a byte offset; they agree only when all text before the word start is ASCII.
+	te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startByte}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned}
+	filter := strings.TrimLeft(current[startByte:cursorByte], " \t")
 	return te, filter
 }
 
@@ -733,3 +736,22 @@ func collectSemicolonMarkers(line string, lineNum int, openStr string, open, clo
 	}
 	return edits
 }
+
+// utf16OffsetToByteOffset converts an LSP UTF-16 code-unit offset to a byte
+// offset in a Go (UTF-8) string. BMP characters count as 1 UTF-16 unit and
+// supplementary ones (e.g. emoji) as 2; an offset inside a surrogate pair rounds
+// up past it. Returns len(s) if the offset exceeds the UTF-16 length of s.
+func utf16OffsetToByteOffset(s string, utf16Offset int) int {
+	byteIdx := 0
+	units := 0
+	for byteIdx < len(s) && units < utf16Offset {
+		r, size := utf8.DecodeRuneInString(s[byteIdx:])
+		byteIdx += size
+		if r >= 0x10000 {
+			units += 2 // surrogate pair in UTF-16
+		} else {
+			units++
+		}
+	}
+	return byteIdx
+}
diff --git a/internal/lsp/utf16_offset_test.go b/internal/lsp/utf16_offset_test.go
new file mode 100644
index 0000000..49a0fa0
--- /dev/null
+++ b/internal/lsp/utf16_offset_test.go
@@ -0,0 +1,59 @@
+package lsp
+
+import "testing"
+
+func TestUTF16OffsetToByteOffset_ASCII(t *testing.T) {
+	s := "hello world"
+	if got := utf16OffsetToByteOffset(s, 5); got != 5 {
+		t.Fatalf("expected 5, got %d", got)
+	}
+}
+
+func TestUTF16OffsetToByteOffset_MultiByte(t *testing.T) {
+	// "aé" — 'a' is 1 byte/1 UTF-16 unit, 'é' is 2 bytes/1 UTF-16 unit
+	s := "aé"
+	// UTF-16 offset 1 → byte 1 (after 'a')
+	if got := utf16OffsetToByteOffset(s, 1); got != 1 {
+		t.Fatalf("expected 1 after 'a', got %d", got)
+	}
+	// UTF-16 offset 2 → byte 3 (after 'é' which is 2 UTF-8 bytes)
+	if got := utf16OffsetToByteOffset(s, 2); got != 3 {
+		t.Fatalf("expected 3 after 'é', got %d", got)
+	}
+}
+
+func TestUTF16OffsetToByteOffset_Emoji(t *testing.T) {
+	// "a🎉b" — 'a' is 1/1, '🎉' is 4 bytes / 2 UTF-16 units, 'b' is 1/1
+	s := "a🎉b"
+	// UTF-16 offset 1 → byte 1 (after 'a')
+	if got := utf16OffsetToByteOffset(s, 1); got != 1 {
+		t.Fatalf("expected 1, got %d", got)
+	}
+	// UTF-16 offset 3 → byte 5 (after '🎉' which is 4 bytes, 2 UTF-16 units)
+	if got := utf16OffsetToByteOffset(s, 3); got != 5 {
+		t.Fatalf("expected 5 after emoji, got %d", got)
+	}
+	// UTF-16 offset 4 → byte 6 (after 'b')
+	if got := utf16OffsetToByteOffset(s, 4); got != 6 {
+		t.Fatalf("expected 6, got %d", got)
+	}
+}
+
+func TestUTF16OffsetToByteOffset_BeyondEnd(t *testing.T) {
+	s := "abc"
+	if got := utf16OffsetToByteOffset(s, 10); got != 3 {
+		t.Fatalf("expected len(s)=3 for offset beyond end, got %d", got)
+	}
+}
+
+func TestUTF16OffsetToByteOffset_Empty(t *testing.T) {
+	if got := utf16OffsetToByteOffset("", 0); got != 0 {
+		t.Fatalf("expected 0 for empty string, got %d", got)
+	}
+}
+
+func TestUTF16OffsetToByteOffset_Zero(t *testing.T) {
+	if got := utf16OffsetToByteOffset("hello", 0); got != 0 {
+		t.Fatalf("expected 0 for offset 0, got %d", got)
+	}
+}
author	Paul Buetow <paul@buetow.org>	2026-03-16 04:38:32 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-16 04:38:32 +0200
commit	5e0cf1ede41b2887db98ca61c8100cbe1da61170 (patch)
tree	d3d172643fb18d8e9b03af591125909babb9c0d8
parent	409cec495ae619fa874e0e827ac620b881f84941 (diff)