Fix byte vs UTF-16 indexing in LSP position handling

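Adds the utf16OffsetToByteOffset helper to convert LSP character positions (UTF-16 code units) into Go string byte offsets. Fixes trigger detection, the prefix heuristic, and completion text slicing for lines containing multi-byte characters. Note: the TextEdit start position is still populated from a byte offset, which equals the UTF-16 offset only when all text before the word start on that line is ASCII.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>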
author: Paul Buetow <paul@buetow.org> 2026-03-16 04:38:32 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-16 04:38:32 +0200
commit: 5e0cf1ede41b2887db98ca61c8100cbe1da61170 (patch)
tree: d3d172643fb18d8e9b03af591125909babb9c0d8 /internal/lsp/handlers_utils.go
parent: 409cec495ae619fa874e0e827ac620b881f84941 (diff)
1 files changed, 25 insertions, 3 deletions
diff --git a/internal/lsp/handlers_utils.go b/internal/lsp/handlers_utils.go
index 408fdb1..bede7a0 100644
--- a/internal/lsp/handlers_utils.go
+++ b/internal/lsp/handlers_utils.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	"codeberg.org/snonux/hexai/internal/appconfig"
 	"codeberg.org/snonux/hexai/internal/llm"
@@ -211,9 +212,11 @@ func computeTextEditAndFilter(cleaned string, inParams bool, current string, p C
 			return te, filter
 		}
 	}
-	startChar := computeWordStart(current, p.Position.Character)
-	te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startChar}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned}
-	filter := strings.TrimLeft(current[startChar:p.Position.Character], " \t")
+	cursorByte := utf16OffsetToByteOffset(current, p.Position.Character)
+	startByte := computeWordStart(current, cursorByte)
+	// TextEdit ranges use UTF-16 offsets, but startByte is a byte offset: correct only if all text before the word start is ASCII. TODO: convert back to UTF-16 units.
+	te := &TextEdit{Range: Range{Start: Position{Line: p.Position.Line, Character: startByte}, End: Position{Line: p.Position.Line, Character: p.Position.Character}}, NewText: cleaned}
+	filter := strings.TrimLeft(current[startByte:cursorByte], " \t")
 	return te, filter
 }
 
@@ -733,3 +736,22 @@ func collectSemicolonMarkers(line string, lineNum int, openStr string, open, clo
 	}
 	return edits
 }
+
+// utf16OffsetToByteOffset converts an LSP UTF-16 code-unit offset into a byte
+// offset within the UTF-8 string s. BMP runes count as 1 unit, supplementary
+// runes (e.g. emoji) as 2; an offset inside a surrogate pair rounds up past it.
+// Returns 0 for offsets <= 0 and len(s) for offsets beyond the end of s.
+func utf16OffsetToByteOffset(s string, utf16Offset int) int {
+	byteIdx := 0
+	units := 0
+	for byteIdx < len(s) && units < utf16Offset {
+		r, size := utf8.DecodeRuneInString(s[byteIdx:])
+		byteIdx += size
+		if r >= 0x10000 {
+			units += 2 // surrogate pair in UTF-16
+		} else {
+			units++
+		}
+	}
+	return byteIdx
+}
author	Paul Buetow <paul@buetow.org>	2026-03-16 04:38:32 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-16 04:38:32 +0200
commit	5e0cf1ede41b2887db98ca61c8100cbe1da61170 (patch)
tree	d3d172643fb18d8e9b03af591125909babb9c0d8 /internal/lsp/handlers_utils.go
parent	409cec495ae619fa874e0e827ac620b881f84941 (diff)