From 28f6319b77d35c6da6b99ad7e35d0d5602dc2ee6 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Fri, 10 Apr 2026 18:03:29 +0300 Subject: Fix known-hosts trust deadlock, host key stat, and optional nozstd build - stdout logger: release mutex while waiting on pause resume so prompt callbacks can log (fixes hang after trusting new hosts; known_hosts was written but Resume never ran). - known hosts callback: stop borrowing the SSH dial throttle channel (could block or interact badly with parallel handshakes). - host key path: use errors.Is(..., fs.ErrNotExist) for RootedPath.Stat wrapped errors; stat errors now fail fast instead of mis-read. - public key path: same ErrNotExist check for authorized_keys miss. - Build: optional DTAIL_NO_ZSTD=yes / nozstd tag for CGO-free builds; split zstd readers into tagged files. - Docs/examples: firewalld note for port 2222, log prune timer+script, SSHBindAddress note, dserver unit disabled-by-default comment; firewalld helper script example. - Regression test for stdout pause/mutex behavior. 
Made-with: Cursor --- AGENTS.md | 3 ++ Makefile | 3 ++ doc/installation.md | 59 +++++++++++++++++++++++++- examples/dserver-prune-logs.service.example | 8 ++++ examples/dserver-prune-logs.timer.example | 9 ++++ examples/dserver.service.example | 2 + examples/firewalld-dserver-port.sh.example | 21 +++++++++ examples/prune_dserver_logs.sh.example | 10 +++++ internal/clients/baseclient.go | 2 +- internal/io/dlog/loggers/stdout.go | 9 ++-- internal/io/dlog/loggers/stdout_test.go | 36 ++++++++++++++++ internal/io/fs/readfile.go | 7 +-- internal/io/fs/readfile_nozstd.go | 16 +++++++ internal/io/fs/readfile_zstd.go | 20 +++++++++ internal/ssh/client/authmethods.go | 8 ++-- internal/ssh/client/knownhostscallback.go | 9 +--- internal/ssh/client/knownhostscallback_test.go | 5 +-- internal/ssh/server/hostkey.go | 28 ++++++------ internal/ssh/server/publickeycallback.go | 4 +- 19 files changed, 218 insertions(+), 41 deletions(-) create mode 100644 examples/dserver-prune-logs.service.example create mode 100644 examples/dserver-prune-logs.timer.example create mode 100644 examples/firewalld-dserver-port.sh.example create mode 100644 examples/prune_dserver_logs.sh.example create mode 100644 internal/io/dlog/loggers/stdout_test.go create mode 100644 internal/io/fs/readfile_nozstd.go create mode 100644 internal/io/fs/readfile_zstd.go diff --git a/AGENTS.md b/AGENTS.md index 1ee56e5..2cf357d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,6 +26,9 @@ make clean # Enable ACL support (requires libacl-devel) DTAIL_USE_ACL=yes make build +# Build without zstd (CGO-free cross-compiles; .zst logs unsupported) +DTAIL_NO_ZSTD=yes make build + # Enable proprietary features DTAIL_USE_PROPRIETARY=yes make build diff --git a/Makefile b/Makefile index c16c508..4e6a658 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,9 @@ endif ifdef DTAIL_USE_PROPRIETARY GO_TAGS+=proprietary endif +ifdef DTAIL_NO_ZSTD +GO_TAGS+=nozstd +endif all: build build: dserver dcat dgrep dmap dtail dtailhealth dtail-tools 
build-pgo: pgo-build-binaries diff --git a/doc/installation.md b/doc/installation.md index 1f54050..c80e011 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -25,7 +25,17 @@ Set the `DTAIL_USE_ACL` environment variable before invoking the make command. % export DTAIL_USE_ACL=yes ``` -Alternatively, you could add `-tags linuxacl` to the Go compiler. +Alternatively, you could add `-tags linuxacl` to the Go compiler. + +## Build without zstd (optional) + +For targets where CGO-based zstd is unavailable (for example cross-compiling `dserver` for another architecture), build with the `nozstd` tag. Compressed `.zst` log files will not be supported in that binary. + +```console +% export DTAIL_NO_ZSTD=yes +``` + +This sets `-tags nozstd` via the Makefile. Plain `go build` users can pass `-tags nozstd` directly. # Install it @@ -61,15 +71,28 @@ uid=1001(dserver) 1001=670(dserver) groups=1001(dserver) sudo tee /etc/dserver/dtail.json ``` +### SSH listen address (``SSHBindAddress``) + +The example config sets ``Server.SSHBindAddress`` to ``0.0.0.0``, so dserver listens on **every** local IPv4 address, including your LAN (e.g. ``192.168.1.x`` on eth0) and any other interface (loopback, WireGuard, etc.). Clients reach it as ``HOST:2222``, where ``HOST`` is any address or name of the server; you do **not** need to change this for normal LAN access. + +To listen **only** on a specific address—for example only the home LAN and not on a VPN—set ``SSHBindAddress`` in ``/etc/dserver/dtail.json`` to **that machine’s** address (each host needs its own value), e.g. ``192.168.1.125`` on ``pi0``, ``192.168.1.126`` on ``pi1``. Alternatively, override from the command line (after ``-cfg``): ``dserver -cfg /etc/dserver/dtail.json -bindAddress 192.168.1.125``. Then reload or restart dserver. + 5. It is recommended to configure DTail server as a service to ``systemd``. An example unit file for ``systemd`` can be found [here](../examples/dserver.service.example). 
```console % curl https://raw.githubusercontent.com/mimecast/dtail/master/examples/dserver.service.example | sudo tee /etc/systemd/system/dserver.service % sudo systemctl daemon-reload -% sudo systemctl enable dserver ``` +The unit is intended to stay **disabled** until you opt in. Start DTail server manually when needed: + +```console +% sudo systemctl start dserver +``` + +To start it automatically at boot, run once: `sudo systemctl enable dserver`. + # Start it To start the DTail server via ``systemd`` run: @@ -93,6 +116,20 @@ To start the DTail server via ``systemd`` run: Dec 06 13:21:24 serv-001.lan.example.org dserver[12296]: SERVER|serv-001|INFO|Binding server|1.2.3.4:2222 ``` +### Firewall (firewalld on RHEL, Rocky Linux, Fedora, …) + +The DTail server listens on TCP port ``2222`` (see ``SSHPort`` in ``dtail.json``). **ICMP (ping) may work while TCP to 2222 is blocked**, because host firewalls often allow ping but not arbitrary ports. + +If ``firewalld`` is active, allow the DTail port permanently and reload: + +```console +% sudo firewall-cmd --permanent --add-port=2222/tcp +% sudo firewall-cmd --reload +% sudo firewall-cmd --list-ports +``` + +Clients may report ``dial tcp …: connect: no route to host`` when the firewall rejects the connection with an ICMP unreachable—opening ``2222/tcp`` fixes that. For other firewalls (nftables, ufw, …), add an equivalent allow rule for ``2222/tcp``. A small helper script is [firewalld-dserver-port.sh.example](../examples/firewalld-dserver-port.sh.example). + # Register SSH public keys in DTail server The DTail server now runs as a ``systemd`` service under system user ``dserver``. However, the system user ``dserver`` has no permissions to read the SSH public keys from ``/home/USER/.ssh/authorized_keys``. Therefore, no user would be able to establish an SSH session to DTail server. As an alternative path DTail server also checks for public SSH key files in ``/var/run/dserver/cache/USER.authorized_keys``. 
@@ -113,6 +150,24 @@ It is recommended to execute [update_key_cache.sh](../examples/update_key_cache. % sudo systemctl start dserver-update-keycache.timer ``` +# Prune old dserver log files + +Log files live under ``/var/run/dserver/log`` (see ``LogDir`` in ``dtail.json``). To remove ``*.log`` files **older than seven days**, install [prune_dserver_logs.sh](../examples/prune_dserver_logs.sh.example) and a systemd timer (runs daily with a randomized delay): + +```console +% curl https://raw.githubusercontent.com/mimecast/dtail/master/examples/prune_dserver_logs.sh.example | + sudo tee /var/run/dserver/prune_dserver_logs.sh +% sudo chmod 755 /var/run/dserver/prune_dserver_logs.sh +% curl https://raw.githubusercontent.com/mimecast/dtail/master/examples/dserver-prune-logs.service.example | + sudo tee /etc/systemd/system/dserver-prune-logs.service +% curl https://raw.githubusercontent.com/mimecast/dtail/master/examples/dserver-prune-logs.timer.example | + sudo tee /etc/systemd/system/dserver-prune-logs.timer +% sudo systemctl daemon-reload +% sudo systemctl enable --now dserver-prune-logs.timer +``` + +The script uses ``find /var/run/dserver/log -type f -name '*.log' -mtime +7 -delete``. + # Run DTail client Now you should be able to use DTail client like outlined in the [Quick Starting Guide](quickstart.md). Also, have a look at the [Examples](examples.md). 
diff --git a/examples/dserver-prune-logs.service.example b/examples/dserver-prune-logs.service.example new file mode 100644 index 0000000..8899487 --- /dev/null +++ b/examples/dserver-prune-logs.service.example @@ -0,0 +1,8 @@ +[Unit] +Description=Delete dserver log files older than 7 days + +[Service] +Type=oneshot +User=dserver +Group=dserver +ExecStart=/var/run/dserver/prune_dserver_logs.sh diff --git a/examples/dserver-prune-logs.timer.example b/examples/dserver-prune-logs.timer.example new file mode 100644 index 0000000..2ec13b6 --- /dev/null +++ b/examples/dserver-prune-logs.timer.example @@ -0,0 +1,9 @@ +[Unit] +Description=Daily cleanup of dserver logs older than 7 days + +[Timer] +OnCalendar=daily +RandomizedDelaySec=1800 + +[Install] +WantedBy=timers.target diff --git a/examples/dserver.service.example b/examples/dserver.service.example index c5e5e59..f299025 100644 --- a/examples/dserver.service.example +++ b/examples/dserver.service.example @@ -1,3 +1,5 @@ +# Installs disabled by default: do not run `systemctl enable dserver` unless you +# want it at boot. Start manually with: sudo systemctl start dserver [Unit] Description=DTail server After=network.target diff --git a/examples/firewalld-dserver-port.sh.example b/examples/firewalld-dserver-port.sh.example new file mode 100644 index 0000000..f10ce08 --- /dev/null +++ b/examples/firewalld-dserver-port.sh.example @@ -0,0 +1,21 @@ +#!/bin/bash +# Allow inbound TCP to dserver (default port 2222) when firewalld is used. +# Run once on the server as root, or fold into your config management. + +set -euo pipefail + +PORT="${DTAIL_FIREWALL_PORT:-2222}" + +if ! command -v firewall-cmd >/dev/null 2>&1; then + echo "firewall-cmd not found; skip or configure your firewall manually." >&2 + exit 0 +fi + +if ! firewall-cmd --state >/dev/null 2>&1; then + echo "firewalld not running; nothing to do." >&2 + exit 0 +fi + +firewall-cmd --permanent "--add-port=${PORT}/tcp" +firewall-cmd --reload +echo "Opened ${PORT}/tcp. 
Current ports: $(firewall-cmd --list-ports)" diff --git a/examples/prune_dserver_logs.sh.example b/examples/prune_dserver_logs.sh.example new file mode 100644 index 0000000..56a68cd --- /dev/null +++ b/examples/prune_dserver_logs.sh.example @@ -0,0 +1,10 @@ +#!/bin/bash + +declare -r LOGDIR=/var/run/dserver/log + +if [ ! -d "$LOGDIR" ]; then + exit 0 +fi + +# Daily rotated logs: YYYYMMDD.log — remove files not modified in the last 7 days. +/usr/bin/find "$LOGDIR" -type f -name '*.log' -mtime +7 -delete diff --git a/internal/clients/baseclient.go b/internal/clients/baseclient.go index 71b8d02..bc7c2f1 100644 --- a/internal/clients/baseclient.go +++ b/internal/clients/baseclient.go @@ -73,7 +73,7 @@ func (c *baseClient) init() { } c.sshAuthMethods, c.hostKeyCallback = client.InitSSHAuthMethods( c.Args.SSHAuthMethods, c.Args.SSHHostKeyCallback, c.Args.TrustAllHosts, - c.throttleCh, c.Args.SSHPrivateKeyFilePath, c.Args.SSHAgentKeyIndex) + c.Args.SSHPrivateKeyFilePath, c.Args.SSHAgentKeyIndex) } func (c *baseClient) makeConnections(maker maker) { diff --git a/internal/io/dlog/loggers/stdout.go b/internal/io/dlog/loggers/stdout.go index a2575c8..b4e695a 100644 --- a/internal/io/dlog/loggers/stdout.go +++ b/internal/io/dlog/loggers/stdout.go @@ -44,14 +44,17 @@ func (s *stdout) RawWithColors(now time.Time, message, coloredMessage string) { func (s *stdout) log(message string, nl bool) { s.mutex.Lock() - defer s.mutex.Unlock() - select { case <-s.pauseCh: - // Pause until resumed. + // Wait for Resume without holding the mutex: the prompt path calls + // dlog after the user answers while Pause is still active; holding the + // mutex here would deadlock (Info blocks on Lock, Resume never runs). 
+ s.mutex.Unlock() <-s.resumeCh + s.mutex.Lock() default: } + defer s.mutex.Unlock() if nl { fmt.Println(message) diff --git a/internal/io/dlog/loggers/stdout_test.go b/internal/io/dlog/loggers/stdout_test.go new file mode 100644 index 0000000..4f70efc --- /dev/null +++ b/internal/io/dlog/loggers/stdout_test.go @@ -0,0 +1,36 @@ +package loggers + +import ( + "testing" + "time" +) + +// Regression: during an interactive prompt, dlog.Common.Pause() unblocks when some +// goroutine hits stdout.log(); that goroutine must not hold the stdout mutex while +// waiting on resume, or dlog.Client.Info from the prompt callback deadlocks forever. +func TestStdoutSecondLogDuringPauseWaitDoesNotDeadlock(t *testing.T) { + s := newStdout() + + go s.Pause() + time.Sleep(50 * time.Millisecond) + + go func() { + s.Log(time.Now(), "first log consumes pause and waits on resume") + }() + time.Sleep(50 * time.Millisecond) + + secondDone := make(chan struct{}) + go func() { + s.Log(time.Now(), "second log must acquire mutex while first waits for Resume") + close(secondDone) + }() + + select { + case <-secondDone: + case <-time.After(2 * time.Second): + t.Fatal("deadlock: second Log blocked on mutex while first waits for Resume") + } + + s.Resume() + time.Sleep(50 * time.Millisecond) +} diff --git a/internal/io/fs/readfile.go b/internal/io/fs/readfile.go index d305c4d..5241556 100644 --- a/internal/io/fs/readfile.go +++ b/internal/io/fs/readfile.go @@ -19,8 +19,6 @@ import ( "github.com/mimecast/dtail/internal/io/pool" "github.com/mimecast/dtail/internal/lcontext" "github.com/mimecast/dtail/internal/regex" - - "github.com/DataDog/zstd" ) type readStatus int @@ -193,10 +191,7 @@ func (f *readFile) makeCompressedFileReader(fd *os.File) (reader *bufio.Reader, decompressor = gzipReader reader = bufio.NewReader(gzipReader) case strings.HasSuffix(f.FilePath(), ".zst"): - dlog.Common.Info(f.FilePath(), "Detected zstd compression format") - zstdReader := zstd.NewReader(fd) - decompressor = 
zstdReader - reader = bufio.NewReader(zstdReader) + return f.makeZstdReader(fd) default: reader = bufio.NewReader(fd) } diff --git a/internal/io/fs/readfile_nozstd.go b/internal/io/fs/readfile_nozstd.go new file mode 100644 index 0000000..afd4523 --- /dev/null +++ b/internal/io/fs/readfile_nozstd.go @@ -0,0 +1,16 @@ +//go:build nozstd + +package fs + +import ( + "bufio" + "fmt" + "io" + "os" +) + +func (f *readFile) makeZstdReader(fd *os.File) (reader *bufio.Reader, decompressor io.Closer, err error) { + _ = fd + err = fmt.Errorf("%s: zstd is not supported in this build (built with -tags nozstd)", f.FilePath()) + return +} diff --git a/internal/io/fs/readfile_zstd.go b/internal/io/fs/readfile_zstd.go new file mode 100644 index 0000000..a7e479b --- /dev/null +++ b/internal/io/fs/readfile_zstd.go @@ -0,0 +1,20 @@ +//go:build !nozstd + +package fs + +import ( + "bufio" + "io" + "os" + + "github.com/DataDog/zstd" + "github.com/mimecast/dtail/internal/io/dlog" +) + +func (f *readFile) makeZstdReader(fd *os.File) (reader *bufio.Reader, decompressor io.Closer, err error) { + dlog.Common.Info(f.FilePath(), "Detected zstd compression format") + zstdReader := zstd.NewReader(fd) + decompressor = zstdReader + reader = bufio.NewReader(zstdReader) + return +} diff --git a/internal/ssh/client/authmethods.go b/internal/ssh/client/authmethods.go index 7ac4d0c..3cd1bb3 100644 --- a/internal/ssh/client/authmethods.go +++ b/internal/ssh/client/authmethods.go @@ -18,7 +18,7 @@ var ( // InitSSHAuthMethods initialises all known SSH auth methods on the client side. 
func InitSSHAuthMethods(sshAuthMethods []gossh.AuthMethod, - hostKeyCallback gossh.HostKeyCallback, trustAllHosts bool, throttleCh chan struct{}, + hostKeyCallback gossh.HostKeyCallback, trustAllHosts bool, privateKeyPath string, agentKeyIndex int) ([]gossh.AuthMethod, HostKeyCallback) { if len(sshAuthMethods) > 0 { @@ -28,10 +28,10 @@ func InitSSHAuthMethods(sshAuthMethods []gossh.AuthMethod, } return sshAuthMethods, simpleCallback } - return initKnownHostsAuthMethods(trustAllHosts, throttleCh, privateKeyPath, agentKeyIndex) + return initKnownHostsAuthMethods(trustAllHosts, privateKeyPath, agentKeyIndex) } -func initKnownHostsAuthMethods(trustAllHosts bool, throttleCh chan struct{}, +func initKnownHostsAuthMethods(trustAllHosts bool, privateKeyPath string, agentKeyIndex int) ([]gossh.AuthMethod, HostKeyCallback) { knownHostsFile := fmt.Sprintf("%s/.ssh/known_hosts", os.Getenv("HOME")) @@ -40,7 +40,7 @@ func initKnownHostsAuthMethods(trustAllHosts bool, throttleCh chan struct{}, knownHostsFile = "./known_hosts" } - knownHostsCallback, err := NewKnownHostsCallback(knownHostsFile, trustAllHosts, throttleCh) + knownHostsCallback, err := NewKnownHostsCallback(knownHostsFile, trustAllHosts) if err != nil { dlog.Client.FatalPanic(knownHostsFile, err) } diff --git a/internal/ssh/client/knownhostscallback.go b/internal/ssh/client/knownhostscallback.go index 174f6aa..da1b29b 100644 --- a/internal/ssh/client/knownhostscallback.go +++ b/internal/ssh/client/knownhostscallback.go @@ -41,7 +41,6 @@ type KnownHostsCallback struct { knownHostsPath string knownHostsFile fs.RootedPath unknownCh chan unknownHost - throttleCh chan struct{} trustAllHostsCh chan struct{} untrustedHosts map[string]bool mutex *sync.Mutex @@ -50,8 +49,7 @@ type KnownHostsCallback struct { var _ HostKeyCallback = (*KnownHostsCallback)(nil) // NewKnownHostsCallback returns a new wrapper. 
-func NewKnownHostsCallback(knownHostsPath string, trustAllHosts bool, - throttleCh chan struct{}) (HostKeyCallback, error) { +func NewKnownHostsCallback(knownHostsPath string, trustAllHosts bool) (HostKeyCallback, error) { knownHostsFile, err := fs.NewRootedPath(knownHostsPath) if err != nil { @@ -65,7 +63,6 @@ func NewKnownHostsCallback(knownHostsPath string, trustAllHosts bool, knownHostsFile: knownHostsFile, unknownCh: make(chan unknownHost), trustAllHostsCh: make(chan struct{}), - throttleCh: throttleCh, untrustedHosts: untrustedHosts, mutex: &sync.Mutex{}, } @@ -103,10 +100,6 @@ func (c *KnownHostsCallback) Wrap() ssh.HostKeyCallback { // OK return nil } - // Make sure that interactive user callback does not interfere with - // SSH connection throttler. - <-c.throttleCh - defer func() { c.throttleCh <- struct{}{} }() unknown := unknownHost{ server: server, diff --git a/internal/ssh/client/knownhostscallback_test.go b/internal/ssh/client/knownhostscallback_test.go index 596aea8..1765598 100644 --- a/internal/ssh/client/knownhostscallback_test.go +++ b/internal/ssh/client/knownhostscallback_test.go @@ -112,10 +112,7 @@ func TestTrustHostsRejectsEscapingKnownHostsSymlink(t *testing.T) { func testKnownHostsCallback(t *testing.T, knownHostsPath string) *KnownHostsCallback { t.Helper() - throttleCh := make(chan struct{}, 1) - throttleCh <- struct{}{} - - callback, err := NewKnownHostsCallback(knownHostsPath, false, throttleCh) + callback, err := NewKnownHostsCallback(knownHostsPath, false) if err != nil { t.Fatalf("NewKnownHostsCallback failed: %v", err) } diff --git a/internal/ssh/server/hostkey.go b/internal/ssh/server/hostkey.go index 1df2287..1315351 100644 --- a/internal/ssh/server/hostkey.go +++ b/internal/ssh/server/hostkey.go @@ -1,7 +1,8 @@ package server import ( - "os" + "errors" + iofs "io/fs" "github.com/mimecast/dtail/internal/config" "github.com/mimecast/dtail/internal/io/dlog" @@ -31,18 +32,21 @@ func PrivateHostKey(hostKeyFile string, hostKeyBits 
int) []byte { } _, err = hostKeyPath.Stat() - - if os.IsNotExist(err) { - dlog.Server.Info("Generating private server RSA host key") - pem, err := generatePrivateHostKey(hostKeyBits) - if err != nil { - dlog.Server.FatalPanic("Failed to generate private server RSA host key", err) - } - if err := storePrivateHostKey(hostKeyPath, pem); err != nil { - dlog.Server.Error("Unable to write private server RSA host key to file", - hostKeyFile, err) + if err != nil { + // os.IsNotExist does not unwrap fmt.Errorf chains from RootedPath.Stat; use errors.Is. + if errors.Is(err, iofs.ErrNotExist) { + dlog.Server.Info("Generating private server RSA host key") + pem, genErr := generatePrivateHostKey(hostKeyBits) + if genErr != nil { + dlog.Server.FatalPanic("Failed to generate private server RSA host key", genErr) + } + if storeErr := storePrivateHostKey(hostKeyPath, pem); storeErr != nil { + dlog.Server.Error("Unable to write private server RSA host key to file", + hostKeyFile, storeErr) + } + return pem } - return pem + dlog.Server.FatalPanic("Cannot stat private server RSA host key path", hostKeyFile, err) } dlog.Server.Info("Reading private server RSA host key from file", hostKeyFile) diff --git a/internal/ssh/server/publickeycallback.go b/internal/ssh/server/publickeycallback.go index 3afbfba..df83bf6 100644 --- a/internal/ssh/server/publickeycallback.go +++ b/internal/ssh/server/publickeycallback.go @@ -1,7 +1,9 @@ package server import ( + "errors" "fmt" + iofs "io/fs" "os" goUser "os/user" "path/filepath" @@ -142,7 +144,7 @@ func findAuthorizedKeysPath(user *user.User, cacheDir, cwd string, if _, err = rootedAuthorizedKeysPath.Stat(); err == nil { return rootedAuthorizedKeysPath, nil } - if !os.IsNotExist(err) { + if !errors.Is(err, iofs.ErrNotExist) { return fs.RootedPath{}, err } -- cgit v1.2.3