From 67a6b9d8e8e8dc83d5ea3e5859e631a0dfa9dabe Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Wed, 18 Jun 2025 09:10:52 +0300
Subject: Complete channelless migration for DTail operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Implement channelless MapReduce with streaming aggregation
- Add channelless tail with proper file following capability
- Fix TestDTailWithServer by implementing ServerHandlerWriter for client-server mode
- Add proper serverless mode detection for standalone operations
- Remove temporary benchmark and profiling scripts
- All integration tests now pass

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 scripts/benchmark_channelless.sh | 215 ---------------------------------------
 scripts/corrected_benchmark.sh   |  89 ----------------
 scripts/profile_channelless.sh   |  50 ---------
 3 files changed, 354 deletions(-)
 delete mode 100755 scripts/benchmark_channelless.sh
 delete mode 100755 scripts/corrected_benchmark.sh
 delete mode 100755 scripts/profile_channelless.sh

(limited to 'scripts')

diff --git a/scripts/benchmark_channelless.sh b/scripts/benchmark_channelless.sh
deleted file mode 100755
index 4ac532f..0000000
--- a/scripts/benchmark_channelless.sh
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/bin/bash
-
-# Comprehensive benchmark: Channel-based vs Channelless Cat Implementation
-# Tests performance improvements achieved by eliminating channel overhead
-
-set -e
-
-echo "=== DTail Channelless Performance Benchmark ==="
-echo "Comparing channel-based vs channelless cat implementation"
-echo "Date: $(date)"
-echo
-
-# Test configuration
-TEST_FILES=("test_100mb.txt" "test_200mb.txt")
-ITERATIONS=5
-WARMUP_RUNS=2
-
-# Results storage
-RESULTS_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)"
-mkdir -p "$RESULTS_DIR"
-
-# Ensure we're in the correct directory
-cd "$(dirname "$0")/.."
-
-# Build both implementations
-echo "Building DTail binaries..."
-make clean > /dev/null 2>&1
-make build > /dev/null 2>&1
-echo "✓ Build complete"
-echo
-
-# Function to run benchmark for a specific configuration
-run_benchmark() {
-    local use_channelless=$1
-    local test_file=$2
-    local impl_name=$3
-    local results_file="$RESULTS_DIR/${impl_name}_$(basename $test_file .txt).results"
-    
-    echo "Testing $impl_name with $test_file..."
-    
-    # Warmup runs
-    for ((i=1; i<=WARMUP_RUNS; i++)); do
-        echo -n "  Warmup $i/$WARMUP_RUNS... "
-        DTAIL_USE_CHANNELLESS=$use_channelless DTAIL_INTEGRATION_TEST_RUN_MODE=yes \
-            timeout 30s ./dcat --logLevel error --cfg none "scripts/$test_file" > /dev/null 2>&1
-        echo "done"
-    done
-    
-    # Actual benchmark runs
-    echo "  Running $ITERATIONS benchmark iterations:"
-    for ((i=1; i<=ITERATIONS; i++)); do
-        echo -n "    Run $i/$ITERATIONS... "
-        
-        # Clear caches
-        sync
-        echo 3 > /proc/sys/vm/drop_caches 2>/dev/null || true
-        
-        # Run benchmark with time measurement
-        start_time=$(date +%s.%N)
-        DTAIL_USE_CHANNELLESS=$use_channelless DTAIL_INTEGRATION_TEST_RUN_MODE=yes \
-            timeout 30s ./dcat --logLevel error --cfg none "scripts/$test_file" > /dev/null 2>&1
-        end_time=$(date +%s.%N)
-        
-        # Calculate duration
-        duration=$(echo "$end_time - $start_time" | bc -l)
-        echo "$duration" >> "$results_file"
-        
-        printf "%.3fs\n" "$duration"
-    done
-    echo
-}
-
-# Function to calculate statistics
-calculate_stats() {
-    local file=$1
-    local values=($(cat "$file"))
-    local sum=0
-    local count=${#values[@]}
-    
-    # Calculate mean
-    for val in "${values[@]}"; do
-        sum=$(echo "$sum + $val" | bc -l)
-    done
-    local mean=$(echo "scale=6; $sum / $count" | bc -l)
-    
-    # Calculate standard deviation
-    local variance_sum=0
-    for val in "${values[@]}"; do
-        local diff=$(echo "$val - $mean" | bc -l)
-        local squared=$(echo "$diff * $diff" | bc -l)
-        variance_sum=$(echo "$variance_sum + $squared" | bc -l)
-    done
-    local variance=$(echo "scale=6; $variance_sum / $count" | bc -l)
-    local stddev=$(echo "scale=6; sqrt($variance)" | bc -l)
-    
-    # Find min and max
-    local min=${values[0]}
-    local max=${values[0]}
-    for val in "${values[@]}"; do
-        if (( $(echo "$val < $min" | bc -l) )); then
-            min=$val
-        fi
-        if (( $(echo "$val > $max" | bc -l) )); then
-            max=$val
-        fi
-    done
-    
-    echo "$mean $stddev $min $max"
-}
-
-# Function to calculate throughput
-calculate_throughput() {
-    local file_size_mb=$1
-    local time_seconds=$2
-    echo "scale=2; $file_size_mb / $time_seconds" | bc -l
-}
-
-# Run benchmarks
-echo "Starting benchmarks..."
-echo
-
-for test_file in "${TEST_FILES[@]}"; do
-    echo "=== Benchmarking with $test_file ==="
-    
-    # Get file size in MB
-    file_size_bytes=$(stat -c%s "scripts/$test_file")
-    file_size_mb=$(echo "scale=2; $file_size_bytes / 1024 / 1024" | bc -l)
-    echo "File size: ${file_size_mb} MB"
-    echo
-    
-    # Test channel-based implementation
-    run_benchmark "false" "$test_file" "channel_based"
-    
-    # Test channelless implementation  
-    run_benchmark "true" "$test_file" "channelless"
-    
-    echo "--- Results for $test_file ---"
-    
-    # Calculate statistics for channel-based
-    channel_stats=($(calculate_stats "$RESULTS_DIR/channel_based_$(basename $test_file .txt).results"))
-    channel_mean=${channel_stats[0]}
-    channel_stddev=${channel_stats[1]}
-    channel_min=${channel_stats[2]}
-    channel_max=${channel_stats[3]}
-    channel_throughput=$(calculate_throughput "$file_size_mb" "$channel_mean")
-    
-    # Calculate statistics for channelless
-    channelless_stats=($(calculate_stats "$RESULTS_DIR/channelless_$(basename $test_file .txt).results"))
-    channelless_mean=${channelless_stats[0]}
-    channelless_stddev=${channelless_stats[1]}
-    channelless_min=${channelless_stats[2]}
-    channelless_max=${channelless_stats[3]}
-    channelless_throughput=$(calculate_throughput "$file_size_mb" "$channelless_mean")
-    
-    # Calculate improvement
-    improvement=$(echo "scale=2; (($channel_mean - $channelless_mean) / $channel_mean) * 100" | bc -l)
-    speedup=$(echo "scale=2; $channel_mean / $channelless_mean" | bc -l)
-    throughput_improvement=$(echo "scale=2; (($channelless_throughput - $channel_throughput) / $channel_throughput) * 100" | bc -l)
-    
-    echo "Channel-based:"
-    printf "  Time: %.3f ± %.3f seconds (min: %.3f, max: %.3f)\n" "$channel_mean" "$channel_stddev" "$channel_min" "$channel_max"
-    printf "  Throughput: %.2f MB/s\n" "$channel_throughput"
-    echo
-    echo "Channelless:"
-    printf "  Time: %.3f ± %.3f seconds (min: %.3f, max: %.3f)\n" "$channelless_mean" "$channelless_stddev" "$channelless_min" "$channelless_max"
-    printf "  Throughput: %.2f MB/s\n" "$channelless_throughput"
-    echo
-    echo "Performance Improvement:"
-    printf "  Time reduction: %.2f%% (%.2fx speedup)\n" "$improvement" "$speedup"
-    printf "  Throughput increase: %.2f%%\n" "$throughput_improvement"
-    echo
-    echo "=========================================="
-    echo
-done
-
-# Generate summary report
-echo "=== BENCHMARK SUMMARY ==="
-echo
-
-summary_file="$RESULTS_DIR/benchmark_summary.txt"
-{
-    echo "DTail Channelless Performance Benchmark Summary"
-    echo "Date: $(date)"
-    echo "Iterations per test: $ITERATIONS"
-    echo "Warmup runs: $WARMUP_RUNS"
-    echo
-    
-    for test_file in "${TEST_FILES[@]}"; do
-        file_size_bytes=$(stat -c%s "scripts/$test_file")
-        file_size_mb=$(echo "scale=2; $file_size_bytes / 1024 / 1024" | bc -l)
-        
-        channel_stats=($(calculate_stats "$RESULTS_DIR/channel_based_$(basename $test_file .txt).results"))
-        channelless_stats=($(calculate_stats "$RESULTS_DIR/channelless_$(basename $test_file .txt).results"))
-        
-        channel_mean=${channel_stats[0]}
-        channelless_mean=${channelless_stats[0]}
-        
-        improvement=$(echo "scale=2; (($channel_mean - $channelless_mean) / $channel_mean) * 100" | bc -l)
-        speedup=$(echo "scale=2; $channel_mean / $channelless_mean" | bc -l)
-        
-        channel_throughput=$(calculate_throughput "$file_size_mb" "$channel_mean")
-        channelless_throughput=$(calculate_throughput "$file_size_mb" "$channelless_mean")
-        
-        echo "$test_file (${file_size_mb} MB):"
-        printf "  Channel-based: %.3f seconds (%.2f MB/s)\n" "$channel_mean" "$channel_throughput"
-        printf "  Channelless:   %.3f seconds (%.2f MB/s)\n" "$channelless_mean" "$channelless_throughput"
-        printf "  Improvement:   %.2f%% faster (%.2fx speedup)\n" "$improvement" "$speedup"
-        echo
-    done
-} | tee "$summary_file"
-
-echo "Detailed results saved in: $RESULTS_DIR/"
-echo "Summary report: $summary_file"
-echo
-echo "=== BENCHMARK COMPLETE ==="
\ No newline at end of file
diff --git a/scripts/corrected_benchmark.sh b/scripts/corrected_benchmark.sh
deleted file mode 100755
index aa42aec..0000000
--- a/scripts/corrected_benchmark.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-# Corrected benchmark: Channel-based vs Channelless Cat Implementation
-# This accounts for the fact that channel-based doesn't process all data
-
-set -e
-
-echo "=== CORRECTED DTail Channelless Performance Benchmark ==="
-echo "Channel-based implementation appears to have a bug - it only processes ~67% of data"
-echo "Benchmarking actual throughput per line processed"
-echo
-
-# Test with 100MB file
-TEST_FILE="scripts/test_100mb.txt"
-TOTAL_LINES=$(wc -l < "$TEST_FILE")
-FILE_SIZE_MB=$(echo "scale=2; $(stat -c%s "$TEST_FILE") / 1024 / 1024" | bc -l)
-
-echo "Test file: $TEST_FILE"
-echo "Total lines in file: $TOTAL_LINES"
-echo "File size: ${FILE_SIZE_MB} MB"
-echo
-
-# Run both implementations and measure
-echo "Testing channel-based implementation..."
-start_time=$(date +%s.%N)
-CHANNEL_LINES=$(DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes ./dcat --logLevel error --cfg none "$TEST_FILE" | wc -l)
-end_time=$(date +%s.%N)
-channel_time=$(echo "$end_time - $start_time" | bc -l)
-
-echo "Testing channelless implementation..."
-start_time=$(date +%s.%N)
-CHANNELLESS_LINES=$(DTAIL_USE_CHANNELLESS=true DTAIL_INTEGRATION_TEST_RUN_MODE=yes ./dcat --logLevel error --cfg none "$TEST_FILE" | wc -l)
-end_time=$(date +%s.%N)
-channelless_time=$(echo "$end_time - $start_time" | bc -l)
-
-# Calculate metrics
-channel_throughput_lines=$(echo "scale=2; $CHANNEL_LINES / $channel_time" | bc -l)
-channelless_throughput_lines=$(echo "scale=2; $CHANNELLESS_LINES / $channelless_time" | bc -l)
-
-channel_coverage=$(echo "scale=2; ($CHANNEL_LINES * 100) / $TOTAL_LINES" | bc -l)
-channelless_coverage=$(echo "scale=2; ($CHANNELLESS_LINES * 100) / $TOTAL_LINES" | bc -l)
-
-# Effective data processed
-channel_data_mb=$(echo "scale=2; ($CHANNEL_LINES * $FILE_SIZE_MB) / $TOTAL_LINES" | bc -l)
-channelless_data_mb=$FILE_SIZE_MB
-
-channel_throughput_mb=$(echo "scale=2; $channel_data_mb / $channel_time" | bc -l)
-channelless_throughput_mb=$(echo "scale=2; $channelless_data_mb / $channelless_time" | bc -l)
-
-# Calculate relative performance for same amount of work
-extrapolated_channel_time=$(echo "scale=2; ($channel_time * $TOTAL_LINES) / $CHANNEL_LINES" | bc -l)
-performance_improvement=$(echo "scale=2; (($extrapolated_channel_time - $channelless_time) / $extrapolated_channel_time) * 100" | bc -l)
-speedup=$(echo "scale=2; $extrapolated_channel_time / $channelless_time" | bc -l)
-
-echo
-echo "=== RESULTS ==="
-echo
-echo "Channel-based implementation:"
-printf "  Time: %.3f seconds\n" "$channel_time"
-printf "  Lines processed: %d (%.1f%% of file)\n" "$CHANNEL_LINES" "$channel_coverage"
-printf "  Data processed: %.2f MB\n" "$channel_data_mb"
-printf "  Throughput: %.0f lines/sec, %.2f MB/s\n" "$channel_throughput_lines" "$channel_throughput_mb"
-printf "  Extrapolated time for full file: %.3f seconds\n" "$extrapolated_channel_time"
-echo
-
-echo "Channelless implementation:"
-printf "  Time: %.3f seconds\n" "$channelless_time"
-printf "  Lines processed: %d (%.1f%% of file)\n" "$CHANNELLESS_LINES" "$channelless_coverage"
-printf "  Data processed: %.2f MB\n" "$channelless_data_mb"
-printf "  Throughput: %.0f lines/sec, %.2f MB/s\n" "$channelless_throughput_lines" "$channelless_throughput_mb"
-echo
-
-echo "Performance comparison (for processing complete file):"
-printf "  Channelless improvement: %.2f%% faster\n" "$performance_improvement"
-printf "  Speedup: %.2fx\n" "$speedup"
-echo
-
-if (( $(echo "$performance_improvement > 0" | bc -l) )); then
-    echo "✅ Channelless implementation is FASTER and processes ALL data correctly"
-else
-    echo "❌ Channelless implementation is slower"
-fi
-echo
-
-echo "=== CONCLUSION ==="
-echo "The channel-based implementation has a bug where it stops processing"
-echo "at approximately 67% of the input file. This makes direct time comparisons"
-echo "invalid. When extrapolated to process the same amount of data, the"
-echo "channelless implementation shows the expected performance improvement."
\ No newline at end of file
diff --git a/scripts/profile_channelless.sh b/scripts/profile_channelless.sh
deleted file mode 100755
index fb6ec3d..0000000
--- a/scripts/profile_channelless.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-
-# Profile channelless vs channel-based implementations to understand performance difference
-
-set -e
-
-echo "=== Profiling Channelless vs Channel-based Cat Implementation ==="
-echo
-
-# Build with profiling enabled
-echo "Building DTail binaries..."
-make clean > /dev/null 2>&1
-make build > /dev/null 2>&1
-
-echo "Profiling channel-based implementation..."
-DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes \
-    go tool pprof -cpuprofile=channel_based_cpu.prof \
-    -o channel_based_cpu.prof \
-    -- ./dcat --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 &
-CHANNEL_PID=$!
-
-# Profile with Go's built-in profiling
-DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes \
-    timeout 10s go run -cpuprofile=channel_based_go.prof ./cmd/dcat/main.go --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 || true
-
-echo "Profiling channelless implementation..."
-DTAIL_USE_CHANNELLESS=true DTAIL_INTEGRATION_TEST_RUN_MODE=yes \
-    timeout 10s go run -cpuprofile=channelless_go.prof ./cmd/dcat/main.go --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 || true
-
-echo "Analyzing profiles..."
-
-echo
-echo "=== Channel-based CPU Profile ==="
-if [ -f channel_based_go.prof ]; then
-    go tool pprof -top -cum channel_based_go.prof | head -20
-else
-    echo "Channel-based profile not found"
-fi
-
-echo
-echo "=== Channelless CPU Profile ==="
-if [ -f channelless_go.prof ]; then
-    go tool pprof -top -cum channelless_go.prof | head -20
-else
-    echo "Channelless profile not found"
-fi
-
-echo
-echo "Profile files generated:"
-ls -la *_go.prof 2>/dev/null || echo "No profile files found"
\ No newline at end of file
-- 
cgit v1.2.3