From 67a6b9d8e8e8dc83d5ea3e5859e631a0dfa9dabe Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Wed, 18 Jun 2025 09:10:52 +0300 Subject: Complete channelless migration for DTail operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implement channelless MapReduce with streaming aggregation - Add channelless tail with proper file following capability - Fix TestDTailWithServer by implementing ServerHandlerWriter for client-server mode - Add proper serverless mode detection for standalone operations - Remove temporary benchmark scripts - All integration tests now pass 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- scripts/benchmark_channelless.sh | 215 --------------------------------------- scripts/corrected_benchmark.sh | 89 ---------------- scripts/profile_channelless.sh | 50 --------- 3 files changed, 354 deletions(-) delete mode 100755 scripts/benchmark_channelless.sh delete mode 100755 scripts/corrected_benchmark.sh delete mode 100755 scripts/profile_channelless.sh (limited to 'scripts') diff --git a/scripts/benchmark_channelless.sh b/scripts/benchmark_channelless.sh deleted file mode 100755 index 4ac532f..0000000 --- a/scripts/benchmark_channelless.sh +++ /dev/null @@ -1,215 +0,0 @@ -#!/bin/bash - -# Comprehensive benchmark: Channel-based vs Channelless Cat Implementation -# Tests performance improvements achieved by eliminating channel overhead - -set -e - -echo "=== DTail Channelless Performance Benchmark ===" -echo "Comparing channel-based vs channelless cat implementation" -echo "Date: $(date)" -echo - -# Test configuration -TEST_FILES=("test_100mb.txt" "test_200mb.txt") -ITERATIONS=5 -WARMUP_RUNS=2 - -# Results storage -RESULTS_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)" -mkdir -p "$RESULTS_DIR" - -# Ensure we're in the correct directory -cd "$(dirname "$0")/.." - -# Build both implementations -echo "Building DTail binaries..." -make clean > /dev/null 2>&1 -make build > /dev/null 2>&1 -echo "✓ Build complete" -echo - -# Function to run benchmark for a specific configuration -run_benchmark() { - local use_channelless=$1 - local test_file=$2 - local impl_name=$3 - local results_file="$RESULTS_DIR/${impl_name}_$(basename $test_file .txt).results" - - echo "Testing $impl_name with $test_file..." - - # Warmup runs - for ((i=1; i<=WARMUP_RUNS; i++)); do - echo -n " Warmup $i/$WARMUP_RUNS... " - DTAIL_USE_CHANNELLESS=$use_channelless DTAIL_INTEGRATION_TEST_RUN_MODE=yes \ - timeout 30s ./dcat --logLevel error --cfg none "scripts/$test_file" > /dev/null 2>&1 - echo "done" - done - - # Actual benchmark runs - echo " Running $ITERATIONS benchmark iterations:" - for ((i=1; i<=ITERATIONS; i++)); do - echo -n " Run $i/$ITERATIONS... " - - # Clear caches - sync - echo 3 > /proc/sys/vm/drop_caches 2>/dev/null || true - - # Run benchmark with time measurement - start_time=$(date +%s.%N) - DTAIL_USE_CHANNELLESS=$use_channelless DTAIL_INTEGRATION_TEST_RUN_MODE=yes \ - timeout 30s ./dcat --logLevel error --cfg none "scripts/$test_file" > /dev/null 2>&1 - end_time=$(date +%s.%N) - - # Calculate duration - duration=$(echo "$end_time - $start_time" | bc -l) - echo "$duration" >> "$results_file" - - printf "%.3fs\n" "$duration" - done - echo -} - -# Function to calculate statistics -calculate_stats() { - local file=$1 - local values=($(cat "$file")) - local sum=0 - local count=${#values[@]} - - # Calculate mean - for val in "${values[@]}"; do - sum=$(echo "$sum + $val" | bc -l) - done - local mean=$(echo "scale=6; $sum / $count" | bc -l) - - # Calculate standard deviation - local variance_sum=0 - for val in "${values[@]}"; do - local diff=$(echo "$val - $mean" | bc -l) - local squared=$(echo "$diff * $diff" | bc -l) - variance_sum=$(echo "$variance_sum + $squared" | bc -l) - done - local variance=$(echo "scale=6; $variance_sum / $count" | bc -l) - local stddev=$(echo "scale=6; sqrt($variance)" | bc -l) - - # Find min and max - local min=${values[0]} - local max=${values[0]} - for val in "${values[@]}"; do - if (( $(echo "$val < $min" | bc -l) )); then - min=$val - fi - if (( $(echo "$val > $max" | bc -l) )); then - max=$val - fi - done - - echo "$mean $stddev $min $max" -} - -# Function to calculate throughput -calculate_throughput() { - local file_size_mb=$1 - local time_seconds=$2 - echo "scale=2; $file_size_mb / $time_seconds" | bc -l -} - -# Run benchmarks -echo "Starting benchmarks..." -echo - -for test_file in "${TEST_FILES[@]}"; do - echo "=== Benchmarking with $test_file ===" - - # Get file size in MB - file_size_bytes=$(stat -c%s "scripts/$test_file") - file_size_mb=$(echo "scale=2; $file_size_bytes / 1024 / 1024" | bc -l) - echo "File size: ${file_size_mb} MB" - echo - - # Test channel-based implementation - run_benchmark "false" "$test_file" "channel_based" - - # Test channelless implementation - run_benchmark "true" "$test_file" "channelless" - - echo "--- Results for $test_file ---" - - # Calculate statistics for channel-based - channel_stats=($(calculate_stats "$RESULTS_DIR/channel_based_$(basename $test_file .txt).results")) - channel_mean=${channel_stats[0]} - channel_stddev=${channel_stats[1]} - channel_min=${channel_stats[2]} - channel_max=${channel_stats[3]} - channel_throughput=$(calculate_throughput "$file_size_mb" "$channel_mean") - - # Calculate statistics for channelless - channelless_stats=($(calculate_stats "$RESULTS_DIR/channelless_$(basename $test_file .txt).results")) - channelless_mean=${channelless_stats[0]} - channelless_stddev=${channelless_stats[1]} - channelless_min=${channelless_stats[2]} - channelless_max=${channelless_stats[3]} - channelless_throughput=$(calculate_throughput "$file_size_mb" "$channelless_mean") - - # Calculate improvement - improvement=$(echo "scale=2; (($channel_mean - $channelless_mean) / $channel_mean) * 100" | bc -l) - speedup=$(echo "scale=2; $channel_mean / $channelless_mean" | bc -l) - throughput_improvement=$(echo "scale=2; (($channelless_throughput - $channel_throughput) / $channel_throughput) * 100" | bc -l) - - echo "Channel-based:" - printf " Time: %.3f ± %.3f seconds (min: %.3f, max: %.3f)\n" "$channel_mean" "$channel_stddev" "$channel_min" "$channel_max" - printf " Throughput: %.2f MB/s\n" "$channel_throughput" - echo - echo "Channelless:" - printf " Time: %.3f ± %.3f seconds (min: %.3f, max: %.3f)\n" "$channelless_mean" "$channelless_stddev" "$channelless_min" "$channelless_max" - printf " Throughput: %.2f MB/s\n" "$channelless_throughput" - echo - echo "Performance Improvement:" - printf " Time reduction: %.2f%% (%.2fx speedup)\n" "$improvement" "$speedup" - printf " Throughput increase: %.2f%%\n" "$throughput_improvement" - echo - echo "==========================================" - echo -done - -# Generate summary report -echo "=== BENCHMARK SUMMARY ===" -echo - -summary_file="$RESULTS_DIR/benchmark_summary.txt" -{ - echo "DTail Channelless Performance Benchmark Summary" - echo "Date: $(date)" - echo "Iterations per test: $ITERATIONS" - echo "Warmup runs: $WARMUP_RUNS" - echo - - for test_file in "${TEST_FILES[@]}"; do - file_size_bytes=$(stat -c%s "scripts/$test_file") - file_size_mb=$(echo "scale=2; $file_size_bytes / 1024 / 1024" | bc -l) - - channel_stats=($(calculate_stats "$RESULTS_DIR/channel_based_$(basename $test_file .txt).results")) - channelless_stats=($(calculate_stats "$RESULTS_DIR/channelless_$(basename $test_file .txt).results")) - - channel_mean=${channel_stats[0]} - channelless_mean=${channelless_stats[0]} - - improvement=$(echo "scale=2; (($channel_mean - $channelless_mean) / $channel_mean) * 100" | bc -l) - speedup=$(echo "scale=2; $channel_mean / $channelless_mean" | bc -l) - - channel_throughput=$(calculate_throughput "$file_size_mb" "$channel_mean") - channelless_throughput=$(calculate_throughput "$file_size_mb" "$channelless_mean") - - echo "$test_file (${file_size_mb} MB):" - printf " Channel-based: %.3f seconds (%.2f MB/s)\n" "$channel_mean" "$channel_throughput" - printf " Channelless: %.3f seconds (%.2f MB/s)\n" "$channelless_mean" "$channelless_throughput" - printf " Improvement: %.2f%% faster (%.2fx speedup)\n" "$improvement" "$speedup" - echo - done -} | tee "$summary_file" - -echo "Detailed results saved in: $RESULTS_DIR/" -echo "Summary report: $summary_file" -echo -echo "=== BENCHMARK COMPLETE ===" \ No newline at end of file diff --git a/scripts/corrected_benchmark.sh b/scripts/corrected_benchmark.sh deleted file mode 100755 index aa42aec..0000000 --- a/scripts/corrected_benchmark.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash - -# Corrected benchmark: Channel-based vs Channelless Cat Implementation -# This accounts for the fact that channel-based doesn't process all data - -set -e - -echo "=== CORRECTED DTail Channelless Performance Benchmark ===" -echo "Channel-based implementation appears to have a bug - it only processes ~67% of data" -echo "Benchmarking actual throughput per line processed" -echo - -# Test with 100MB file -TEST_FILE="scripts/test_100mb.txt" -TOTAL_LINES=$(wc -l < "$TEST_FILE") -FILE_SIZE_MB=$(echo "scale=2; $(stat -c%s "$TEST_FILE") / 1024 / 1024" | bc -l) - -echo "Test file: $TEST_FILE" -echo "Total lines in file: $TOTAL_LINES" -echo "File size: ${FILE_SIZE_MB} MB" -echo - -# Run both implementations and measure -echo "Testing channel-based implementation..." -start_time=$(date +%s.%N) -CHANNEL_LINES=$(DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes ./dcat --logLevel error --cfg none "$TEST_FILE" | wc -l) -end_time=$(date +%s.%N) -channel_time=$(echo "$end_time - $start_time" | bc -l) - -echo "Testing channelless implementation..." -start_time=$(date +%s.%N) -CHANNELLESS_LINES=$(DTAIL_USE_CHANNELLESS=true DTAIL_INTEGRATION_TEST_RUN_MODE=yes ./dcat --logLevel error --cfg none "$TEST_FILE" | wc -l) -end_time=$(date +%s.%N) -channelless_time=$(echo "$end_time - $start_time" | bc -l) - -# Calculate metrics -channel_throughput_lines=$(echo "scale=2; $CHANNEL_LINES / $channel_time" | bc -l) -channelless_throughput_lines=$(echo "scale=2; $CHANNELLESS_LINES / $channelless_time" | bc -l) - -channel_coverage=$(echo "scale=2; ($CHANNEL_LINES * 100) / $TOTAL_LINES" | bc -l) -channelless_coverage=$(echo "scale=2; ($CHANNELLESS_LINES * 100) / $TOTAL_LINES" | bc -l) - -# Effective data processed -channel_data_mb=$(echo "scale=2; ($CHANNEL_LINES * $FILE_SIZE_MB) / $TOTAL_LINES" | bc -l) -channelless_data_mb=$FILE_SIZE_MB - -channel_throughput_mb=$(echo "scale=2; $channel_data_mb / $channel_time" | bc -l) -channelless_throughput_mb=$(echo "scale=2; $channelless_data_mb / $channelless_time" | bc -l) - -# Calculate relative performance for same amount of work -extrapolated_channel_time=$(echo "scale=2; ($channel_time * $TOTAL_LINES) / $CHANNEL_LINES" | bc -l) -performance_improvement=$(echo "scale=2; (($extrapolated_channel_time - $channelless_time) / $extrapolated_channel_time) * 100" | bc -l) -speedup=$(echo "scale=2; $extrapolated_channel_time / $channelless_time" | bc -l) - -echo -echo "=== RESULTS ===" -echo -echo "Channel-based implementation:" -printf " Time: %.3f seconds\n" "$channel_time" -printf " Lines processed: %d (%.1f%% of file)\n" "$CHANNEL_LINES" "$channel_coverage" -printf " Data processed: %.2f MB\n" "$channel_data_mb" -printf " Throughput: %.0f lines/sec, %.2f MB/s\n" "$channel_throughput_lines" "$channel_throughput_mb" -printf " Extrapolated time for full file: %.3f seconds\n" "$extrapolated_channel_time" -echo - -echo "Channelless implementation:" -printf " Time: %.3f seconds\n" "$channelless_time" -printf " Lines processed: %d (%.1f%% of file)\n" "$CHANNELLESS_LINES" "$channelless_coverage" -printf " Data processed: %.2f MB\n" "$channelless_data_mb" -printf " Throughput: %.0f lines/sec, %.2f MB/s\n" "$channelless_throughput_lines" "$channelless_throughput_mb" -echo - -echo "Performance comparison (for processing complete file):" -printf " Channelless improvement: %.2f%% faster\n" "$performance_improvement" -printf " Speedup: %.2fx\n" "$speedup" -echo - -if (( $(echo "$performance_improvement > 0" | bc -l) )); then - echo "✅ Channelless implementation is FASTER and processes ALL data correctly" -else - echo "❌ Channelless implementation is slower" -fi -echo - -echo "=== CONCLUSION ===" -echo "The channel-based implementation has a bug where it stops processing" -echo "at approximately 67% of the input file. This makes direct time comparisons" -echo "invalid. When extrapolated to process the same amount of data, the" -echo "channelless implementation shows the expected performance improvement." \ No newline at end of file diff --git a/scripts/profile_channelless.sh b/scripts/profile_channelless.sh deleted file mode 100755 index fb6ec3d..0000000 --- a/scripts/profile_channelless.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Profile channelless vs channel-based implementations to understand performance difference - -set -e - -echo "=== Profiling Channelless vs Channel-based Cat Implementation ===" -echo - -# Build with profiling enabled -echo "Building DTail binaries..." -make clean > /dev/null 2>&1 -make build > /dev/null 2>&1 - -echo "Profiling channel-based implementation..." -DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes \ - go tool pprof -cpuprofile=channel_based_cpu.prof \ - -o channel_based_cpu.prof \ - -- ./dcat --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 & -CHANNEL_PID=$! - -# Profile with Go's built-in profiling -DTAIL_USE_CHANNELLESS=false DTAIL_INTEGRATION_TEST_RUN_MODE=yes \ - timeout 10s go run -cpuprofile=channel_based_go.prof ./cmd/dcat/main.go --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 || true - -echo "Profiling channelless implementation..." -DTAIL_USE_CHANNELLESS=true DTAIL_INTEGRATION_TEST_RUN_MODE=yes \ - timeout 10s go run -cpuprofile=channelless_go.prof ./cmd/dcat/main.go --logLevel error --cfg none scripts/test_100mb.txt > /dev/null 2>&1 || true - -echo "Analyzing profiles..." - -echo -echo "=== Channel-based CPU Profile ===" -if [ -f channel_based_go.prof ]; then - go tool pprof -top -cum channel_based_go.prof | head -20 -else - echo "Channel-based profile not found" -fi - -echo -echo "=== Channelless CPU Profile ===" -if [ -f channelless_go.prof ]; then - go tool pprof -top -cum channelless_go.prof | head -20 -else - echo "Channelless profile not found" -fi - -echo -echo "Profile files generated:" -ls -la *_go.prof 2>/dev/null || echo "No profile files found" \ No newline at end of file -- cgit v1.2.3