summaryrefslogtreecommitdiff
path: root/profiling
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2025-06-26 21:10:07 +0300
committerPaul Buetow <paul@buetow.org>2025-06-26 21:10:07 +0300
commit513c70e297059822384140ee7e5939d20fd0bdc1 (patch)
treed6619230b54c4956d138c17c43df0fc72bb6f71a /profiling
parent4a657e44e7111d7d3b9a9ba5e453901e19af2ecb (diff)
refactor: move profiling scripts from benchmarks/ to profiling/
- Moved profile_benchmarks.sh, profile_dmap.sh, and profile_quick.sh to the profiling/ directory where they belong - Updated Makefile targets to reference new locations - Fixed profile_dmap.sh to remove outfile clauses since they're not needed for profiling and were preventing proper execution - Updated .gitignore to exclude generated files in profiling/ This better separates benchmarking (performance comparison) from profiling (performance analysis). 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
Diffstat (limited to 'profiling')
-rwxr-xr-xprofiling/profile_benchmarks.sh211
-rwxr-xr-xprofiling/profile_dmap.sh154
-rwxr-xr-xprofiling/profile_quick.sh86
3 files changed, 451 insertions, 0 deletions
diff --git a/profiling/profile_benchmarks.sh b/profiling/profile_benchmarks.sh
new file mode 100755
index 0000000..6be86cd
--- /dev/null
+++ b/profiling/profile_benchmarks.sh
@@ -0,0 +1,211 @@
+#!/bin/bash
+
+# Profile benchmarks script for dtail commands
+# This script runs profiling on dcat, dgrep, and dmap with various workloads
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Default values
+PROFILE_DIR="${PROFILE_DIR:-profiles}"
+TEST_DATA_DIR="${TEST_DATA_DIR:-testdata}"
+PROFILE_RUNS=1
+
+# Create directories
+mkdir -p "$PROFILE_DIR"
+mkdir -p "$TEST_DATA_DIR"
+
+echo -e "${GREEN}DTail Profiling Framework${NC}"
+echo "=========================="
+echo
+
+# Function to generate test data
+generate_test_data() {
+ local size=$1
+ local filename=$2
+
+ if [ ! -f "$filename" ]; then
+ echo -e "${YELLOW}Generating test data: $filename (${size})${NC}"
+ # Use the standalone generator
+ echo " Command: go run cmd/generate_profile_data.go -size \"${size}\" -output \"$filename\" -format log"
+ go run cmd/generate_profile_data.go -size "${size}" -output "$filename" -format log
+ fi
+}
+
+# Function to run profiling
+run_profile() {
+ local cmd=$1
+ local name=$2
+ local args=$3
+
+ echo -e "${GREEN}Profiling $cmd - $name${NC}"
+
+ for i in $(seq 1 $PROFILE_RUNS); do
+ echo " Run $i/$PROFILE_RUNS..."
+ echo " Command: timeout 30s $cmd -profile -profiledir $PROFILE_DIR $args"
+
+ # Run with CPU and memory profiling with timeout
+ timeout 30s $cmd -profile -profiledir "$PROFILE_DIR" $args > /dev/null 2>&1
+ local exit_code=$?
+
+ if [ $exit_code -eq 124 ]; then
+ echo -e " ${YELLOW}Warning: Run $i timed out after 30s${NC}"
+ elif [ $exit_code -ne 0 ]; then
+ echo -e " ${RED}Error: Run $i failed with exit code $exit_code${NC}"
+ fi
+
+ # Small delay between runs
+ sleep 1
+ done
+
+ echo
+}
+
+# Special function for profiling dmap which runs continuously
+run_profile_dmap() {
+ local cmd=$1
+ local name=$2
+ local args=$3
+
+ echo -e "${GREEN}Profiling $cmd - $name${NC}"
+
+ for i in $(seq 1 $PROFILE_RUNS); do
+ echo " Run $i/$PROFILE_RUNS..."
+ echo " Command: $cmd -profile -profiledir $PROFILE_DIR $args (will interrupt after 3s)"
+
+ # Run dmap in background, wait a bit for it to process, then interrupt it
+ $cmd -profile -profiledir "$PROFILE_DIR" $args > /dev/null 2>&1 &
+ local pid=$!
+
+ # Wait for dmap to process the file and generate initial results
+ sleep 3
+
+ # Send interrupt signal to make it exit cleanly
+ # We expect this to return non-zero, so we ignore the exit code
+ kill -INT $pid 2>/dev/null || true
+ wait $pid 2>/dev/null || true
+
+ echo " Completed"
+
+ # Small delay between runs
+ sleep 1
+ done
+
+ echo
+}
+
+# Generate test data
+echo -e "${GREEN}Preparing test data...${NC}"
+generate_test_data "1MB" "$TEST_DATA_DIR/small.log"
+generate_test_data "10MB" "$TEST_DATA_DIR/medium.log"
+# Skip large file for faster testing
+# generate_test_data "1GB" "$TEST_DATA_DIR/large.log"
+
+# Generate CSV data for dmap (smaller size for faster processing)
+if [ ! -f "$TEST_DATA_DIR/test.csv" ]; then
+ echo -e "${YELLOW}Generating CSV test data${NC}"
+ echo " Command: go run cmd/generate_profile_data.go -size \"10MB\" -output \"$TEST_DATA_DIR/test.csv\" -format csv"
+ go run cmd/generate_profile_data.go -size "10MB" -output "$TEST_DATA_DIR/test.csv" -format csv
+fi
+
+echo
+
+# Build commands
+echo -e "${GREEN}Building commands...${NC}"
+echo " Command: cd .. && make dcat dgrep dmap"
+cd ..
+make dcat dgrep dmap
+cd "$SCRIPT_DIR"
+
+echo
+
+# Profile dcat
+echo -e "${GREEN}=== Profiling dcat ===${NC}"
+run_profile "../dcat" "small_file" "-plain -cfg none $TEST_DATA_DIR/small.log"
+# Skip medium file for faster profiling
+# run_profile "../dcat" "medium_file" "-plain -cfg none $TEST_DATA_DIR/medium.log"
+# Skip large file for faster profiling - uncomment if needed
+# run_profile "../dcat" "large_file" "-plain -cfg none $TEST_DATA_DIR/large.log"
+
+# Profile dgrep
+echo -e "${GREEN}=== Profiling dgrep ===${NC}"
+run_profile "../dgrep" "simple_regex" "-plain -cfg none -regex 'user[0-9]+' $TEST_DATA_DIR/small.log"
+# Use small file for faster profiling
+# run_profile "../dgrep" "complex_regex" "-plain -cfg none -regex '\\d{4}-\\d{2}-\\d{2}.*login.*\\d{3}' $TEST_DATA_DIR/medium.log"
+# run_profile "../dgrep" "with_context" "-plain -cfg none -regex 'login' -before 2 -after 2 $TEST_DATA_DIR/medium.log"
+
+# Profile dmap
+echo -e "${GREEN}=== Profiling dmap ===${NC}"
+
+# Generate DTail default format test data for dmap
+if [ ! -f "$TEST_DATA_DIR/dtail_format.log" ]; then
+ echo -e "${YELLOW}Generating DTail format test data for dmap${NC}"
+ echo " Command: Creating DTail format log file"
+ # Generate DTail default format log lines
+ for i in $(seq 1 1000); do
+ hostname="host$((i % 10))"
+ goroutines=$((40 + i % 40))
+ cgocalls=$((i % 100))
+ cpus=$((1 + i % 8))
+ loadavg=$(printf "%.2f" $(echo "scale=2; $i % 100 / 100" | bc))
+ uptime="${i}h0m0s"
+ connections=$((i % 10))
+ lifetime=$((1000 + i))
+
+ echo "INFO|$(date +%m%d-%H%M%S)|1|stats.go:56|$cpus|$goroutines|$cgocalls|$loadavg|$uptime|MAPREDUCE:STATS|currentConnections=$connections|lifetimeConnections=$lifetime"
+ done > "$TEST_DATA_DIR/dtail_format.log"
+fi
+
+# Profile dmap with DTail format
+run_profile_dmap "../dmap" "simple_count" "-plain -cfg none -query 'from STATS select count(*)' -files $TEST_DATA_DIR/dtail_format.log"
+run_profile_dmap "../dmap" "aggregations" "-plain -cfg none -query 'from STATS select sum(\$goroutines),avg(\$cgocalls),max(lifetimeConnections)' -files $TEST_DATA_DIR/dtail_format.log"
+run_profile_dmap "../dmap" "group_by_connections" "-plain -cfg none -query 'from STATS select currentConnections,count(*) group by currentConnections' -files $TEST_DATA_DIR/dtail_format.log"
+
+# Also test CSV format
+echo -e "\n${YELLOW}Testing CSV format with dmap${NC}"
+run_profile_dmap "../dmap" "csv_query" "-plain -cfg none -query 'select user,action,count(*) where status=\"success\" group by user,action logformat csv' -files $TEST_DATA_DIR/test.csv"
+
+echo
+echo -e "${GREEN}Profiling complete!${NC}"
+echo
+
+# Analyze profiles
+echo -e "${GREEN}=== Profile Analysis ===${NC}"
+echo "Profile files generated in: $PROFILE_DIR"
+echo
+
+# List recent profiles
+echo "Recent CPU profiles:"
+ls -lt "$PROFILE_DIR"/*_cpu_*.prof 2>/dev/null | head -5 || echo " No CPU profiles found"
+
+echo
+echo "Recent memory profiles:"
+ls -lt "$PROFILE_DIR"/*_mem_*.prof 2>/dev/null | head -5 || echo " No memory profiles found"
+
+echo
+echo "Recent allocation profiles:"
+ls -lt "$PROFILE_DIR"/*_alloc_*.prof 2>/dev/null | head -5 || echo " No allocation profiles found"
+
+echo
+echo -e "${GREEN}To analyze a profile, use:${NC}"
+echo " go tool pprof <profile_file>"
+echo " ../profiling/profile.sh <profile_file>"
+echo
+echo -e "${GREEN}Examples:${NC}"
+echo " # Interactive analysis"
+echo " go tool pprof $PROFILE_DIR/dcat_cpu_*.prof"
+echo
+echo " # Generate flame graph"
+echo " go tool pprof -http=:8080 $PROFILE_DIR/dcat_cpu_*.prof"
+echo
+echo " # Quick summary with dprofile"
+echo " ../profiling/profile.sh $PROFILE_DIR/dcat_cpu_*.prof"
+echo \ No newline at end of file
diff --git a/profiling/profile_dmap.sh b/profiling/profile_dmap.sh
new file mode 100755
index 0000000..1abf629
--- /dev/null
+++ b/profiling/profile_dmap.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+
+# Profile script specifically for dmap with MapReduce format data
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Default values
+PROFILE_DIR="${PROFILE_DIR:-profiles}"
+TEST_DATA_DIR="${TEST_DATA_DIR:-testdata}"
+
+# Create directories
+mkdir -p "$PROFILE_DIR"
+mkdir -p "$TEST_DATA_DIR"
+
+echo -e "${GREEN}DTail dmap Profiling${NC}"
+echo "===================="
+echo
+
+# Function to generate MapReduce format test data (generickv format)
+generate_mapreduce_data() {
+ local filename=$1
+ local lines=$2
+
+ if [ ! -f "$filename" ]; then
+ echo -e "${YELLOW}Generating MapReduce format test data: $filename${NC}"
+ echo " Command: Creating $filename with $lines lines (generickv format)"
+
+ # Generate data in generickv format: field1=value1|field2=value2|...
+ for i in $(seq 1 $lines); do
+ hostname="host$((i % 10))"
+ # Simple timestamp generation without date command
+ hour=$((10 + (i / 3600) % 24))
+ min=$(((i / 60) % 60))
+ sec=$((i % 60))
+ timestamp=$(printf "2024-01-01T%02d:%02d:%02d.000Z" $hour $min $sec)
+ goroutines=$((40 + i % 40))
+ openFiles=$((100 + i % 50))
+ connections=$((10 + i % 20))
+ currentConnections=$((i % 10))
+ lifetimeConnections=$((1000 + i))
+
+ echo "table=STATS|hostname=$hostname|timestamp=$timestamp|goroutines=$goroutines|openFiles=$openFiles|connections=$connections|currentConnections=$currentConnections|lifetimeConnections=$lifetimeConnections" >> "$filename"
+ done
+ fi
+}
+
+# Generate test data in DTail default format instead
+echo -e "${GREEN}Preparing MapReduce test data...${NC}"
+
+# Function to generate DTail default format test data
+generate_dtail_format_data() {
+ local filename=$1
+ local lines=$2
+
+ if [ ! -f "$filename" ]; then
+ echo -e "${YELLOW}Generating DTail default format test data: $filename${NC}"
+ echo " Command: Creating $filename with $lines lines (DTail default format)"
+
+ # Generate DTail default format log lines
+ for i in $(seq 1 $lines); do
+ hostname="host$((i % 10))"
+ goroutines=$((40 + i % 40))
+ cgocalls=$((i % 100))
+ cpus=$((1 + i % 8))
+ loadavg=$(printf "%.2f" $(echo "scale=2; $i % 100 / 100" | bc))
+ uptime="${i}h0m0s"
+ connections=$((i % 10))
+ lifetime=$((1000 + i))
+
+ # DTail default format: INFO|date-time|pid|caller|cpus|goroutines|cgocalls|loadavg|uptime|MAPREDUCE:STATS|key=value|...
+ echo "INFO|$(date +%m%d-%H%M%S)|1|stats.go:56|$cpus|$goroutines|$cgocalls|$loadavg|$uptime|MAPREDUCE:STATS|hostname=$hostname|currentConnections=$connections|lifetimeConnections=$lifetime" >> "$filename"
+ done
+ fi
+}
+
+generate_dtail_format_data "$TEST_DATA_DIR/stats_small.log" 100
+generate_dtail_format_data "$TEST_DATA_DIR/stats_medium.log" 1000
+
+# Build dmap
+echo -e "${GREEN}Building commands...${NC}"
+echo " Command: cd .. && make dmap"
+cd ..
+make dmap 2>/dev/null || true
+cd "$SCRIPT_DIR"
+
+echo
+
+# Profile different dmap queries
+echo -e "${GREEN}Profiling dmap queries...${NC}"
+
+# Query 1: Simple count
+echo -e "\n${YELLOW}Query: Count by hostname${NC}"
+QUERY="from STATS select count(\$line) group by hostname"
+echo "Command: ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log (will interrupt after 3s)"
+# Run dmap in background and interrupt after 3 seconds
+../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10 &
+DMAP_PID=$!
+sleep 3
+kill -INT $DMAP_PID 2>/dev/null || true
+wait $DMAP_PID 2>/dev/null || true
+
+# Query 2: Aggregations
+echo -e "\n${YELLOW}Query: Sum and average${NC}"
+QUERY="from STATS select sum(\$goroutines),avg(\$goroutines) group by hostname"
+echo "Command: ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log (will interrupt after 3s)"
+../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10 &
+DMAP_PID=$!
+sleep 3
+kill -INT $DMAP_PID 2>/dev/null || true
+wait $DMAP_PID 2>/dev/null || true
+
+# Query 3: Min/Max
+echo -e "\n${YELLOW}Query: Min and max${NC}"
+QUERY="from STATS select min(currentConnections),max(lifetimeConnections) group by hostname"
+echo "Command: ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log (will interrupt after 3s)"
+../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10 &
+DMAP_PID=$!
+sleep 3
+kill -INT $DMAP_PID 2>/dev/null || true
+wait $DMAP_PID 2>/dev/null || true
+
+echo
+echo -e "${GREEN}Analyzing dmap profiles...${NC}"
+
+# Find and analyze latest dmap profiles
+DMAP_CPU=$(ls -t "$PROFILE_DIR"/dmap_cpu_*.prof 2>/dev/null | head -1)
+if [ -n "$DMAP_CPU" ]; then
+ echo -e "\nCPU Profile: $(basename "$DMAP_CPU")"
+ ../profiling/profile.sh -top 5 "$DMAP_CPU" 2>/dev/null || echo " Analysis failed"
+fi
+
+DMAP_MEM=$(ls -t "$PROFILE_DIR"/dmap_mem_*.prof 2>/dev/null | head -1)
+if [ -n "$DMAP_MEM" ]; then
+ echo -e "\nMemory Profile: $(basename "$DMAP_MEM")"
+ ../profiling/profile.sh -top 5 "$DMAP_MEM" 2>/dev/null || echo " Analysis failed"
+fi
+
+echo
+echo -e "${GREEN}dmap profiling complete!${NC}"
+echo
+echo "To analyze profiles in detail:"
+echo " go tool pprof $PROFILE_DIR/dmap_cpu_*.prof"
+echo " go tool pprof -alloc_space $PROFILE_DIR/dmap_mem_*.prof"
+
+# No cleanup needed - no output files are created during profiling
diff --git a/profiling/profile_quick.sh b/profiling/profile_quick.sh
new file mode 100755
index 0000000..1aa9425
--- /dev/null
+++ b/profiling/profile_quick.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Quick profile script for dtail commands
+# This runs profiling with smaller datasets for faster results
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Default values
+PROFILE_DIR="${PROFILE_DIR:-profiles}"
+TEST_DATA_DIR="${TEST_DATA_DIR:-testdata}"
+
+# Create directories
+mkdir -p "$PROFILE_DIR"
+mkdir -p "$TEST_DATA_DIR"
+
+echo -e "${GREEN}DTail Quick Profiling${NC}"
+echo "====================="
+echo
+
+# Generate test data if needed
+if [ ! -f "$TEST_DATA_DIR/quick_test.log" ]; then
+ echo -e "${YELLOW}Generating test data...${NC}"
+ echo " Command: go run generate_profile_data.go -size \"10MB\" -output \"$TEST_DATA_DIR/quick_test.log\" -format log"
+ go run generate_profile_data.go -size "10MB" -output "$TEST_DATA_DIR/quick_test.log" -format log
+ echo " Command: go run generate_profile_data.go -size \"10MB\" -output \"$TEST_DATA_DIR/quick_test.csv\" -format csv"
+ go run generate_profile_data.go -size "10MB" -output "$TEST_DATA_DIR/quick_test.csv" -format csv
+fi
+
+# Build commands
+echo -e "${GREEN}Building commands...${NC}"
+echo " Command: cd .. && make dcat dgrep dmap"
+cd ..
+make dcat dgrep dmap 2>/dev/null || true
+cd "$SCRIPT_DIR"
+
+echo
+echo -e "${GREEN}Running quick profiles...${NC}"
+
+# Profile dcat
+echo -e "\n${YELLOW}Profiling dcat...${NC}"
+echo "Command: ../dcat -profile -profiledir $PROFILE_DIR -plain -cfg none $TEST_DATA_DIR/quick_test.log"
+../dcat -profile -profiledir "$PROFILE_DIR" -plain -cfg none "$TEST_DATA_DIR/quick_test.log" > /dev/null 2>&1
+DCAT_CPU=$(ls -t "$PROFILE_DIR"/dcat_cpu_*.prof 2>/dev/null | head -1)
+if [ -n "$DCAT_CPU" ]; then
+ echo " Generated: $(basename "$DCAT_CPU")"
+ echo " Analysis: ../profiling/profile.sh -top 3 $DCAT_CPU"
+ ../profiling/profile.sh -top 3 "$DCAT_CPU" | grep -A 5 "Top 3 functions"
+fi
+
+# Profile dgrep
+echo -e "\n${YELLOW}Profiling dgrep...${NC}"
+echo "Command: ../dgrep -profile -profiledir $PROFILE_DIR -plain -cfg none -regex \"user[0-9]+\" $TEST_DATA_DIR/quick_test.log"
+../dgrep -profile -profiledir "$PROFILE_DIR" -plain -cfg none -regex "user[0-9]+" "$TEST_DATA_DIR/quick_test.log" > /dev/null 2>&1
+DGREP_CPU=$(ls -t "$PROFILE_DIR"/dgrep_cpu_*.prof 2>/dev/null | head -1)
+if [ -n "$DGREP_CPU" ]; then
+ echo " Generated: $(basename "$DGREP_CPU")"
+ echo " Analysis: ../profiling/profile.sh -top 3 $DGREP_CPU"
+ ../profiling/profile.sh -top 3 "$DGREP_CPU" | grep -A 5 "Top 3 functions"
+fi
+
+# Profile dmap
+echo -e "\n${YELLOW}Profiling dmap...${NC}"
+echo "Command: ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"select count(*) from $TEST_DATA_DIR/quick_test.csv\""
+../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "select count(*) from $TEST_DATA_DIR/quick_test.csv" > /dev/null 2>&1
+DMAP_CPU=$(ls -t "$PROFILE_DIR"/dmap_cpu_*.prof 2>/dev/null | head -1)
+if [ -n "$DMAP_CPU" ]; then
+ echo " Generated: $(basename "$DMAP_CPU")"
+ echo " Analysis: ../profiling/profile.sh -top 3 $DMAP_CPU"
+ ../profiling/profile.sh -top 3 "$DMAP_CPU" | grep -A 5 "Top 3 functions"
+fi
+
+echo
+echo -e "${GREEN}Quick profiling complete!${NC}"
+echo
+echo "To analyze in detail:"
+echo " go tool pprof $PROFILE_DIR/<profile_file>"
+echo " make profile-flamegraph PROFILE=$PROFILE_DIR/<profile_file>"
+echo \ No newline at end of file