summary | refs | log | tree | commit | diff
path: root/benchmarks
diff options
context:
space:
mode:
author: Paul Buetow <paul@buetow.org> 2025-06-26 15:10:43 +0300
committer: Paul Buetow <paul@buetow.org> 2025-06-26 15:10:43 +0300
commit: a26d91c804b3d6c774c049868847b536d03aef1a (patch)
tree: e3929c6512b7cf428b3e7517d779ab7fa6e9bb3b /benchmarks
parent: 6664996ced62c77e0c62bc1619662cbed7fccff6 (diff)
fix: resolve dmap profiling issues and optimize profiling speed
Fixed multiple issues preventing dmap from being profiled correctly:
- Updated profile_dmap.sh to use DTail default log format
- Fixed MapReduce queries to use correct field syntax
- Reduced file sizes and run counts for faster profiling
- Added proper command echoing to all data generation steps

Optimizations:
- Reduced PROFILE_RUNS from 3 to 1
- Reduced test data sizes (1MB/10MB instead of 10MB/100MB/1GB)
- Commented out medium/large file tests for faster runs
- Reduced dmap test data from 1000/10000 to 100/1000 lines

The profiling framework now successfully profiles all three commands (dcat, dgrep, dmap) with reasonable execution times.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Diffstat (limited to 'benchmarks')
-rwxr-xr-x benchmarks/profile_benchmarks.sh 52
-rwxr-xr-x benchmarks/profile_dmap.sh 65
2 files changed, 82 insertions, 35 deletions
diff --git a/benchmarks/profile_benchmarks.sh b/benchmarks/profile_benchmarks.sh
index a78182d..b0bcf64 100755
--- a/benchmarks/profile_benchmarks.sh
+++ b/benchmarks/profile_benchmarks.sh
@@ -17,7 +17,7 @@ NC='\033[0m' # No Color
# Default values
PROFILE_DIR="${PROFILE_DIR:-profiles}"
TEST_DATA_DIR="${TEST_DATA_DIR:-testdata}"
-PROFILE_RUNS=3
+PROFILE_RUNS=1
# Create directories
mkdir -p "$PROFILE_DIR"
@@ -71,9 +71,10 @@ run_profile() {
# Generate test data
echo -e "${GREEN}Preparing test data...${NC}"
-generate_test_data "10MB" "$TEST_DATA_DIR/small.log"
-generate_test_data "100MB" "$TEST_DATA_DIR/medium.log"
-generate_test_data "1GB" "$TEST_DATA_DIR/large.log"
+generate_test_data "1MB" "$TEST_DATA_DIR/small.log"
+generate_test_data "10MB" "$TEST_DATA_DIR/medium.log"
+# Skip large file for faster testing
+# generate_test_data "1GB" "$TEST_DATA_DIR/large.log"
# Generate CSV data for dmap (smaller size for faster processing)
if [ ! -f "$TEST_DATA_DIR/test.csv" ]; then
@@ -96,23 +97,48 @@ echo
# Profile dcat
echo -e "${GREEN}=== Profiling dcat ===${NC}"
run_profile "../dcat" "small_file" "-plain -cfg none $TEST_DATA_DIR/small.log"
-run_profile "../dcat" "medium_file" "-plain -cfg none $TEST_DATA_DIR/medium.log"
+# Skip medium file for faster profiling
+# run_profile "../dcat" "medium_file" "-plain -cfg none $TEST_DATA_DIR/medium.log"
# Skip large file for faster profiling - uncomment if needed
# run_profile "../dcat" "large_file" "-plain -cfg none $TEST_DATA_DIR/large.log"
# Profile dgrep
echo -e "${GREEN}=== Profiling dgrep ===${NC}"
-run_profile "../dgrep" "simple_regex" "-plain -cfg none -regex 'user[0-9]+' $TEST_DATA_DIR/medium.log"
-run_profile "../dgrep" "complex_regex" "-plain -cfg none -regex '\\d{4}-\\d{2}-\\d{2}.*login.*\\d{3}' $TEST_DATA_DIR/medium.log"
-run_profile "../dgrep" "with_context" "-plain -cfg none -regex 'login' -before 2 -after 2 $TEST_DATA_DIR/medium.log"
+run_profile "../dgrep" "simple_regex" "-plain -cfg none -regex 'user[0-9]+' $TEST_DATA_DIR/small.log"
+# Use small file for faster profiling
+# run_profile "../dgrep" "complex_regex" "-plain -cfg none -regex '\\d{4}-\\d{2}-\\d{2}.*login.*\\d{3}' $TEST_DATA_DIR/medium.log"
+# run_profile "../dgrep" "with_context" "-plain -cfg none -regex 'login' -before 2 -after 2 $TEST_DATA_DIR/medium.log"
# Profile dmap
echo -e "${GREEN}=== Profiling dmap ===${NC}"
-# Note: dmap uses a special query format for MapReduce operations
-# For CSV files, we need to specify the format and fields correctly
-echo -e "${YELLOW}Note: Skipping dmap profiling - requires specific log format${NC}"
-echo -e "${YELLOW}To profile dmap, use files in MapReduce format with queries like:${NC}"
-echo -e "${YELLOW} from STATS select count(\$line) group by \$hostname${NC}"
+
+# Generate DTail default format test data for dmap
+if [ ! -f "$TEST_DATA_DIR/dtail_format.log" ]; then
+    echo -e "${YELLOW}Generating DTail format test data for dmap${NC}"
+    echo " Command: Creating DTail format log file"
+    # 1000 lines in DTail default format:
+    # INFO|date-time|pid|caller|cpus|goroutines|cgocalls|loadavg|uptime|MAPREDUCE:STATS|key=value|...
+    # One timestamp for the whole file; avoids forking date(1) per line.
+    dtail_stamp=$(date +%m%d-%H%M%S)
+    i=1
+    # loadavg is emitted as 0.NN (NN = i % 100) using shell arithmetic: bc's "%"
+    # under scale=2 always yields 0, which would make every line read 0.00.
+    while [ "$i" -le 1000 ]; do
+        printf 'INFO|%s|1|stats.go:56|%d|%d|%d|0.%02d|%dh0m0s|MAPREDUCE:STATS|currentConnections=%d|lifetimeConnections=%d\n' \
+            "$dtail_stamp" "$((1 + i % 8))" "$((40 + i % 40))" "$((i % 100))" "$((i % 100))" \
+            "$i" "$((i % 10))" "$((1000 + i))"
+        i=$((i + 1))
+    done > "$TEST_DATA_DIR/dtail_format.log"
+fi
+
+# Profile dmap with DTail format
+run_profile "../dmap" "simple_count" "-plain -cfg none -query 'from STATS select count(*)' -files $TEST_DATA_DIR/dtail_format.log"
+run_profile "../dmap" "aggregations" "-plain -cfg none -query 'from STATS select sum(\$goroutines),avg(\$cgocalls),max(lifetimeConnections)' -files $TEST_DATA_DIR/dtail_format.log"
+run_profile "../dmap" "group_by_connections" "-plain -cfg none -query 'from STATS select currentConnections,count(*) group by currentConnections' -files $TEST_DATA_DIR/dtail_format.log"
+
+# Also test CSV format
+echo -e "\n${YELLOW}Testing CSV format with dmap${NC}"
+run_profile "../dmap" "csv_query" "-plain -cfg none -query 'select user,action,count(*) where status=\"success\" group by user,action logformat csv' -files $TEST_DATA_DIR/test.csv"
echo
echo -e "${GREEN}Profiling complete!${NC}"
diff --git a/benchmarks/profile_dmap.sh b/benchmarks/profile_dmap.sh
index 89d148a..a3a1151 100755
--- a/benchmarks/profile_dmap.sh
+++ b/benchmarks/profile_dmap.sh
@@ -25,22 +25,16 @@ echo -e "${GREEN}DTail dmap Profiling${NC}"
echo "===================="
echo
-# Function to generate MapReduce format test data
+# Function to generate MapReduce format test data (generickv format)
generate_mapreduce_data() {
local filename=$1
local lines=$2
if [ ! -f "$filename" ]; then
echo -e "${YELLOW}Generating MapReduce format test data: $filename${NC}"
- echo " Command: Creating $filename with $lines lines"
+ echo " Command: Creating $filename with $lines lines (generickv format)"
- cat > "$filename" << EOF
-STATS|earth|2024-01-01T10:00:00.000Z|goroutines:50;openFiles:120;connections:15;currentConnections:5;lifetimeConnections:1500
-STATS|mars|2024-01-01T10:00:01.000Z|goroutines:45;openFiles:110;connections:12;currentConnections:4;lifetimeConnections:1200
-STATS|venus|2024-01-01T10:00:02.000Z|goroutines:60;openFiles:130;connections:20;currentConnections:8;lifetimeConnections:2000
-EOF
-
- # Repeat the pattern to create larger file
+ # Generate data in generickv format: field1=value1|field2=value2|...
for i in $(seq 1 $lines); do
hostname="host$((i % 10))"
# Simple timestamp generation without date command
@@ -54,15 +48,42 @@ EOF
currentConnections=$((i % 10))
lifetimeConnections=$((1000 + i))
- echo "STATS|$hostname|$timestamp|goroutines:$goroutines;openFiles:$openFiles;connections:$connections;currentConnections:$currentConnections;lifetimeConnections:$lifetimeConnections" >> "$filename"
+ echo "table=STATS|hostname=$hostname|timestamp=$timestamp|goroutines=$goroutines|openFiles=$openFiles|connections=$connections|currentConnections=$currentConnections|lifetimeConnections=$lifetimeConnections" >> "$filename"
done
fi
}
-# Generate test data
+# Generate test data in DTail default format instead
echo -e "${GREEN}Preparing MapReduce test data...${NC}"
-generate_mapreduce_data "$TEST_DATA_DIR/stats_small.log" 1000
-generate_mapreduce_data "$TEST_DATA_DIR/stats_medium.log" 10000
+
+# Function to generate DTail default format test data
+generate_dtail_format_data() {
+    # Write $2 synthetic log lines in DTail default format to file $1.
+    # Does nothing when $1 already exists, so previously generated data is reused.
+    local filename=$1
+    local lines=$2
+    local i=1 stamp
+
+    if [ ! -f "$filename" ]; then
+        echo -e "${YELLOW}Generating DTail default format test data: $filename${NC}"
+        echo " Command: Creating $filename with $lines lines (DTail default format)"
+
+        # One timestamp for the whole file; avoids forking date(1) per line.
+        stamp=$(date +%m%d-%H%M%S)
+        # DTail default format: INFO|date-time|pid|caller|cpus|goroutines|cgocalls|loadavg|uptime|MAPREDUCE:STATS|key=value|...
+        # loadavg is emitted as 0.NN (NN = i % 100) using shell arithmetic: bc's "%"
+        # under scale=2 always yields 0, which would make every line read 0.00.
+        while [ "$i" -le "$lines" ]; do
+            printf 'INFO|%s|1|stats.go:56|%d|%d|%d|0.%02d|%dh0m0s|MAPREDUCE:STATS|hostname=host%d|currentConnections=%d|lifetimeConnections=%d\n' \
+                "$stamp" "$((1 + i % 8))" "$((40 + i % 40))" "$((i % 100))" "$((i % 100))" \
+                "$i" "$((i % 10))" "$((i % 10))" "$((1000 + i))"
+            i=$((i + 1))
+        done > "$filename"  # single redirect instead of reopening the file per line
+    fi
+}
+
+generate_dtail_format_data "$TEST_DATA_DIR/stats_small.log" 100
+generate_dtail_format_data "$TEST_DATA_DIR/stats_medium.log" 1000
# Build dmap
echo -e "${GREEN}Building commands...${NC}"
@@ -78,21 +99,21 @@ echo -e "${GREEN}Profiling dmap queries...${NC}"
# Query 1: Simple count
echo -e "\n${YELLOW}Query: Count by hostname${NC}"
-QUERY="from STATS select count(\$line) group by \$hostname outfile $TEST_DATA_DIR/count_output.csv"
-echo "Command: timeout 30s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
-timeout 30s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -5
+QUERY="from STATS select count(\$line) group by hostname outfile $TEST_DATA_DIR/count_output.csv"
+echo "Command: timeout 10s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
+timeout 10s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10
# Query 2: Aggregations
echo -e "\n${YELLOW}Query: Sum and average${NC}"
-QUERY="from STATS select sum(\$goroutines),avg(\$goroutines) group by \$hostname outfile $TEST_DATA_DIR/sum_avg_output.csv"
-echo "Command: timeout 30s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
-timeout 30s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -5
+QUERY="from STATS select sum(\$goroutines),avg(\$goroutines) group by hostname outfile $TEST_DATA_DIR/sum_avg_output.csv"
+echo "Command: timeout 10s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
+timeout 10s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10
# Query 3: Min/Max
echo -e "\n${YELLOW}Query: Min and max${NC}"
-QUERY="from STATS select min(currentConnections),max(lifetimeConnections) group by \$hostname outfile $TEST_DATA_DIR/min_max_output.csv"
-echo "Command: timeout 30s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
-timeout 30s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -5
+QUERY="from STATS select min(currentConnections),max(lifetimeConnections) group by hostname outfile $TEST_DATA_DIR/min_max_output.csv"
+echo "Command: timeout 10s ../dmap -profile -profiledir $PROFILE_DIR -plain -cfg none -query \"$QUERY\" -files $TEST_DATA_DIR/stats_small.log"
+timeout 10s ../dmap -profile -profiledir "$PROFILE_DIR" -plain -cfg none -query "$QUERY" -files "$TEST_DATA_DIR/stats_small.log" 2>&1 | head -10
echo
echo -e "${GREEN}Analyzing dmap profiles...${NC}"