Azure · meecethereese · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -98,9 +98,21 @@ Scenarios accept CLI arguments (`--ingress-url`, `--rate`, `--duration`, `--work
 
 ### Data flow for results
 
-The vegeta run pipeline streams results in real time: `vegeta attack | tee statefile.bin | vegeta encode | jaggr | tee statefile.json`. This saves raw binary results to `.bin` alongside the per-second jaggr JSON. The scenario script then copies statefile.json content to the `--output-file`. The result file contains **one JSON line per second** of the test. The CI validation sums `code.hist["200"]` across **all lines** and fails only if the total is zero (i.e., no connection was ever successfully routed).
+The vegeta run pipeline streams results in real time:
 
-For multi-pod tests, `modules/vegeta/merge/merge.sh` combines multiple `.bin` files into unified per-second JSON. It uses actual request timestamps for bucketing (not wall-clock time), so it correctly interleaves results from pods that started at slightly different times. The first and last second-buckets may be partial.
+```
+vegeta attack | tee statefile.bin | vegeta encode --to csv | gawk -f modules/vegeta/lib/aggregate.awk | tee statefile.json
+```
+
+This saves raw binary results to `.bin` alongside the per-second JSON. The scenario script then copies statefile.json content to the `--output-file`. The result file contains **one JSON line per second** of the test, with shape:
+
+```json
+{"rps":N,"responded_count":R_n,"response_rate":R_r,"successful_count":M,"success_rate":S_r,"code":{"hist":{...exact codes...},"family":{"0":A,"100":B,"200":C,"300":D,"400":E,"500":F}},"latency":{"p25":x,"p50":y,"p99":z},"bytes_in":{"sum":...},"bytes_out":{"sum":...}}
+```
+
+Two count/rate pairs describe distinct phases of the request lifecycle: `responded_count` / `response_rate` is how many of the `rps` requests came back with a non-0 status (the server actually responded), and `successful_count` / `success_rate` is how many of those responded records were 2xx (the server accepted and fulfilled the request). `response_rate = responded_count / rps`; `success_rate = successful_count / responded_count`, or 0 when no responses came back. Latency percentiles (nanoseconds) are computed over the `responded_count` records only — any non-0 status is a real measurement of server response time, whereas vegeta uses `status_code=0` for transport-level failures whose "latency" is failure-detection time, so those records are excluded; when nothing responded, all three percentiles are 0. `code.hist` preserves the exact status codes seen this second (e.g. `"200":5,"503":2`); `code.family` aggregates the same counts into hundreds families (`0`, `100`, `200`, `300`, `400`, `500`) and always emits all six keys, so downstream plotters see a stable schema. `sum(code.hist[*]) == sum(code.family[*]) == rps` is an invariant. The CI validation sums `code.hist["200"]` across **all lines** and fails only if the total is zero (i.e., no connection was ever successfully routed).
+
+For multi-pod tests, `modules/vegeta/merge/merge.sh` combines multiple `.bin` files into unified per-second JSON. It shares the `modules/vegeta/lib/aggregate.awk` aggregator with the live runner, so live and merged outputs have identical schemas. The merge path sorts by timestamp before aggregating, so it correctly interleaves results from pods that started at slightly different times. The first and last second-buckets may be partial.
 
 ### Helm chart (charts/server)
 

@@ -4,7 +4,14 @@ set -e
 
 function vegeta() {
     echo "Plotting vegeta results..."
-    jplot rps+code.hist.100+code.hist.200+code.hist.300+code.hist.400+code.hist.500 \
+    # Each jplot argument is one panel; "+" joins the series plotted in it.
+    # code.family bins exact codes into hundreds families (0/100/200/300/400/500)
+    # and always emits all six keys, so jplot sees a stable schema whichever
+    # exact codes a controller returns. The two rates are layered:
+    # response_rate = responded_count / rps; success_rate = successful_count (2xx) / responded_count.
+    jplot rps+responded_count+successful_count \
+      response_rate+success_rate \
+      code.family.0+code.family.100+code.family.200+code.family.300+code.family.400+code.family.500 \
       latency.p99+latency.p50+latency.p25 \
       bytes_in.sum+bytes_out.sum
 }

@@ -6,14 +6,129 @@ echo "Testing JPlot module..."
 
 echo "1. Testing JPlot installation..."
 MODULE_DIR=$( cd "$(dirname "${BASH_SOURCE[0]}")/.." ; pwd -P )
+PROJECT_ROOT=$( cd "${MODULE_DIR}/.." ; pwd -P )
 chmod +x "${MODULE_DIR}/install/install.sh"
 echo "Running JPlot installation..."
-"${MODULE_DIR}/install/install.sh" 
+"${MODULE_DIR}/install/install.sh"
 
 # Verify JPlot is installed
 if ! command -v jplot &> /dev/null; then
     echo "ERROR: JPlot installation failed - jplot command not found"
     exit 1
-fi 
+fi
+
+echo "2. Verifying jplot field paths match the vegeta aggregator schema..."
+# Drift between modules/jplot/run/run.sh's hardcoded jplot args and the schema
+# emitted by modules/vegeta/lib/aggregate.awk silently breaks the live
+# dashboard. This test:
+#   - generates a representative statefile.json via aggregate.awk
+#   - dispatches the jplot run script with `jplot` overridden to print its args
+#   - resolves each printed field path against the sample JSON with jq
+# Any path that doesn't resolve (e.g. removed field, renamed key) fails CI.
+
+AGG="${PROJECT_ROOT}/modules/vegeta/lib/aggregate.awk"
+if [[ ! -f "$AGG" ]]; then
+    echo "ERROR: vegeta aggregator not found at ${AGG}"
+    exit 1
+fi
+
+SAMPLE_CSV=$(mktemp -t jplot-test.XXXXXX.csv)
+SAMPLE_JSON=$(mktemp -t jplot-test.XXXXXX.json)
+trap 'rm -f -- "${SAMPLE_CSV}" "${SAMPLE_JSON}"' EXIT  # RETURN fires only for functions/sourced files; EXIT also covers the `exit 1` paths
+
+# One record per hundreds family in the same second-bucket so every family
+# bin (0/100/200/300/400/500) has a non-zero count. This exercises every
+# code.family.* path jplot might reference, not just the families that happen
+# to appear in lighter tests.
+#   0   transport failure                  → code.family.0   = 1
+#   101 1xx (Switching Protocols)          → code.family.100 = 1
+#   200 2xx (success — successful_count)   → code.family.200 = 1
+#   301 3xx (redirect)                     → code.family.300 = 1
+#   404 4xx (client error)                 → code.family.400 = 1
+#   503 5xx (server error)                 → code.family.500 = 1
+cat > "${SAMPLE_CSV}" <<'EOF'
+1700000001000000,101,5000000,0,13,
+1700000002000000,200,5000000,0,13,
+1700000003000000,301,5000000,0,13,
+1700000004000000,404,5000000,0,13,
+1700000005000000,503,5000000,0,13,
+1700000006000000,0,30000000000,0,0,
+EOF
+gawk -f "${AGG}" "${SAMPLE_CSV}" > "${SAMPLE_JSON}"
+SAMPLE_LINE=$(head -n1 "${SAMPLE_JSON}")
+if [[ -z "${SAMPLE_LINE}" ]]; then
+    echo "ERROR: aggregate.awk produced no sample output"
+    exit 1
+fi
+
+# Sanity: confirm every family bin is populated as designed. If a future
+# aggregator change accidentally collapses some families, this catches it
+# before the schema-resolution loop does.
+echo "${SAMPLE_LINE}" | jq -e \
+    '.code.family == {"0":1,"100":1,"200":1,"300":1,"400":1,"500":1}' > /dev/null \
+    || { echo "ERROR: sample code.family did not populate every bin"; echo "${SAMPLE_LINE}"; exit 1; }
+echo "  ✓ sample populates every family bin (0/100/200/300/400/500) with count 1"
+
+# Capture jplot's argument list by overriding it in a child bash invocation.
+# `source` with positional args dispatches run.sh's vegeta function, which
+# calls our stub `jplot` instead of the real binary.
+JPLOT_ARGS=$(bash <<EOF
+jplot() { printf '%s\n' "\$@"; }
+source "${MODULE_DIR}/run/run.sh" vegeta
+EOF
+)
+
+# Translate dotted paths (rps, code.family.0, latency.p99) into jq selectors.
+# Numeric path segments need bracket notation: code.family.0 → .code.family["0"].
+to_jq_path() {
+    # Leading all-digit segment → .["0"]; a bare ["0"] is a jq array literal (always truthy).
+    gawk -F. '
+    {
+        out = ""
+        for (i = 1; i <= NF; i++) {
+            if ($i ~ /^[0-9]+$/) {
+                root = (i == 1) ? "." : ""
+                out = out root "[\"" $i "\"]"
+            } else {
+                out = out "." $i
+            }
+        }
+        if (out == "") out = "."  # blank line: fall back to the identity path
+        print out
+    }'
+}
+
+MISSING_PATHS=()
+PANEL_COUNT=0
+PATH_COUNT=0
+while IFS= read -r panel; do
+    [[ -z "$panel" ]] && continue
+    [[ "$panel" == "Plotting vegeta results..." ]] && continue
+    PANEL_COUNT=$((PANEL_COUNT + 1))
+    IFS='+' read -ra FIELDS <<< "$panel"
+    for field in "${FIELDS[@]}"; do
+        PATH_COUNT=$((PATH_COUNT + 1))
+        jq_path=$(printf '%s' "$field" | to_jq_path)
+        if ! echo "$SAMPLE_LINE" | jq -e "${jq_path}" > /dev/null 2>&1; then
+            MISSING_PATHS+=("$field   (jq: ${jq_path})")
+        fi
+    done
+done <<< "${JPLOT_ARGS}"
+
+if [[ "${PANEL_COUNT}" -eq 0 ]]; then
+    echo "ERROR: failed to extract any jplot panels from ${MODULE_DIR}/run/run.sh"
+    echo "Captured output was:"
+    echo "${JPLOT_ARGS}"
+    exit 1
+fi
+
+if [[ ${#MISSING_PATHS[@]} -gt 0 ]]; then
+    echo "ERROR: jplot is configured to plot fields that don't exist in aggregate.awk's output:"
+    printf '  - %s\n' "${MISSING_PATHS[@]}"
+    echo "Sample JSON line:"
+    echo "${SAMPLE_LINE}"
+    exit 1
+fi
+echo "  ✓ all ${PATH_COUNT} jplot field paths across ${PANEL_COUNT} panels resolve in the aggregator's JSON"
 
 echo "🎉 All JPlot module tests passed!"
@@ -1,8 +1,7 @@
 #!/bin/bash
 
-# Script to install Vegeta HTTP load testing tool and jaggr
+# Script to install Vegeta HTTP load testing tool
 # https://github.com/tsenart/vegeta
-# https://github.com/rs/jaggr
 
 set -e
 
@@ -43,25 +42,6 @@ sudo chmod +x /usr/local/bin/vegeta
 echo "Verifying Vegeta installation..."
 vegeta --version
 
-# Install jaggr
-JAGGR_VERSION="1.0.0"
-JAGGR_URL="https://github.com/rs/jaggr/releases/download/${JAGGR_VERSION}/jaggr_${JAGGR_VERSION}_linux_${ARCH}.tar.gz"
-
-echo "Downloading jaggr v${JAGGR_VERSION} for Linux ${ARCH}..."
-wget -q "${JAGGR_URL}" -O jaggr.tar.gz
-
-echo "Extracting jaggr..."
-tar xzf jaggr.tar.gz jaggr
-
-echo "Installing jaggr to /usr/local/bin..."
-sudo mv jaggr /usr/local/bin/
-rm jaggr.tar.gz
-
-echo "Setting executable permissions for jaggr..."
-sudo chmod +x /usr/local/bin/jaggr
-
 echo "Installation complete!"
 echo "Installed tools:"
-echo "- Vegeta: $(vegeta --version)"
-echo "- jaggr: "
-jaggr --version
+echo "- Vegeta: $(vegeta --version)"
@@ -0,0 +1,160 @@
+# Per-second bucketing of vegeta CSV records into JSON output.
+#
+# Used by both modules/vegeta/run/run.sh (live streaming during an attack) and
+# modules/vegeta/merge/merge.sh (post-hoc merge of multiple .bin files).
+#
+# Input: rows from `vegeta encode --to csv`. Columns:
+#   timestamp_ns, status_code, latency_ns, bytes_out, bytes_in, error
+#
+# Output: one JSON line per second-bucket, with shape:
+#   {"rps":N,
+#    "responded_count":R_n,"response_rate":R_r,
+#    "successful_count":M,"success_rate":S_r,
+#    "code":{"hist":{...exact codes...},"family":{"0":A,"100":B,"200":C,"300":D,"400":E,"500":F}},
+#    "latency":{"p25":x,"p50":y,"p99":z},
+#    "bytes_in":{"sum":...},"bytes_out":{"sum":...}}
+#
+# Two distinct count/rate pairs describe the request lifecycle:
+#
+#   responded_count / response_rate  — out of the rps requests sent this second,
+#     how many came back as a non-0 status (server actually responded). Latency
+#     percentiles are computed over these records — any non-zero status is a real
+#     measurement of server response time, regardless of HTTP outcome. Code-0
+#     transport failures (connection refused, EOF, timeout) are excluded because
+#     their "latency" is failure-detection time, often the full --timeout, and
+#     would skew percentiles. response_rate = responded_count / rps.
+#
+#   successful_count / success_rate  — out of the responded_count records that
+#     came back, how many were 2xx (server accepted and fulfilled the request).
+#     1xx interim, 3xx redirects, 4xx client errors, and 5xx server errors are
+#     NOT counted as successful. success_rate = successful_count / responded_count,
+#     or 0 when responded_count is 0 (no responses at all).
+#
+# code.hist preserves the exact status codes seen this second (e.g. "200":5,"503":2).
+# code.family aggregates the same counts into hundreds families (0, 100, 200, 300,
+# 400, 500) and always emits all six keys — the stable schema downstream plotters
+# (jplot etc.) need so series don't disappear when a controller's exact codes vary.
+# Both views describe the same records: sum(code.hist[*]) == sum(code.family[*]) == rps.
+#
+# response_rate and success_rate are floats in [0,1], formatted to 4 decimals.
+#
+# rps, code.hist, code.family, bytes_in.sum, bytes_out.sum count every record.
+#
+# Bucketing assumes records arrive roughly in timestamp order. The merge path
+# sorts explicitly; the live path relies on `vegeta encode` emitting records in
+# attack order (verified to be timestamp-sorted for steady-rate attacks). When
+# a late record arrives (this_second < current_second), it is absorbed into the
+# current bucket rather than triggering a new bucket flush — preventing the
+# same wall-second from appearing on two output lines under high-variance load.
+
+function floor_val(x) {
+    return int(x)
+}
+
+function flush_bucket() {
+    if (bucket_count == 0) return
+
+    if (responded_count > 0) {
+        asort(latencies, sorted_lat)
+        n = responded_count
+        p25_idx = floor_val(n * 0.25); if (p25_idx < 1) p25_idx = 1
+        p50_idx = floor_val(n * 0.50); if (p50_idx < 1) p50_idx = 1
+        p99_idx = floor_val(n * 0.99); if (p99_idx < 1) p99_idx = 1
+        p25_val = sorted_lat[p25_idx]
+        p50_val = sorted_lat[p50_idx]
+        p99_val = sorted_lat[p99_idx]
+    } else {
+        p25_val = 0; p50_val = 0; p99_val = 0
+    }
+
+    code_hist = ""
+    for (code in code_counts) {
+        if (code_hist != "") code_hist = code_hist ","
+        code_hist = code_hist "\"" code "\":" code_counts[code]
+    }
+
+    # Aggregate exact codes into hundreds families. Always emit all 6 keys
+    # (0, 100, 200, 300, 400, 500) so downstream plotters see a stable schema
+    # regardless of which exact codes appeared this second.
+    delete family_counts
+    for (code in code_counts) {
+        fam = int(code / 100) * 100
+        family_counts[fam] += code_counts[code]
+    }
+    code_family = ""
+    for (fam = 0; fam <= 500; fam += 100) {
+        if (code_family != "") code_family = code_family ","
+        cnt = (fam in family_counts) ? family_counts[fam] : 0
+        code_family = code_family "\"" fam "\":" cnt
+    }
+
+    response_rate = responded_count / bucket_count
+    if (responded_count > 0) {
+        success_rate = successful_count / responded_count
+    } else {
+        success_rate = 0
+    }
+
+    printf "{\"rps\":%d,\"responded_count\":%d,\"response_rate\":%.4f,\"successful_count\":%d,\"success_rate\":%.4f,\"code\":{\"hist\":{%s},\"family\":{%s}},\"latency\":{\"p25\":%d,\"p50\":%d,\"p99\":%d},\"bytes_in\":{\"sum\":%d},\"bytes_out\":{\"sum\":%d}}\n", \
+        bucket_count, responded_count, response_rate, successful_count, success_rate, code_hist, code_family, p25_val, p50_val, p99_val, bytes_in_sum, bytes_out_sum
+    fflush()
+}
+
+BEGIN {
+    FS = ","
+    current_second = -1
+    bucket_count = 0
+    responded_count = 0
+    successful_count = 0
+    bytes_in_sum = 0
+    bytes_out_sum = 0
+}
+
+# Per-record rule: assign the record to its second-bucket, flushing the
+# previous bucket first whenever a later second starts.
+{
+    # Skip lines that are not well-formed records (e.g. a blank line or the
+    # continuation of a multi-line quoted field): bucketing one would add a
+    # bogus code.hist key and a meaningless latency sample.
+    if ($1 !~ /^[0-9]+$/ || $2 !~ /^[0-9]+$/) next
+
+    timestamp_ns = $1
+    status_code = $2
+    latency_ns = $3
+    bytes_out = $4
+    bytes_in = $5
+
+    this_second = floor_val(timestamp_ns / 1000000000)
+
+    if (current_second == -1) {
+        current_second = this_second
+    }
+
+    if (this_second > current_second) {
+        flush_bucket()
+        current_second = this_second
+        bucket_count = 0
+        responded_count = 0
+        successful_count = 0
+        bytes_in_sum = 0
+        bytes_out_sum = 0
+        delete code_counts
+        delete latencies
+    }
+    # Late record (this_second < current_second): absorb into the current
+    # bucket rather than flushing and starting a new one. Splitting a second
+    # into two output lines is worse than slight count drift across buckets.
+
+    bucket_count++
+    # Any non-0 status means the server actually responded — its latency is a
+    # real measurement of server responsiveness regardless of HTTP outcome.
+    if (status_code != "0") {
+        responded_count++
+        latencies[responded_count] = latency_ns + 0
+    }
+    # Successful means the server accepted and fulfilled the request: 2xx only.
+    # 3xx redirects, 4xx client errors, and 5xx server errors don't count.
+    if (int(status_code / 100) == 2) {
+        successful_count++
+    }
+    code_counts[status_code] += 1
+    bytes_in_sum += bytes_in + 0
+    bytes_out_sum += bytes_out + 0
+}
+
+END {
+    flush_bucket()
+}