From 6b8fcffd00ad220439f892d72af8bfc325d54950 Mon Sep 17 00:00:00 2001 From: abbycin Date: Wed, 4 Mar 2026 23:47:40 +0800 Subject: [PATCH] docs: clarify comparison semantics and add baseline compare script --- README.md | 38 ++++++++++------ docs/repro.md | 8 ++-- scripts/compare_baseline.py | 90 +++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 17 deletions(-) create mode 100644 scripts/compare_baseline.py diff --git a/README.md b/README.md index 96bbb86..16266a2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # kv_bench (Mace vs RocksDB) -Quick start for reproducible comparison. Full guide: [docs/repro.md](./docs/repro.md). +Quick start for reproducible Mace vs RocksDB comparison. Full guide: [docs/repro.md](./docs/repro.md). ## 5-Minute Quickstart 1. Set your storage root (any mount path, not hardcoded to `/nvme`): @@ -13,13 +13,13 @@ mkdir -p "${KV_BENCH_STORAGE_ROOT}" 2. Initialize Python env once: ```bash -cd /home/abby/kv_bench/scripts +cd "$HOME/kv_bench/scripts" ./init.sh source ./bin/activate -cd /home/abby/kv_bench +cd "$HOME/kv_bench" ``` -3. Run baseline comparison (both engines write to the same CSV): +3. Run baseline comparison (both engines append to the same CSV): ```bash rm -rf "${KV_BENCH_STORAGE_ROOT}/basic_mace" "${KV_BENCH_STORAGE_ROOT}/basic_rocks" @@ -29,20 +29,32 @@ mkdir -p "${KV_BENCH_STORAGE_ROOT}/basic_mace" "${KV_BENCH_STORAGE_ROOT}/basic_r ./scripts/rocksdb.sh "${KV_BENCH_STORAGE_ROOT}/basic_rocks" ./scripts/benchmark_results.csv ``` -4. View and plot results: +4. Plot results: ```bash ./scripts/bin/python ./scripts/plot.py ./scripts/benchmark_results.csv ./scripts ``` -## Fast Result Reading -- Raw input CSV: `./scripts/benchmark_results.csv` -- Key columns: - - `engine` (`mace` / `rocksdb`) - - `workload_id` (`W1..W6`) - - `ops_per_sec` (higher is better) - - `p99_us` (lower is better) - - `error_ops` (must be 0 before drawing conclusions) +5. 
Print a direct comparison table from the CSV: + +```bash +./scripts/bin/python ./scripts/compare_baseline.py ./scripts/benchmark_results.csv +``` + +## What Is Compared +- Comparison unit: rows with identical `workload_id`, `threads`, `key_size`, `value_size`, `durability_mode`, `read_path` +- Throughput metric: workload-level `ops_per_sec` (higher is better) + - `W1/W2/W3/W4`: mixed read+update throughput + - `W5`: mixed read+update+scan throughput + - `W6`: scan throughput (counted by scan requests, not scanned key count) +- Tail latency metric: workload-level `p99_us` (lower is better) + - This is the mixed p99 of all operations executed in that workload row, not per-op-type p99 + - `W1/W2/W3/W4`: mixed read+update p99 + - `W5`: mixed read+update+scan p99 + - `W6`: scan p99 +- Reliability gate: if `error_ops > 0`, debug that case before drawing conclusions + +Raw CSV path: `./scripts/benchmark_results.csv` ## Phase Reports - Phase 1 (stability CV): diff --git a/docs/repro.md b/docs/repro.md index c9c4cc6..ca596d6 100644 --- a/docs/repro.md +++ b/docs/repro.md @@ -22,8 +22,8 @@ For the local profile, assume approximately: Before running, set paths and verify filesystem type/capacity: ```bash -export KV_BENCH_STORAGE_ROOT=/home/abby/kv_bench/target/repro_storage -export KV_BENCH_RESULT_ROOT=/home/abby/kv_bench/target/repro_results +export KV_BENCH_STORAGE_ROOT="$HOME/kv_bench/target/repro_storage" +export KV_BENCH_RESULT_ROOT="$HOME/kv_bench/target/repro_results" mkdir -p "${KV_BENCH_STORAGE_ROOT}" "${KV_BENCH_RESULT_ROOT}" df -hT "${KV_BENCH_STORAGE_ROOT}" "${KV_BENCH_RESULT_ROOT}" @@ -36,10 +36,10 @@ Requirements: ## 3. Initialization ```bash -cd /home/abby/kv_bench/scripts +cd "$HOME/kv_bench/scripts" ./init.sh source ./bin/activate -cd /home/abby/kv_bench +cd "$HOME/kv_bench" ``` ## 4. 
#!/usr/bin/env python3
"""Print a mace-vs-rocksdb comparison table from benchmark_results.csv.

Rows are grouped by the workload configuration key (workload_id, threads,
key_size, value_size, durability_mode, read_path); repeated runs of the
same configuration are reduced with the median, the two engines are
pivoted side by side, and throughput / p99 ratios are derived.
"""

import argparse
import sys

import pandas as pd

# Columns that identify one comparable benchmark configuration.
_GROUP_KEYS = [
    "workload_id",
    "threads",
    "key_size",
    "value_size",
    "durability_mode",
    "read_path",
]

# Every column the CSV must provide before a comparison makes sense.
_REQUIRED_COLUMNS = frozenset(
    _GROUP_KEYS + ["engine", "ops_per_sec", "p99_us", "error_ops"]
)


def main() -> int:
    """Load the CSV, build the side-by-side table, and print it.

    Returns:
        Process exit code: 0 on success, 1 when the CSV cannot be read
        or contains no error-free rows to compare.

    Raises:
        ValueError: if the CSV is readable but lacks required columns
            (a malformed-input bug worth a loud failure, not a message).
    """
    parser = argparse.ArgumentParser(
        description="Compare mace vs rocksdb from benchmark_results.csv"
    )
    parser.add_argument(
        "csv_path",
        nargs="?",
        default="./scripts/benchmark_results.csv",
        help="Path to benchmark CSV (default: ./scripts/benchmark_results.csv)",
    )
    args = parser.parse_args()

    # A missing or empty CSV is an expected operator error for a CLI
    # tool: report it clearly and exit nonzero instead of dumping a
    # traceback.
    try:
        df = pd.read_csv(args.csv_path)
    except FileNotFoundError:
        print(f"CSV not found: {args.csv_path}", file=sys.stderr)
        return 1
    except pd.errors.EmptyDataError:
        print(f"CSV is empty: {args.csv_path}", file=sys.stderr)
        return 1

    missing = _REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns in csv: {sorted(missing)}")

    # Reliability gate: rows that recorded errors must not feed the
    # comparison at all.
    ok = df[df["error_ops"] == 0].copy()
    if ok.empty:
        # Nonzero exit so calling scripts do not mistake "nothing to
        # compare" for a successful comparison.
        print("No rows with error_ops == 0, cannot compare.")
        return 1

    # Median over repeated runs of the same (configuration, engine) pair.
    agg = ok.groupby(_GROUP_KEYS + ["engine"], as_index=False).agg(
        ops_per_sec=("ops_per_sec", "median"),
        p99_us=("p99_us", "median"),
    )

    # Pivot the engines side by side; "first" is exact because the
    # groupby above already reduced each pair to a single row.
    piv = agg.pivot_table(
        index=_GROUP_KEYS,
        columns="engine",
        values=["ops_per_sec", "p99_us"],
        aggfunc="first",
    )
    piv.columns = [f"{metric}_{engine}" for metric, engine in piv.columns]
    out = piv.reset_index()

    # Keep the ratio columns numeric even when one engine is absent
    # from the CSV: a float NaN column divides cleanly, while an
    # object-dtype pd.NA fill can break arithmetic on older pandas.
    for col in (
        "ops_per_sec_mace",
        "ops_per_sec_rocksdb",
        "p99_us_mace",
        "p99_us_rocksdb",
    ):
        if col not in out.columns:
            out[col] = float("nan")

    out["qps_ratio_mace_over_rocksdb"] = (
        out["ops_per_sec_mace"] / out["ops_per_sec_rocksdb"]
    )
    out["p99_ratio_mace_over_rocksdb"] = out["p99_us_mace"] / out["p99_us_rocksdb"]
    out = out.sort_values(_GROUP_KEYS)

    print(out.to_string(index=False))
    print("\nInterpretation:")
    print("- qps_ratio_mace_over_rocksdb > 1: mace has higher throughput")
    print("- p99_ratio_mace_over_rocksdb < 1: mace has lower p99 latency")

    return 0


if __name__ == "__main__":
    sys.exit(main())