From f0044d1d62588bc0cfd83ca0dd6f7c6c6d83d268 Mon Sep 17 00:00:00 2001
From: abbycin <abbytsing@gmail.com>
Date: Mon, 9 Mar 2026 12:26:34 +0800
Subject: [PATCH] Clarify workloads and comparison filtering

---
 README.md                   | 16 +++++++++-------
 scripts/compare_baseline.py | 25 ++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index c3573ca..b40518c 100644
--- a/README.md
+++ b/README.md
@@ -47,14 +47,16 @@ mkdir -p "${KV_BENCH_STORAGE_ROOT}/basic_mace" "${KV_BENCH_STORAGE_ROOT}/basic_r
 - Comparison unit: rows with identical `workload_id`, `threads`, `key_size`, `value_size`, `durability_mode`, `read_path`
 - Fairness rule for read-heavy workloads: `get`, `scan`, and `W1`-`W6` run one GC/compaction pass after prefill and before warmup/measurement, so RocksDB is not compared with GC artificially disabled while reads may have to touch multiple SSTs
 - Throughput metric: workload-level `ops_per_sec` (higher is better)
-  - `W1/W2/W3/W4`: mixed read+update throughput
-  - `W5`: mixed read+update+scan throughput
-  - `W6`: scan throughput (counted by scan requests, not scanned key count)
 - Tail latency metric: workload-level `p99_us` (lower is better)
-  - This is the mixed p99 of all operations executed in that workload row, not per-op-type p99
-  - `W1/W2/W3/W4`: mixed read+update p99
-  - `W5`: mixed read+update+scan p99
-  - `W6`: scan p99
+  - This is a single p99 computed across all operations executed in that row (all op types combined), not a per-op-type p99
+
+## Workloads
+- `W1`: `95%` read + `5%` update, uniform distribution
+- `W2`: `95%` read + `5%` update, Zipf distribution
+- `W3`: `50%` read + `50%` update, uniform distribution
+- `W4`: `5%` read + `95%` update, uniform distribution
+- `W5`: `70%` read + `25%` update + `5%` scan, uniform distribution
+- `W6`: `100%` scan, uniform distribution; throughput counts scan requests, not the number of keys scanned
 
 Raw CSV path: `./scripts/benchmark_results.csv`
 
diff --git a/scripts/compare_baseline.py b/scripts/compare_baseline.py
index 69a2838..de10864 100644
--- a/scripts/compare_baseline.py
+++ b/scripts/compare_baseline.py
@@ -15,6 +15,11 @@ def main() -> int:
         default="./scripts/benchmark_results.csv",
         help="Path to benchmark CSV (default: ./scripts/benchmark_results.csv)",
     )
+    parser.add_argument(
+        "--filter-errors",
+        action="store_true",
+        help="Only compare rows with error_ops == 0 (default: include all rows)",
+    )
     args = parser.parse_args()
 
     df = pd.read_csv(args.csv_path)
@@ -44,20 +49,28 @@ def main() -> int:
         "read_path",
     ]
 
-    ok = df[df["error_ops"] == 0].copy()
-    if ok.empty:
-        print("No rows with error_ops == 0, cannot compare.")
+    if args.filter_errors:
+        base = df[df["error_ops"] == 0].copy()
+    else:
+        base = df.copy()
+
+    if base.empty:
+        if args.filter_errors:
+            print("No rows with error_ops == 0, cannot compare.")
+        else:
+            print("No rows found in csv, cannot compare.")
         return 0
 
-    agg = ok.groupby(keys + ["engine"], as_index=False).agg(
+    agg = base.groupby(keys + ["engine"], as_index=False).agg(
         ops_per_sec=("ops_per_sec", "median"),
         p99_us=("p99_us", "median"),
+        error_ops=("error_ops", "median"),
     )
 
     piv = agg.pivot_table(
         index=keys,
         columns="engine",
-        values=["ops_per_sec", "p99_us"],
+        values=["ops_per_sec", "p99_us", "error_ops"],
         aggfunc="first",
     )
     piv.columns = [f"{metric}_{engine}" for metric, engine in piv.columns]
@@ -68,6 +81,8 @@ def main() -> int:
         "ops_per_sec_rocksdb",
         "p99_us_mace",
         "p99_us_rocksdb",
+        "error_ops_mace",
+        "error_ops_rocksdb",
     ]:
         if col not in out.columns:
             out[col] = pd.NA