phase0: align benchmark v2 workload protocol

This commit is contained in:
abbycin 2026-03-03 22:12:17 +08:00
parent abf82f735c
commit 0649db54e7
7 changed files with 2026 additions and 492 deletions

20
plan_exec.md Normal file
View File

@ -0,0 +1,20 @@
# kv_bench 执行记录(benchmark_refactor)
## Phase 0(已完成)
- 日期:2026-03-03
- 范围:
- 重构 `src/main.rs` 与 `rocksdb/main.cpp`,完成 v2 方法学最小清单:
- workload preset:`W1..W6`
- mixed/read/scan 的 prefill + shared keyspace
- 时长模式:`--warmup-secs` / `--measure-secs`
- 显式 read path parity:`--read-path snapshot|rw_txn`
- 统一 schema 结果落盘(CSV),并自动附带机器/环境元数据
- 更新脚本:`scripts/mace.sh`、`scripts/rocksdb.sh`、`scripts/plot.py`、`scripts/init.sh`
- 默认数据目录切换为 `/nvme` 体系(脚本强制 db_root 在 `/nvme` 下)
- 编译验证:
- `cargo check -q` 通过
- `cargo build --release -q` 通过
- `cmake --build --preset release -j` 通过
- 运行烟测:
- `mace` 与 `rocksdb` 均可按新参数运行并写入统一 schema 结果文件
- 提交:待本阶段 commit

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
#!/usr/bin/env bash #!/usr/bin/env bash
python3 -m venv . python3 -m venv .
./bin/pip3 install pandas matplotlib adjustText ./bin/pip3 install pandas matplotlib
rm -f .gitignore rm -f .gitignore

View File

@ -2,55 +2,61 @@
set -euo pipefail set -euo pipefail
if [ "$#" -ne 1 ] if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
then printf "Usage: %s <db_root_under_/nvme> [result_csv]\n" "$0"
printf "\033[m$0 path\033[0m\n"
exit 1 exit 1
fi fi
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
root_dir="$(cd -- "${script_dir}/.." && pwd)" root_dir="$(cd -- "${script_dir}/.." && pwd)"
cargo build --release --manifest-path "${root_dir}/Cargo.toml" 1>/dev/null 2>/dev/null # The runner creates per-case unique paths under this root; each path must not exist.
db_root="$1"
result_file="${2:-${script_dir}/benchmark_results.csv}"
function samples() { warmup_secs="${WARMUP_SECS:-10}"
export RUST_BACKTRACE=full measure_secs="${MEASURE_SECS:-20}"
kv_sz=(16 16 100 1024 1024 1024 16 10240) prefill_keys="${PREFILL_KEYS:-200000}"
mode=(insert get mixed scan) read_path="${READ_PATH:-snapshot}"
# set -x
db_root="$1"
cnt=100000 if [[ "${db_root}" != /nvme* ]]; then
for ((i = 1; i <= $(nproc); i *= 2)) printf "db_root must be under /nvme, got: %s\n" "${db_root}" >&2
do exit 1
for ((j = 0; j < ${#kv_sz[@]}; j += 2))
do
for ((k = 0; k < ${#mode[@]}; k += 1))
do
if [ "${mode[k]}" == "insert" ]
then
"${root_dir}/target/release/kv_bench" --path "${db_root}" --threads "${i}" --iterations "${cnt}" --mode "${mode[k]}" --key-size "${kv_sz[j]}" --value-size "${kv_sz[j+1]}" --random
if test $? -ne 0
then
echo "${mode[k]} threads $i ksz ${kv_sz[j]} vsz ${kv_sz[j+1]} random fail"
exit 1
fi
fi
"${root_dir}/target/release/kv_bench" --path "${db_root}" --threads "${i}" --iterations "${cnt}" --mode "${mode[k]}" --key-size "${kv_sz[j]}" --value-size "${kv_sz[j+1]}"
if test $? -ne 0
then
echo "${mode[k]} threads $i ksz ${kv_sz[j]} vsz ${kv_sz[j+1]} fail"
exit 1
fi
done
done
done
}
echo mode,threads,key_size,value_size,insert_ratio,ops,elapsed_us > "${script_dir}/mace.csv"
samples "$1" 1>> "${script_dir}/mace.csv"
if [ -x "${script_dir}/bin/python" ]; then
(cd "${script_dir}" && "${script_dir}/bin/python" plot.py mace.csv)
else
(cd "${script_dir}" && python3 plot.py mace.csv)
fi fi
mkdir -p "${db_root}"
mkdir -p "$(dirname -- "${result_file}")"
cargo build --release --manifest-path "${root_dir}/Cargo.toml"
workloads=(W1 W2 W3 W4 W5 W6)
threads=(1 6 12)
profiles=(
"32 1024"
"32 16384"
)
for workload in "${workloads[@]}"; do
for t in "${threads[@]}"; do
for kv in "${profiles[@]}"; do
read -r key_size value_size <<< "${kv}"
run_path="$(mktemp -u -p "${db_root}" "mace_${workload}_${t}_${key_size}_${value_size}_XXXXXX")"
printf "[mace] workload=%s threads=%s key=%s value=%s path=%s\n" \
"${workload}" "${t}" "${key_size}" "${value_size}" "${run_path}"
"${root_dir}/target/release/kv_bench" \
--path "${run_path}" \
--workload "${workload}" \
--threads "${t}" \
--key-size "${key_size}" \
--value-size "${value_size}" \
--prefill-keys "${prefill_keys}" \
--warmup-secs "${warmup_secs}" \
--measure-secs "${measure_secs}" \
--shared-keyspace true \
--read-path "${read_path}" \
--result-file "${result_file}"
done
done
done
printf "Mace runs finished. Results appended to: %s\n" "${result_file}"

View File

@ -1,65 +1,81 @@
import pandas as pd #!/usr/bin/env python3
import matplotlib.pyplot as plt
from adjustText import adjust_text
import sys import sys
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
def real_mode(m): def main() -> int:
if m == "mixed": if len(sys.argv) not in (2, 3):
return "Mixed (70% Get, 30% Insert)" print(f"Usage: {sys.argv[0]} <result_csv> [output_dir]")
elif m == "get": return 1
return "Random Get"
elif m == "scan": result_csv = Path(sys.argv[1])
return "Sequential Scan" output_dir = Path(sys.argv[2]) if len(sys.argv) == 3 else result_csv.parent
return m.capitalize() output_dir.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(result_csv)
required = {
"engine",
"workload_id",
"threads",
"key_size",
"value_size",
"ops_per_sec",
"p99_us",
}
missing = required - set(df.columns)
if missing:
raise ValueError(f"Missing required columns: {sorted(missing)}")
for engine in sorted(df["engine"].unique()):
engine_df = df[df["engine"] == engine]
profiles = (
engine_df[["key_size", "value_size"]]
.drop_duplicates()
.sort_values(["key_size", "value_size"])
.itertuples(index=False)
)
for key_size, value_size in profiles:
sub = engine_df[
(engine_df["key_size"] == key_size)
& (engine_df["value_size"] == value_size)
]
if sub.empty:
continue
for metric, ylabel in (("ops_per_sec", "OPS/s"), ("p99_us", "P99 Latency (us)")):
plt.figure(figsize=(12, 7))
for workload in sorted(sub["workload_id"].unique()):
wdf = sub[sub["workload_id"] == workload].sort_values("threads")
plt.plot(
wdf["threads"],
wdf[metric],
marker="o",
linewidth=2,
label=workload,
)
plt.title(
f"{engine.upper()} {metric} (key={key_size}, value={value_size})",
fontsize=14,
)
plt.xlabel("Threads")
plt.ylabel(ylabel)
plt.grid(True, linestyle="--", alpha=0.5)
plt.legend()
plt.tight_layout()
out = output_dir / f"{engine}_{metric}_k{key_size}_v{value_size}.png"
plt.savefig(out)
plt.close()
print(f"Charts written to: {output_dir}")
return 0
name = sys.argv[1] if __name__ == "__main__":
prefix = name.split(".")[0] raise SystemExit(main())
# read benchmark data
# keep compatibility with older csv files that used elapsed/elasped
# and normalize to elapsed_us
df = pd.read_csv(f"./{name}")
if "elapsed_us" not in df.columns:
if "elapsed" in df.columns:
df = df.rename(columns={"elapsed": "elapsed_us"})
elif "elasped" in df.columns:
df = df.rename(columns={"elasped": "elapsed_us"})
# group by mode
modes = df["mode"].unique()
for mode in modes:
plt.figure(figsize=(16, 9))
subset = df[df["mode"] == mode]
# group by key/value size
key_value_combinations = subset.groupby(["key_size", "value_size"])
texts = []
for (key_size, value_size), group in key_value_combinations:
label = f"key={key_size}B, val={value_size}B"
x = group["threads"]
y = group["ops"]
# draw line
line, = plt.plot(x, y, marker="o", label=label)
# add labels
for xi, yi, ops in zip(x, y, group["ops"]):
texts.append(
plt.text(xi, yi, f"{int(ops)}", color=line.get_color(), fontsize=12)
)
adjust_text(texts, arrowprops=dict(arrowstyle="->", color="gray"))
plt.title(f"{prefix.upper()}: {real_mode(mode)}", fontsize=16)
plt.xlabel("Threads", fontsize=14)
plt.ylabel("OPS", fontsize=14)
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.tight_layout()
plt.savefig(f"{prefix}_{mode}.png")
plt.close()

View File

@ -2,55 +2,61 @@
set -euo pipefail set -euo pipefail
if [ "$#" -ne 1 ] if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
then printf "Usage: %s <db_root_under_/nvme> [result_csv]\n" "$0"
printf "\033[m$0 path\033[0m\n" exit 1
exit 1
fi fi
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
root_dir="$(cd -- "${script_dir}/.." && pwd)" root_dir="$(cd -- "${script_dir}/.." && pwd)"
rocksdb_dir="${root_dir}/rocksdb" rocksdb_dir="${root_dir}/rocksdb"
(cd "${rocksdb_dir}" && cmake --preset release 1>/dev/null 2>/dev/null) db_root="$1"
(cd "${rocksdb_dir}" && cmake --build --preset release 1>/dev/null 2>/dev/null) result_file="${2:-${script_dir}/benchmark_results.csv}"
function samples() { warmup_secs="${WARMUP_SECS:-10}"
kv_sz=(16 16 100 1024 1024 1024 16 10240) measure_secs="${MEASURE_SECS:-20}"
mode=(insert get mixed scan) prefill_keys="${PREFILL_KEYS:-200000}"
# set -x read_path="${READ_PATH:-snapshot}"
db_root="$1"
cnt=100000
for ((i = 1; i <= $(nproc); i *= 2))
do
for ((j = 0; j < ${#kv_sz[@]}; j += 2))
do
for ((k = 0; k < ${#mode[@]}; k += 1))
do
if [ "${mode[k]}" == "insert" ]
then
"${rocksdb_dir}/build/release/rocksdb_bench" --path "${db_root}" --threads "${i}" --iterations "${cnt}" --mode "${mode[k]}" --key-size "${kv_sz[j]}" --value-size "${kv_sz[j+1]}" --random
if test $? -ne 0
then
echo "${mode[k]} threads $i ksz ${kv_sz[j]} vsz ${kv_sz[j+1]} random fail"
exit 1
fi
fi
"${rocksdb_dir}/build/release/rocksdb_bench" --path "${db_root}" --threads "${i}" --iterations "${cnt}" --mode "${mode[k]}" --key-size "${kv_sz[j]}" --value-size "${kv_sz[j+1]}"
if test $? -ne 0
then
echo "${mode[k]} threads $i ksz ${kv_sz[j]} vsz ${kv_sz[j+1]} fail"
exit 1
fi
done
done
done
}
echo mode,threads,key_size,value_size,insert_ratio,ops,elapsed_us > "${script_dir}/rocksdb.csv" if [[ "${db_root}" != /nvme* ]]; then
samples "$1" 1>> "${script_dir}/rocksdb.csv" printf "db_root must be under /nvme, got: %s\n" "${db_root}" >&2
if [ -x "${script_dir}/bin/python" ]; then exit 1
(cd "${script_dir}" && "${script_dir}/bin/python" plot.py rocksdb.csv)
else
(cd "${script_dir}" && python3 plot.py rocksdb.csv)
fi fi
mkdir -p "${db_root}"
mkdir -p "$(dirname -- "${result_file}")"
(cd "${rocksdb_dir}" && cmake --preset release)
(cd "${rocksdb_dir}" && cmake --build --preset release)
workloads=(W1 W2 W3 W4 W5 W6)
threads=(1 6 12)
profiles=(
"32 1024"
"32 16384"
)
for workload in "${workloads[@]}"; do
for t in "${threads[@]}"; do
for kv in "${profiles[@]}"; do
read -r key_size value_size <<< "${kv}"
run_path="$(mktemp -u -p "${db_root}" "rocksdb_${workload}_${t}_${key_size}_${value_size}_XXXXXX")"
printf "[rocksdb] workload=%s threads=%s key=%s value=%s path=%s\n" \
"${workload}" "${t}" "${key_size}" "${value_size}" "${run_path}"
"${rocksdb_dir}/build/release/rocksdb_bench" \
--path "${run_path}" \
--workload "${workload}" \
--threads "${t}" \
--key-size "${key_size}" \
--value-size "${value_size}" \
--prefill-keys "${prefill_keys}" \
--warmup-secs "${warmup_secs}" \
--measure-secs "${measure_secs}" \
--read-path "${read_path}" \
--result-file "${result_file}"
done
done
done
printf "RocksDB runs finished. Results appended to: %s\n" "${result_file}"

File diff suppressed because it is too large Load Diff