ci.eval.compare: extend the performance comparison script (#443620)

Changed files
+286 -91
+245 -78
ci/eval/compare/cmp-stats.py
···
import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
"""
···
"gc.heapSize": 5404549120
...
Args:
json_data (dict): JSON data containing metrics.
Returns:
dict: Flattened metrics with keys as metric names.
"""
flat_metrics = {}
    for k, v in json_data.items():
        if isinstance(v, (int, float)):
            flat_metrics[k] = v
        elif isinstance(v, dict):
            for sub_k, sub_v in v.items():
                flat_metrics[f"{k}.{sub_k}"] = sub_v
    return flat_metrics

def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    Args:
        directory (Path): Directory containing JSON files.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        assert system_dir.is_dir()
        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    return metrics


def dataframe_to_markdown(df: pd.DataFrame) -> str:
    df = df.sort_values(by=df.columns[0], ascending=True)
    markdown_lines = []

    # Header (get column names and format them)
    header = '\n| ' + ' | '.join(df.columns) + ' |'
    markdown_lines.append(header)
    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line

    # Iterate over rows to build Markdown rows
    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False
        fmt = lambda x: f"**{x}**" if highlight else f"{x}"

        # Check for no change and NaN in p_value/t_stat
        row_values = []
        for val in row:
            if isinstance(val, float) and np.isnan(val):  # For NaN values in p-value or t-stat
                row_values.append("-")  # Custom symbol for NaN
            elif isinstance(val, float) and val == 0:  # For no change (mean_diff == 0)
                row_values.append("-")  # Custom symbol for no change
            else:
                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))

        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')

    return '\n'.join(markdown_lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })

    results = []
    for key in all_keys:
        before_vals, after_vals = [], []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) >= 2:
            before_arr = np.array(before_vals)
            after_arr = np.array(after_vals)
            diff = after_arr - before_arr
            pct_change = 100 * diff / before_arr
            t_stat, p_val = ttest_rel(after_arr, before_arr)
            results.append({
                "metric": key,
                "mean_before": np.mean(before_arr),
                "mean_after": np.mean(after_arr),
                "mean_diff": np.mean(diff),
                "mean_%_change": np.mean(pct_change),
                "p_value": p_val,
                "t_stat": t_stat
            })

    df = pd.DataFrame(results).sort_values("p_value")
    return df


if __name__ == "__main__":
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")
    if not before_dir or not after_dir:
        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
        exit(1)

    before_stats = Path(before_dir) / "stats"
    after_stats = Path(after_dir) / "stats"

    # This may happen if the pull request target does not include PR#399720 yet.
    if not before_stats.exists():
        print("⚠️ Skipping comparison: stats directory is missing in the target commit.")
        exit(0)

    # This should never happen, but we're exiting gracefully anyway.
    if not after_stats.exists():
        print("⚠️ Skipping comparison: stats directory missing in current PR evaluation.")
        exit(0)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    df1 = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = dataframe_to_markdown(df1)
    print(markdown_table)
···
import argparse
import json
import numpy as np
import os
import pandas as pd
import warnings

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final


def flatten_data(json_data: dict) -> dict:
"""
···
"gc.heapSize": 5404549120
...
    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.

    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue

        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )
    return flat_metrics
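
As a quick illustration (the numbers below are made up, not taken from a real evaluation run), `flatten_data` drops the duplicated `cpuTime` key and flattens nested sections into dotted names:

sample = {
    "cpuTime": 1.25,  # dropped: duplicated as time.cpu
    "time": {"cpu": 1.25, "gc": 0.31},
    "gc": {"heapSize": 5404549120},
    "nrThunks": 12345,
}
assert flatten_data(sample) == {
    "time.cpu": 1.25,
    "time.gc": 0.31,
    "gc.heapSize": 5404549120,
    "nrThunks": 12345,
}
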

def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.

    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()
            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)
                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))
    return metrics
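
For example, assuming a hypothetical layout such as `before/stats/x86_64-linux/chunk-0.json` (the chunk file names here are illustrative), the loader can be pointed at either the directory or a single stats file:

before = load_all_metrics(Path("before/stats"))
# -> keys like "x86_64-linux/chunk-0.json", one entry per system/chunk file

single = load_all_metrics(Path("before/stats/x86_64-linux/chunk-0.json"))
# -> {"chunk-0.json": {...flattened metrics...}}
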

def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
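
For instance:

assert metric_table_name("nrThunks", explain=True) == "nrThunks[^nrThunks]"
assert metric_table_name("nrThunks", explain=False) == "nrThunks"
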

METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).

[^time.gc]: Number of seconds of CPU time that the Boehm garbage collector accounts to performing GC.

[^time.gcFraction]: The fraction of total CPU time accounted towards performing GC.

[^gc.cycles]: Number of times garbage collection has been performed.

[^gc.heapSize]: Size in bytes of the garbage collector heap.

[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.

[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. These are almost exclusively created by [`nix-env`](https://nix.dev/manual/nix/stable/command-ref/nix-env.html).

[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.

[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.

[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.

[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.

[^envs.number]: The count of all `Env` objects allocated.

[^nrAvoided]: The number of thunks whose creation was avoided.

[^nrExprs]: The number of expression objects ever created.

[^nrFunctionCalls]: The number of function calls ever made.

[^nrLookups]: The number of lookups into an attrset ever made.

[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.

[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.

[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.

[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.

[^sets.number]: The number of attrsets ever made.

[^symbols.number]: The number of symbols ever added to the symbol table.

[^values.number]: The number of values ever made.

[^envs.elements]: The number of values contained within an `Env` object.

[^list.concats]: The number of list concatenation operations (`++`) performed.

[^list.elements]: The number of values contained within a list.

[^sets.elements]: The number of values contained within an attrset.

[^sizes.Attr]: Size in bytes of the `Attr` type.

[^sizes.Bindings]: Size in bytes of the `Bindings` type.

[^sizes.Env]: Size in bytes of the `Env` type.

[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        # Header (get column names and format them)
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Check for no change and NaN in p_value/t_stat
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""

        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)

        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)

        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE

        return result


@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
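
The effect is that timing metrics sort first, then GC metrics, byte sizes, counts, and everything else, e.g.:

metrics = ["sizes.Attr", "nrThunks", "envs.bytes", "gc.heapSize", "time.cpu"]
assert sorted(metrics, key=metric_sort_key) == [
    "time.cpu", "gc.heapSize", "envs.bytes", "nrThunks", "sizes.Attr"
]
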

def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted(
        {
            metric_keys
            for file_metrics in before_metrics.values()
            for metric_keys in file_metrics.keys()
        },
        key=metric_sort_key,
    )

    updated = []
    equivalent = []
    for key in all_keys:
        before_vals = []
        after_vals = []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) == 0:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr
        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            pct_change = 100 * diff / before_arr
            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )
            # If there are enough values to perform a t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )
            updated.append(result)

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
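
A minimal sketch of the behavior, using made-up numbers for a single chunk file: the unchanged metric lands in `equivalent`, the changed one in `updated`, and with only one before/after pair no p-value or t-statistic is computed.

before = {"x86_64-linux/chunk-0.json": {"gc.heapSize": 5404549120, "nrThunks": 1000}}
after = {"x86_64-linux/chunk-0.json": {"gc.heapSize": 5404549120, "nrThunks": 1200}}

results = perform_pairwise_tests(before, after)
assert list(results.equivalent["metric"]) == ["gc.heapSize"]
assert list(results.updated["metric"]) == ["nrThunks"]
print(results.to_markdown(explain=False))
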

def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )
    options = parser.parse_args()

    # Turn warnings into errors
    warnings.simplefilter("error")

    before_stats = Path(options.before)
    after_stats = Path(options.after)
    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)


if __name__ == "__main__":
    main()
+41 -13
ci/eval/compare/default.nix
···
runCommand,
writeText,
python3,
}:
{
combinedDir,
touchedFilesJson,
···
# Don't depend on -dev outputs to reduce closure size for CI.
nativeBuildInputs = map lib.getBin [
jq
    (python3.withPackages (
      ps: with ps; [
        numpy
        pandas
        scipy
      ]
    ))
  ];
  maintainers = builtins.toJSON maintainers;
  passAsFile = [ "maintainers" ];
  env = {
    BEFORE_DIR = "${combined}/before";
    AFTER_DIR = "${combined}/after";
  };
}
''
mkdir $out
···
echo
} >> $out/step-summary.md
python3 ${./cmp-stats.py} >> $out/step-summary.md
else
# Package chunks are the same in both revisions
···
runCommand,
writeText,
python3,
  stdenvNoCC,
  makeWrapper,
}:

let
  python = python3.withPackages (ps: [
    ps.numpy
    ps.pandas
    ps.scipy
    ps.tabulate
  ]);

  cmp-stats = stdenvNoCC.mkDerivation {
    pname = "cmp-stats";
    version = lib.trivial.release;

    dontUnpack = true;

    nativeBuildInputs = [ makeWrapper ];

    installPhase = ''
      runHook preInstall

      mkdir -p $out/share/cmp-stats

      cp ${./cmp-stats.py} "$out/share/cmp-stats/cmp-stats.py"

      makeWrapper ${python.interpreter} "$out/bin/cmp-stats" \
        --add-flags "$out/share/cmp-stats/cmp-stats.py"

      runHook postInstall
    '';

    meta = {
      description = "Performance comparison of Nix evaluation statistics";
      license = lib.licenses.mit;
      mainProgram = "cmp-stats";
      maintainers = with lib.maintainers; [ philiptaron ];
    };
  };
in
{
combinedDir,
touchedFilesJson,
···
# Don't depend on -dev outputs to reduce closure size for CI.
nativeBuildInputs = map lib.getBin [
jq
cmp-stats
];
maintainers = builtins.toJSON maintainers;
passAsFile = [ "maintainers" ];
}
''
mkdir $out
···
echo
} >> $out/step-summary.md
cmp-stats --explain ${combined}/before/stats ${combined}/after/stats >> $out/step-summary.md
else
# Package chunks are the same in both revisions