···
  3 - from scipy.stats import ttest_rel
  8 + from dataclasses import asdict, dataclass
 10 + from scipy.stats import ttest_rel
 11 + from tabulate import tabulate
 12 + from typing import Final
  8 - # Define metrics of interest (can be expanded as needed)
  9 - METRIC_PREFIXES = ("nr", "gc")
def flatten_data(json_data: dict) -> dict:
···
"gc.heapSize": 5404549120
 29 + See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
 30 + for the ultimate source of this data.
json_data (dict): JSON data containing metrics.
dict: Flattened metrics with keys as metric names.
 31 - for k, v in json_data.items():
 32 - if isinstance(v, (int, float)):
 34 - elif isinstance(v, dict):
 35 - for sub_k, sub_v in v.items():
 36 - flat_metrics[f"{k}.{sub_k}"] = sub_v
 38 + for key, value in json_data.items():
 39 + # This key is duplicated as `time.cpu`; we keep that copy.
 40 + if key == "cpuTime":
 43 + if isinstance(value, (int, float)):
 44 + flat_metrics[key] = value
 45 + elif isinstance(value, dict):
 46 + for subkey, subvalue in value.items():
 47 + assert isinstance(subvalue, (int, float)), subvalue
 48 + flat_metrics[f"{key}.{subkey}"] = subvalue
 50 + assert isinstance(value, (float, int, dict)), (
 51 + f"Value `{value}` has unexpected type"
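For reference, a minimal sketch of the flattening this hunk implements (the sample values are illustrative, following the `NIX_SHOW_STATS` excerpt in the docstring above):

```python
# Illustrative input/output for flatten_data; values are made up.
sample = {
    "cpuTime": 1.5,                  # skipped: duplicated as "time.cpu"
    "nrThunks": 12345,               # numeric top-level keys are kept as-is
    "gc": {"heapSize": 5404549120},  # nested dicts become dotted keys
}
# flatten_data(sample) would yield:
# {"nrThunks": 12345, "gc.heapSize": 5404549120}
```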
 57 + def load_all_metrics(path: Path) -> dict:
 59 + Loads all stats JSON files in the specified file or directory and extracts metrics.
 60 + These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.
 62 + If the provided path is a directory, it must have the structure $path/$system/$stats,
 63 + where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
 64 + and $stats is a stats JSON file.
 42 - def load_all_metrics(directory: Path) -> dict:
 44 - Loads all stats JSON files in the specified directory and extracts metrics.
 66 + If the provided path is a file, it is a stats JSON file.
 47 - directory (Path): Directory containing JSON files.
 69 + path (Path): Directory containing JSON files or a stats JSON file.
dict: Dictionary with filenames as keys and extracted metrics as values.
 52 - for system_dir in directory.iterdir():
 53 - assert system_dir.is_dir()
 76 + for system_dir in path.iterdir():
 77 + assert system_dir.is_dir()
 55 - for chunk_output in system_dir.iterdir():
 79 + for chunk_output in system_dir.iterdir():
with chunk_output.open() as f:
metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)
 85 + with path.open() as f:
 86 + metrics[path.name] = flatten_data(json.load(f))
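As an illustration of the layout the new docstring describes (all directory and file names below are hypothetical):

```python
from pathlib import Path

# Hypothetical on-disk layout accepted by load_all_metrics:
#
#   stats/
#     x86_64-linux/
#       chunk-0.json    # NIX_SHOW_STATS output for one evaluation chunk
#       chunk-1.json
#     aarch64-darwin/
#       chunk-0.json
#
# Either form is accepted:
# metrics = load_all_metrics(Path("stats"))                            # directory
# metrics = load_all_metrics(Path("stats/x86_64-linux/chunk-0.json"))  # single stats file
```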
 62 - def dataframe_to_markdown(df: pd.DataFrame) -> str:
 63 - df = df.sort_values(by=df.columns[0], ascending=True)
 91 + def metric_table_name(name: str, explain: bool) -> str:
 93 + Returns the name of the metric, plus a footnote to explain it if needed.
 95 + return f"{name}[^{name}]" if explain else name
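In other words, with `--explain` each metric name doubles as a GitHub footnote reference: `metric_table_name("nrThunks", explain=True)` returns `nrThunks[^nrThunks]`, which GitHub renders as a link to the matching `[^nrThunks]:` definition in `METRIC_EXPLANATION_FOOTNOTE` below.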
 98 + METRIC_EXPLANATION_FOOTNOTE: Final[str] = """
100 + [^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
101 + [^time.gc]: Number of seconds of CPU time accounted by the Boehm garbage collector to performing GC.
102 + [^time.gcFraction]: What fraction of the total CPU time is accounted towards performing GC.
103 + [^gc.cycles]: Number of times garbage collection has been performed.
104 + [^gc.heapSize]: Size in bytes of the garbage collector heap.
105 + [^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.
106 + [^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` is a variable scope created during evaluation, e.g. by `let`, `with`, and function calls.
107 + [^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
108 + [^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
109 + [^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
110 + [^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
111 + [^envs.number]: The count of all `Env` objects allocated.
112 + [^nrAvoided]: The number of thunks whose creation was avoided.
113 + [^nrExprs]: The number of expression objects ever created.
114 + [^nrFunctionCalls]: The number of function calls ever made.
115 + [^nrLookups]: The number of lookups into an attrset ever made.
116 + [^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
117 + [^nrOpUpdates]: The number of attrset merge operations (`//`) performed.
118 + [^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.
119 + [^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
120 + [^sets.number]: The number of attrsets ever made.
121 + [^symbols.number]: The number of symbols ever added to the symbol table.
122 + [^values.number]: The number of values ever made.
123 + [^envs.elements]: The number of values contained within an `Env` object.
124 + [^list.concats]: The number of list concatenation operations (`++`) performed.
125 + [^list.elements]: The number of values contained within a list.
126 + [^sets.elements]: The number of values contained within an attrset.
127 + [^sizes.Attr]: Size in bytes of the `Attr` type.
128 + [^sizes.Bindings]: Size in bytes of the `Bindings` type.
129 + [^sizes.Env]: Size in bytes of the `Env` type.
130 + [^sizes.Value]: Size in bytes of the `Value` type.
134 + @dataclass(frozen=True)
135 + class PairwiseTestResults:
136 + updated: pd.DataFrame
137 + equivalent: pd.DataFrame
140 + def tabulate(table, headers) -> str:
142 + table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
145 + def updated_to_markdown(self, explain: bool) -> str:
146 + assert not self.updated.empty
147 + # Header (get column names and format them)
148 + return self.tabulate(
149 + headers=[str(column) for column in self.updated.columns],
152 + # The metric acts as its own footnote name
153 + metric_table_name(row["metric"], explain),
154 + # Check for no change and NaN in p_value/t_stat
156 + None if np.isnan(val) or np.allclose(val, 0) else val
160 + for _, row in self.updated.iterrows()
164 + def equivalent_to_markdown(self, explain: bool) -> str:
165 + assert not self.equivalent.empty
166 + return self.tabulate(
167 + headers=[str(column) for column in self.equivalent.columns],
170 + # The metric acts as its own footnote name
171 + metric_table_name(row["metric"], explain),
174 + for _, row in self.equivalent.iterrows()
178 + def to_markdown(self, explain: bool) -> str:
181 + if not self.equivalent.empty:
182 + result += "## Unchanged values\n\n"
183 + result += self.equivalent_to_markdown(explain)
185 + if not self.updated.empty:
186 + result += ("\n\n" if result else "") + "## Updated values\n\n"
187 + result += self.updated_to_markdown(explain)
190 + result += METRIC_EXPLANATION_FOOTNOTE
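For context, `tabulate(..., tablefmt="github")` emits the pipe-table syntax GitHub comments render natively; a minimal sketch with made-up numbers (the exact column set comes from the `Comparison` dataclasses below):

```python
from tabulate import tabulate

# Made-up rows, mirroring the headers the DataFrame columns would provide.
rows = [["nrThunks", 100.0, 98.0, -2.0, -2.0]]
headers = ["metric", "mean_before", "mean_after", "mean_diff", "mean_pct_change"]

# Prints a GitHub-flavoured Markdown table; None values would render as "-".
print(tabulate(rows, headers, tablefmt="github", floatfmt=".4f", missingval="-"))
```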
195 + @dataclass(frozen=True)
 66 - # Header (get column names and format them)
 67 - header = '\n| ' + ' | '.join(df.columns) + ' |'
 68 - markdown_lines.append(header)
 69 - markdown_lines.append("| - " * (len(df.columns)) + "|") # Separator line
 71 - # Iterate over rows to build Markdown rows
 72 - for _, row in df.iterrows():
 73 - # TODO: define threshold for highlighting
201 + @dataclass(frozen=True)
207 + mean_pct_change: float
 76 - fmt = lambda x: f"**{x}**" if highlight else f"{x}"
 78 - # Check for no change and NaN in p_value/t_stat
 81 - if isinstance(val, float) and np.isnan(val): # For NaN values in p-value or t-stat
 82 - row_values.append("-") # Custom symbol for NaN
 83 - elif isinstance(val, float) and val == 0: # For no change (mean_diff == 0)
 84 - row_values.append("-") # Custom symbol for no change
 86 - row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))
210 + @dataclass(frozen=True)
211 + class ComparisonWithPValue(Comparison):
 88 - markdown_lines.append('| ' + ' | '.join(row_values) + ' |')
 90 - return '\n'.join(markdown_lines)
216 + def metric_sort_key(name: str) -> str:
217 + if name in ("time.cpu", "time.gc", "time.gcFraction"):
219 + elif name.startswith("gc"):
221 + elif name.endswith(("bytes", "Bytes")):
223 + elif name.startswith("nr") or name.endswith("number"):
 93 - def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
229 + def perform_pairwise_tests(
230 + before_metrics: dict, after_metrics: dict
231 + ) -> PairwiseTestResults:
common_files = sorted(set(before_metrics) & set(after_metrics))
 95 - all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })
236 + for file_metrics in before_metrics.values()
237 + for metric_keys in file_metrics.keys()
239 + key=metric_sort_key,
100 - before_vals, after_vals = [], []
for fname in common_files:
if key in before_metrics[fname] and key in after_metrics[fname]:
before_vals.append(before_metrics[fname][key])
after_vals.append(after_metrics[fname][key])
107 - if len(before_vals) >= 2:
108 - before_arr = np.array(before_vals)
109 - after_arr = np.array(after_vals)
254 + if len(before_vals) == 0:
257 + before_arr = np.array(before_vals)
258 + after_arr = np.array(after_vals)
260 + diff = after_arr - before_arr
111 - diff = after_arr - before_arr
262 + # If there's no difference, add it all to the equivalent output.
263 + if np.allclose(diff, 0):
264 + equivalent.append(Equivalent(metric=key, value=before_vals[0]))
pct_change = 100 * diff / before_arr
113 - t_stat, p_val = ttest_rel(after_arr, before_arr)
117 - "mean_before": np.mean(before_arr),
118 - "mean_after": np.mean(after_arr),
119 - "mean_diff": np.mean(diff),
120 - "mean_%_change": np.mean(pct_change),
268 + result = Comparison(
270 + mean_before=np.mean(before_arr),
271 + mean_after=np.mean(after_arr),
272 + mean_diff=np.mean(diff),
273 + mean_pct_change=np.mean(pct_change),
125 - df = pd.DataFrame(results).sort_values("p_value")
276 + # If there are enough values to perform a t-test, do so.
277 + if len(before_vals) > 1:
278 + t_stat, p_val = ttest_rel(after_arr, before_arr)
279 + result = ComparisonWithPValue(
280 + **asdict(result), p_value=p_val, t_stat=t_stat
283 + updated.append(result)
129 - if __name__ == "__main__":
130 - before_dir = os.environ.get("BEFORE_DIR")
131 - after_dir = os.environ.get("AFTER_DIR")
285 + return PairwiseTestResults(
286 + updated=pd.DataFrame(map(asdict, updated)),
287 + equivalent=pd.DataFrame(map(asdict, equivalent)),
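The statistical core is SciPy's paired t-test; a minimal, self-contained sketch of what it computes on matched before/after samples (numbers are made up):

```python
import numpy as np
from scipy.stats import ttest_rel

# One measurement per evaluation chunk, paired before/after.
before = np.array([10.1, 10.3, 9.9, 10.0])
after = np.array([9.7, 9.9, 9.6, 9.8])

# ttest_rel tests whether the mean of the paired differences (after - before)
# is zero; a small p-value suggests a systematic change rather than noise.
t_stat, p_val = ttest_rel(after, before)
print(f"t={t_stat:.3f}, p={p_val:.4f}")
```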
133 - if not before_dir or not after_dir:
134 - print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
137 - before_stats = Path(before_dir) / "stats"
138 - after_stats = Path(after_dir) / "stats"
292 + parser = argparse.ArgumentParser(
293 + description="Performance comparison of Nix evaluation statistics"
295 + parser.add_argument(
296 + "--explain", action="store_true", help="Explain the evaluation statistics"
298 + parser.add_argument(
299 + "before", help="File or directory containing baseline (data before)"
301 + parser.add_argument(
302 + "after", help="File or directory containing comparison (data after)"
140 - # This may happen if the pull request target does not include PR#399720 yet.
141 - if not before_stats.exists():
142 - print("⚠️ Skipping comparison: stats directory is missing in the target commit.")
305 + options = parser.parse_args()
145 - # This should never happen, but we're exiting gracefully anyways
146 - if not after_stats.exists():
147 - print("⚠️ Skipping comparison: stats directory missing in current PR evaluation.")
307 + # Turn warnings into errors
308 + warnings.simplefilter("error")
310 + before_stats = Path(options.before)
311 + after_stats = Path(options.after)
before_metrics = load_all_metrics(before_stats)
after_metrics = load_all_metrics(after_stats)
152 - df1 = perform_pairwise_tests(before_metrics, after_metrics)
153 - markdown_table = dataframe_to_markdown(df1)
315 + pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
316 + markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
320 + if __name__ == "__main__":
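With this change the script takes positional paths instead of the `BEFORE_DIR`/`AFTER_DIR` environment variables, e.g. (script and directory names here are illustrative) `./compare.py --explain before-stats/ after-stats/`, where each argument is either a single `NIX_SHOW_STATS` JSON file or a `$system/$stats` directory tree as described in `load_all_metrics`.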