import argparse
import json
import numpy as np
import pandas as pd

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final


def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens metrics from JSON data.
    This is needed because the JSON data can be nested.
    For example, a JSON data entry might look like this:

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue

        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            # Fail loudly on any value type we don't know how to flatten.
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )

    return flat_metrics
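
# Illustrative sketch of the flattening (keys taken from the docstring example;
# the exact values are immaterial):
#
#     flatten_data({"cpuTime": 1.5, "gc": {"cycles": 13, "heapSize": 5404549120}})
#     # -> {"gc.cycles": 13, "gc.heapSize": 5404549120}
#     # "cpuTime" is dropped here because it reappears as "time.cpu".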


def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.
    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()

            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)

                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))

    return metrics
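
# Illustrative directory layout (system and file names are hypothetical):
#
#     stats/
#       x86_64-linux/
#         chunk-0.json
#         chunk-1.json
#       aarch64-linux/
#         chunk-0.json
#
# load_all_metrics(Path("stats")) would then produce keys such as
# "x86_64-linux/chunk-0.json", each mapped to its flattened metrics.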


def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
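
# When --explain is passed, each metric name doubles as a markdown footnote
# reference, e.g. metric_table_name("gc.cycles", explain=True) returns
# "gc.cycles[^gc.cycles]", which is resolved by METRIC_EXPLANATION_FOOTNOTE below.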


METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time charged by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
[^time.gc]: Number of seconds of CPU time spent performing GC, as reported by the Boehm garbage collector.
[^time.gcFraction]: The fraction of total CPU time spent performing GC.
[^gc.cycles]: Number of times garbage collection has been performed.
[^gc.heapSize]: Size in bytes of the garbage collector heap.
[^gc.totalBytes]: Size in bytes of all allocations made by the garbage collector.
[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` is a variable scope created during evaluation, e.g. by `let` expressions, function calls, and `with`.
[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
[^envs.number]: The number of `Env` objects ever allocated.
[^nrAvoided]: The number of thunk allocations that were avoided.
[^nrExprs]: The number of expression objects ever created.
[^nrFunctionCalls]: The number of function calls ever made.
[^nrLookups]: The number of lookups into an attrset ever made.
[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.
[^nrPrimOpCalls]: The number of calls to primops (Nix builtins) ever made.
[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
[^sets.number]: The number of attrsets ever made.
[^symbols.number]: The number of symbols ever added to the symbol table.
[^values.number]: The number of values ever made.
[^envs.elements]: The total number of values contained in all `Env` objects.
[^list.concats]: The number of list concatenation operations (`++`) performed.
[^list.elements]: The total number of values contained in all lists.
[^sets.elements]: The total number of attributes contained in all attrsets.
[^sizes.Attr]: Size in bytes of the `Attr` type.
[^sizes.Bindings]: Size in bytes of the `Bindings` type.
[^sizes.Env]: Size in bytes of the `Env` type.
[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        # Header (get column names and format them)
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Replace NaN (no t-test) and exact-zero entries with None,
                    # which tabulate renders as the missing value "-"
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row.iloc[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""

        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)

        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)

        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE

        return result
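
# Illustrative output shape (metric names and numbers below are made up):
#
#     ## Unchanged values
#
#     | metric      |   value |
#     |-------------|---------|
#     | sizes.Value | 24.0000 |
#
#     ## Updated values
#
#     | metric   |   mean_before |   mean_after |   mean_diff |   mean_pct_change |
#     |----------|---------------|--------------|-------------|-------------------|
#     | nrThunks |      100.0000 |     120.0000 |     20.0000 |           20.0000 |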


@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
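
# The key groups metrics for display: timings first, then GC stats, then byte
# counts, then object counts, then everything else. For example (derived from
# the rules above):
#
#     sorted(["nrThunks", "sizes.Value", "gc.cycles", "time.cpu"], key=metric_sort_key)
#     # -> ["time.cpu", "gc.cycles", "nrThunks", "sizes.Value"]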


def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted(
        {
            metric_key
            for file_metrics in before_metrics.values()
            for metric_key in file_metrics
        },
        key=metric_sort_key,
    )

    updated = []
    equivalent = []

    for key in all_keys:
        before_vals = []
        after_vals = []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) == 0:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr

        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            pct_change = 100 * diff / before_arr

            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )

            # If there are enough values to perform a paired t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )

            updated.append(result)

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
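
# Minimal sketch of the expected input shape (file name and values are hypothetical):
#
#     perform_pairwise_tests(
#         {"x86_64-linux/chunk-0.json": {"nrThunks": 100, "sizes.Value": 24}},
#         {"x86_64-linux/chunk-0.json": {"nrThunks": 120, "sizes.Value": 24}},
#     )
#
# puts "sizes.Value" in `equivalent` and "nrThunks" in `updated` with
# mean_diff == 20 and mean_pct_change == 20.0; with only one paired sample,
# no p_value/t_stat is computed.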


def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )

    options = parser.parse_args()

    before_stats = Path(options.before)
    after_stats = Path(options.after)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)
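
# Example invocation (the script and directory names are placeholders; the stats
# files are produced by running Nix evaluation with NIX_SHOW_STATS set):
#
#     python compare.py --explain before-stats/ after-stats/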


if __name__ == "__main__":
    main()