at master 11 kB view raw
1import argparse 2import json 3import numpy as np 4import os 5import pandas as pd 6 7from dataclasses import asdict, dataclass 8from pathlib import Path 9from scipy.stats import ttest_rel 10from tabulate import tabulate 11from typing import Final 12 13 14def flatten_data(json_data: dict) -> dict: 15 """ 16 Extracts and flattens metrics from JSON data. 17 This is needed because the JSON data can be nested. 18 For example, the JSON data entry might look like this: 19 20 "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464} 21 22 Flattened: 23 24 "gc.cycles": 13 25 "gc.heapSize": 5404549120 26 ... 27 28 See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846 29 for the ultimate source of this data. 30 31 Args: 32 json_data (dict): JSON data containing metrics. 33 Returns: 34 dict: Flattened metrics with keys as metric names. 35 """ 36 flat_metrics = {} 37 for key, value in json_data.items(): 38 # This key is duplicated as `time.cpu`; we keep that copy. 39 if key == "cpuTime": 40 continue 41 42 if isinstance(value, (int, float)): 43 flat_metrics[key] = value 44 elif isinstance(value, dict): 45 for subkey, subvalue in value.items(): 46 assert isinstance(subvalue, (int, float)), subvalue 47 flat_metrics[f"{key}.{subkey}"] = subvalue 48 else: 49 assert isinstance(value, (float, int, dict)), ( 50 f"Value `{value}` has unexpected type" 51 ) 52 53 return flat_metrics 54 55 56def load_all_metrics(path: Path) -> dict: 57 """ 58 Loads all stats JSON files in the specified file or directory and extracts metrics. 59 These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set. 60 61 If the provided path is a directory, it must have the structure $path/$system/$stats, 62 where $path is the provided path, $system is some system from `lib.systems.doubles.*`, 63 and $stats is a stats JSON file. 64 65 If the provided path is a file, it is a stats JSON file. 66 67 Args: 68 path (Path): Directory containing JSON files or a stats JSON file. 69 70 Returns: 71 dict: Dictionary with filenames as keys and extracted metrics as values. 72 """ 73 metrics = {} 74 if path.is_dir(): 75 for system_dir in path.iterdir(): 76 assert system_dir.is_dir() 77 78 for chunk_output in system_dir.iterdir(): 79 with chunk_output.open() as f: 80 data = json.load(f) 81 82 metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data) 83 else: 84 with path.open() as f: 85 metrics[path.name] = flatten_data(json.load(f)) 86 87 return metrics 88 89 90def metric_table_name(name: str, explain: bool) -> str: 91 """ 92 Returns the name of the metric, plus a footnote to explain it if needed. 93 """ 94 return f"{name}[^{name}]" if explain else name 95 96 97METRIC_EXPLANATION_FOOTNOTE: Final[str] = """ 98 99[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html). 100[^time.gc]: Number of seconds of CPU time accounted by the Boehm garbage collector to performing GC. 101[^time.gcFraction]: What fraction of the total CPU time is accounted towards performing GC. 102[^gc.cycles]: Number of times garbage collection has been performed. 103[^gc.heapSize]: Size in bytes of the garbage collector heap. 104[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector. 105[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. These are almost exclusively created by [`nix-env`](https://nix.dev/manual/nix/stable/command-ref/nix-env.html). 106[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator. 107[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator. 108[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table. 109[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator. 110[^envs.number]: The count of all `Env` objects allocated. 111[^nrAvoided]: The number of thunks avoided being created. 112[^nrExprs]: The number of expression objects ever created. 113[^nrFunctionCalls]: The number of function calls ever made. 114[^nrLookups]: The number of lookups into an attrset ever made. 115[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets. 116[^nrOpUpdates]: The number of attrsets merge operations (`//`) performed. 117[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made. 118[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure. 119[^sets.number]: The number of attrsets ever made. 120[^symbols.number]: The number of symbols ever added to the symbol table. 121[^values.number]: The number of values ever made. 122[^envs.elements]: The number of values contained within an `Env` object. 123[^list.concats]: The number of list concatenation operations (`++`) performed. 124[^list.elements]: The number of values contained within a list. 125[^sets.elements]: The number of values contained within an attrset. 126[^sizes.Attr]: Size in bytes of the `Attr` type. 127[^sizes.Bindings]: Size in bytes of the `Bindings` type. 128[^sizes.Env]: Size in bytes of the `Env` type. 129[^sizes.Value]: Size in bytes of the `Value` type. 130""" 131 132 133@dataclass(frozen=True) 134class PairwiseTestResults: 135 updated: pd.DataFrame 136 equivalent: pd.DataFrame 137 138 @staticmethod 139 def tabulate(table, headers) -> str: 140 return tabulate( 141 table, headers, tablefmt="github", floatfmt=".4f", missingval="-" 142 ) 143 144 def updated_to_markdown(self, explain: bool) -> str: 145 assert not self.updated.empty 146 # Header (get column names and format them) 147 return self.tabulate( 148 headers=[str(column) for column in self.updated.columns], 149 table=[ 150 [ 151 # The metric acts as its own footnote name 152 metric_table_name(row["metric"], explain), 153 # Check for no change and NaN in p_value/t_stat 154 *[ 155 None if np.isnan(val) or np.allclose(val, 0) else val 156 for val in row[1:] 157 ], 158 ] 159 for _, row in self.updated.iterrows() 160 ], 161 ) 162 163 def equivalent_to_markdown(self, explain: bool) -> str: 164 assert not self.equivalent.empty 165 return self.tabulate( 166 headers=[str(column) for column in self.equivalent.columns], 167 table=[ 168 [ 169 # The metric acts as its own footnote name 170 metric_table_name(row["metric"], explain), 171 row["value"], 172 ] 173 for _, row in self.equivalent.iterrows() 174 ], 175 ) 176 177 def to_markdown(self, explain: bool) -> str: 178 result = "" 179 180 if not self.equivalent.empty: 181 result += "## Unchanged values\n\n" 182 result += self.equivalent_to_markdown(explain) 183 184 if not self.updated.empty: 185 result += ("\n\n" if result else "") + "## Updated values\n\n" 186 result += self.updated_to_markdown(explain) 187 188 if explain: 189 result += METRIC_EXPLANATION_FOOTNOTE 190 191 return result 192 193 194@dataclass(frozen=True) 195class Equivalent: 196 metric: str 197 value: float 198 199 200@dataclass(frozen=True) 201class Comparison: 202 metric: str 203 mean_before: float 204 mean_after: float 205 mean_diff: float 206 mean_pct_change: float 207 208 209@dataclass(frozen=True) 210class ComparisonWithPValue(Comparison): 211 p_value: float 212 t_stat: float 213 214 215def metric_sort_key(name: str) -> str: 216 if name in ("time.cpu", "time.gc", "time.gcFraction"): 217 return (1, name) 218 elif name.startswith("gc"): 219 return (2, name) 220 elif name.endswith(("bytes", "Bytes")): 221 return (3, name) 222 elif name.startswith("nr") or name.endswith("number"): 223 return (4, name) 224 else: 225 return (5, name) 226 227 228def perform_pairwise_tests( 229 before_metrics: dict, after_metrics: dict 230) -> PairwiseTestResults: 231 common_files = sorted(set(before_metrics) & set(after_metrics)) 232 all_keys = sorted( 233 { 234 metric_keys 235 for file_metrics in before_metrics.values() 236 for metric_keys in file_metrics.keys() 237 }, 238 key=metric_sort_key, 239 ) 240 241 updated = [] 242 equivalent = [] 243 244 for key in all_keys: 245 before_vals = [] 246 after_vals = [] 247 248 for fname in common_files: 249 if key in before_metrics[fname] and key in after_metrics[fname]: 250 before_vals.append(before_metrics[fname][key]) 251 after_vals.append(after_metrics[fname][key]) 252 253 if len(before_vals) == 0: 254 continue 255 256 before_arr = np.array(before_vals) 257 after_arr = np.array(after_vals) 258 259 diff = after_arr - before_arr 260 261 # If there's no difference, add it all to the equivalent output. 262 if np.allclose(diff, 0): 263 equivalent.append(Equivalent(metric=key, value=before_vals[0])) 264 else: 265 pct_change = 100 * diff / before_arr 266 267 result = Comparison( 268 metric=key, 269 mean_before=np.mean(before_arr), 270 mean_after=np.mean(after_arr), 271 mean_diff=np.mean(diff), 272 mean_pct_change=np.mean(pct_change), 273 ) 274 275 # If there are enough values to perform a t-test, do so. 276 if len(before_vals) > 1: 277 t_stat, p_val = ttest_rel(after_arr, before_arr) 278 result = ComparisonWithPValue( 279 **asdict(result), p_value=p_val, t_stat=t_stat 280 ) 281 282 updated.append(result) 283 284 return PairwiseTestResults( 285 updated=pd.DataFrame(map(asdict, updated)), 286 equivalent=pd.DataFrame(map(asdict, equivalent)), 287 ) 288 289 290def main(): 291 parser = argparse.ArgumentParser( 292 description="Performance comparison of Nix evaluation statistics" 293 ) 294 parser.add_argument( 295 "--explain", action="store_true", help="Explain the evaluation statistics" 296 ) 297 parser.add_argument( 298 "before", help="File or directory containing baseline (data before)" 299 ) 300 parser.add_argument( 301 "after", help="File or directory containing comparison (data after)" 302 ) 303 304 options = parser.parse_args() 305 306 before_stats = Path(options.before) 307 after_stats = Path(options.after) 308 309 before_metrics = load_all_metrics(before_stats) 310 after_metrics = load_all_metrics(after_stats) 311 pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics) 312 markdown_table = pairwise_test_results.to_markdown(explain=options.explain) 313 print(markdown_table) 314 315 316if __name__ == "__main__": 317 main()