import argparse
import json
import numpy as np
import pandas as pd

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final


def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens metrics from JSON data.
    This is needed because the JSON data can be nested.
    For example, a JSON data entry might look like this:

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue

        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            # Fail loudly on any value type we don't know how to flatten.
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )

    return flat_metrics
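
# Illustrative sketch of the flattening (keys taken from the docstring example;
# the exact values are immaterial):
#
#     flatten_data({"cpuTime": 1.5, "gc": {"cycles": 13, "heapSize": 5404549120}})
#     # -> {"gc.cycles": 13, "gc.heapSize": 5404549120}
#     # "cpuTime" is dropped here because it reappears as "time.cpu".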


def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.
    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()

            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)

                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))

    return metrics
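
# Illustrative directory layout (system and file names are hypothetical):
#
#     stats/
#       x86_64-linux/
#         chunk-0.json
#         chunk-1.json
#       aarch64-linux/
#         chunk-0.json
#
# load_all_metrics(Path("stats")) would then produce keys such as
# "x86_64-linux/chunk-0.json", each mapped to its flattened metrics.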


def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
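
# When --explain is passed, each metric name doubles as a markdown footnote
# reference, e.g. metric_table_name("gc.cycles", explain=True) returns
# "gc.cycles[^gc.cycles]", which is resolved by METRIC_EXPLANATION_FOOTNOTE below.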


METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time charged by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
[^time.gc]: Number of seconds of CPU time spent performing GC, as reported by the Boehm garbage collector.
[^time.gcFraction]: The fraction of total CPU time spent performing GC.
[^gc.cycles]: Number of times garbage collection has been performed.
[^gc.heapSize]: Size in bytes of the garbage collector heap.
[^gc.totalBytes]: Size in bytes of all allocations made by the garbage collector.
[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` is a variable scope created during evaluation, e.g. by `let` expressions, function calls, and `with`.
[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
[^envs.number]: The number of `Env` objects ever allocated.
[^nrAvoided]: The number of thunk allocations that were avoided.
[^nrExprs]: The number of expression objects ever created.
[^nrFunctionCalls]: The number of function calls ever made.
[^nrLookups]: The number of lookups into an attrset ever made.
[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.
[^nrPrimOpCalls]: The number of calls to primops (Nix builtins) ever made.
[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
[^sets.number]: The number of attrsets ever made.
[^symbols.number]: The number of symbols ever added to the symbol table.
[^values.number]: The number of values ever made.
[^envs.elements]: The total number of values contained in all `Env` objects.
[^list.concats]: The number of list concatenation operations (`++`) performed.
[^list.elements]: The total number of values contained in all lists.
[^sets.elements]: The total number of attributes contained in all attrsets.
[^sizes.Attr]: Size in bytes of the `Attr` type.
[^sizes.Bindings]: Size in bytes of the `Bindings` type.
[^sizes.Env]: Size in bytes of the `Env` type.
[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        # Header (get column names and format them)
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Replace NaN (no t-test) and exact-zero entries with None,
                    # which tabulate renders as the missing value "-"
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row.iloc[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""

        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)

        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)

        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE

        return result
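
# Illustrative output shape (metric names and numbers below are made up):
#
#     ## Unchanged values
#
#     | metric      |   value |
#     |-------------|---------|
#     | sizes.Value | 24.0000 |
#
#     ## Updated values
#
#     | metric   |   mean_before |   mean_after |   mean_diff |   mean_pct_change |
#     |----------|---------------|--------------|-------------|-------------------|
#     | nrThunks |      100.0000 |     120.0000 |     20.0000 |           20.0000 |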


@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
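
# The key groups metrics for display: timings first, then GC stats, then byte
# counts, then object counts, then everything else. For example (derived from
# the rules above):
#
#     sorted(["nrThunks", "sizes.Value", "gc.cycles", "time.cpu"], key=metric_sort_key)
#     # -> ["time.cpu", "gc.cycles", "nrThunks", "sizes.Value"]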


def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted(
        {
            metric_key
            for file_metrics in before_metrics.values()
            for metric_key in file_metrics
        },
        key=metric_sort_key,
    )

    updated = []
    equivalent = []

    for key in all_keys:
        before_vals = []
        after_vals = []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) == 0:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr

        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            pct_change = 100 * diff / before_arr

            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )

            # If there are enough values to perform a paired t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )

            updated.append(result)

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
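
# Minimal sketch of the expected input shape (file name and values are hypothetical):
#
#     perform_pairwise_tests(
#         {"x86_64-linux/chunk-0.json": {"nrThunks": 100, "sizes.Value": 24}},
#         {"x86_64-linux/chunk-0.json": {"nrThunks": 120, "sizes.Value": 24}},
#     )
#
# puts "sizes.Value" in `equivalent` and "nrThunks" in `updated` with
# mean_diff == 20 and mean_pct_change == 20.0; with only one paired sample,
# no p_value/t_stat is computed.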


def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )

    options = parser.parse_args()

    before_stats = Path(options.before)
    after_stats = Path(options.after)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)
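
# Example invocation (the script and directory names are placeholders; the stats
# files are produced by running Nix evaluation with NIX_SHOW_STATS set):
#
#     python compare.py --explain before-stats/ after-stats/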


if __name__ == "__main__":
    main()