ci.eval.compare: extend the performance comparison script (#443620)

Changed files
+286 -91
+245 -78
ci/eval/compare/cmp-stats.py
···
import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
"""
···
"gc.heapSize": 5404549120
...
Args:
json_data (dict): JSON data containing metrics.
Returns:
dict: Flattened metrics with keys as metric names.
"""
flat_metrics = {}
    for k, v in json_data.items():
        if isinstance(v, (int, float)):
            flat_metrics[k] = v
        elif isinstance(v, dict):
            for sub_k, sub_v in v.items():
                flat_metrics[f"{k}.{sub_k}"] = sub_v
    return flat_metrics

def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    Args:
        directory (Path): Directory containing JSON files.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        assert system_dir.is_dir()
        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    return metrics


def dataframe_to_markdown(df: pd.DataFrame) -> str:
    df = df.sort_values(by=df.columns[0], ascending=True)
    markdown_lines = []

    # Header (get column names and format them)
    header = '\n| ' + ' | '.join(df.columns) + ' |'
    markdown_lines.append(header)
    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line

    # Iterate over rows to build Markdown rows
    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False
        fmt = lambda x: f"**{x}**" if highlight else f"{x}"

        # Check for no change and NaN in p_value/t_stat
        row_values = []
        for val in row:
            if isinstance(val, float) and np.isnan(val):  # For NaN values in p-value or t-stat
                row_values.append("-")  # Custom symbol for NaN
            elif isinstance(val, float) and val == 0:  # For no change (mean_diff == 0)
                row_values.append("-")  # Custom symbol for no change
            else:
                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))

        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')

    return '\n'.join(markdown_lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })

    results = []
    for key in all_keys:
        before_vals, after_vals = [], []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) >= 2:
            before_arr = np.array(before_vals)
            after_arr = np.array(after_vals)
            diff = after_arr - before_arr
            pct_change = 100 * diff / before_arr
            t_stat, p_val = ttest_rel(after_arr, before_arr)
            results.append({
                "metric": key,
                "mean_before": np.mean(before_arr),
                "mean_after": np.mean(after_arr),
                "mean_diff": np.mean(diff),
                "mean_%_change": np.mean(pct_change),
                "p_value": p_val,
                "t_stat": t_stat
            })

    df = pd.DataFrame(results).sort_values("p_value")
    return df


if __name__ == "__main__":
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")
    if not before_dir or not after_dir:
        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
        exit(1)

    before_stats = Path(before_dir) / "stats"
    after_stats = Path(after_dir) / "stats"

    # This may happen if the pull request target does not include PR#399720 yet.
    if not before_stats.exists():
        print("⚠️ Skipping comparison: stats directory is missing in the target commit.")
        exit(0)

    # This should never happen, but we're exiting gracefully anyway.
    if not after_stats.exists():
        print("⚠️ Skipping comparison: stats directory missing in current PR evaluation.")
        exit(0)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    df1 = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = dataframe_to_markdown(df1)
    print(markdown_table)
···
import argparse
import json
import numpy as np
import os
import pandas as pd
import warnings

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final


def flatten_data(json_data: dict) -> dict:
"""
···
"gc.heapSize": 5404549120
...
    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.

    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue

        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )
    return flat_metrics
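
As a quick illustration (the numbers below are made up, not taken from a real evaluation run), `flatten_data` drops the duplicated `cpuTime` key and flattens nested sections into dotted names:

sample = {
    "cpuTime": 1.25,  # dropped: duplicated as time.cpu
    "time": {"cpu": 1.25, "gc": 0.31},
    "gc": {"heapSize": 5404549120},
    "nrThunks": 12345,
}
assert flatten_data(sample) == {
    "time.cpu": 1.25,
    "time.gc": 0.31,
    "gc.heapSize": 5404549120,
    "nrThunks": 12345,
}
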

def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.

    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()
            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)
                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))
    return metrics
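
For example, assuming a hypothetical layout such as `before/stats/x86_64-linux/chunk-0.json` (the chunk file names here are illustrative), the loader can be pointed at either the directory or a single stats file:

before = load_all_metrics(Path("before/stats"))
# -> keys like "x86_64-linux/chunk-0.json", one entry per system/chunk file

single = load_all_metrics(Path("before/stats/x86_64-linux/chunk-0.json"))
# -> {"chunk-0.json": {...flattened metrics...}}
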

def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
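
For instance:

assert metric_table_name("nrThunks", explain=True) == "nrThunks[^nrThunks]"
assert metric_table_name("nrThunks", explain=False) == "nrThunks"
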

METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).

[^time.gc]: Number of seconds of CPU time that the Boehm garbage collector accounts to performing GC.

[^time.gcFraction]: The fraction of total CPU time accounted towards performing GC.

[^gc.cycles]: Number of times garbage collection has been performed.

[^gc.heapSize]: Size in bytes of the garbage collector heap.

[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.

[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. These are almost exclusively created by [`nix-env`](https://nix.dev/manual/nix/stable/command-ref/nix-env.html).

[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.

[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.

[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.

[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.

[^envs.number]: The count of all `Env` objects allocated.

[^nrAvoided]: The number of thunks whose creation was avoided.

[^nrExprs]: The number of expression objects ever created.

[^nrFunctionCalls]: The number of function calls ever made.

[^nrLookups]: The number of lookups into an attrset ever made.

[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.

[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.

[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.

[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.

[^sets.number]: The number of attrsets ever made.

[^symbols.number]: The number of symbols ever added to the symbol table.

[^values.number]: The number of values ever made.

[^envs.elements]: The number of values contained within an `Env` object.

[^list.concats]: The number of list concatenation operations (`++`) performed.

[^list.elements]: The number of values contained within a list.

[^sets.elements]: The number of values contained within an attrset.

[^sizes.Attr]: Size in bytes of the `Attr` type.

[^sizes.Bindings]: Size in bytes of the `Bindings` type.

[^sizes.Env]: Size in bytes of the `Env` type.

[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        # Header (get column names and format them)
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Check for no change and NaN in p_value/t_stat
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""

        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)

        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)

        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE

        return result


@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
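
The effect is that timing metrics sort first, then GC metrics, byte sizes, counts, and everything else, e.g.:

metrics = ["sizes.Attr", "nrThunks", "envs.bytes", "gc.heapSize", "time.cpu"]
assert sorted(metrics, key=metric_sort_key) == [
    "time.cpu", "gc.heapSize", "envs.bytes", "nrThunks", "sizes.Attr"
]
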

def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted(
        {
            metric_keys
            for file_metrics in before_metrics.values()
            for metric_keys in file_metrics.keys()
        },
        key=metric_sort_key,
    )

    updated = []
    equivalent = []
    for key in all_keys:
        before_vals = []
        after_vals = []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) == 0:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr
        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            pct_change = 100 * diff / before_arr
            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )
            # If there are enough values to perform a t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )
            updated.append(result)

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
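
A minimal sketch of the behavior, using made-up numbers for a single chunk file: the unchanged metric lands in `equivalent`, the changed one in `updated`, and with only one before/after pair no p-value or t-statistic is computed.

before = {"x86_64-linux/chunk-0.json": {"gc.heapSize": 5404549120, "nrThunks": 1000}}
after = {"x86_64-linux/chunk-0.json": {"gc.heapSize": 5404549120, "nrThunks": 1200}}

results = perform_pairwise_tests(before, after)
assert list(results.equivalent["metric"]) == ["gc.heapSize"]
assert list(results.updated["metric"]) == ["nrThunks"]
print(results.to_markdown(explain=False))
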

def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )
    options = parser.parse_args()

    # Turn warnings into errors
    warnings.simplefilter("error")

    before_stats = Path(options.before)
    after_stats = Path(options.after)
    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)


if __name__ == "__main__":
    main()
+41 -13
ci/eval/compare/default.nix
···
runCommand,
writeText,
python3,
}:
{
combinedDir,
touchedFilesJson,
···
# Don't depend on -dev outputs to reduce closure size for CI.
nativeBuildInputs = map lib.getBin [
jq
    (python3.withPackages (
      ps: with ps; [
        numpy
        pandas
        scipy
      ]
    ))
  ];
  maintainers = builtins.toJSON maintainers;
  passAsFile = [ "maintainers" ];
  env = {
    BEFORE_DIR = "${combined}/before";
    AFTER_DIR = "${combined}/after";
  };
}
''
mkdir $out
···
echo
} >> $out/step-summary.md
python3 ${./cmp-stats.py} >> $out/step-summary.md
else
# Package chunks are the same in both revisions
···
runCommand,
writeText,
python3,
  stdenvNoCC,
  makeWrapper,
}:

let
  python = python3.withPackages (ps: [
    ps.numpy
    ps.pandas
    ps.scipy
    ps.tabulate
  ]);

  cmp-stats = stdenvNoCC.mkDerivation {
    pname = "cmp-stats";
    version = lib.trivial.release;

    dontUnpack = true;

    nativeBuildInputs = [ makeWrapper ];

    installPhase = ''
      runHook preInstall

      mkdir -p $out/share/cmp-stats

      cp ${./cmp-stats.py} "$out/share/cmp-stats/cmp-stats.py"

      makeWrapper ${python.interpreter} "$out/bin/cmp-stats" \
        --add-flags "$out/share/cmp-stats/cmp-stats.py"

      runHook postInstall
    '';

    meta = {
      description = "Performance comparison of Nix evaluation statistics";
      license = lib.licenses.mit;
      mainProgram = "cmp-stats";
      maintainers = with lib.maintainers; [ philiptaron ];
    };
  };
in
{
combinedDir,
touchedFilesJson,
···
# Don't depend on -dev outputs to reduce closure size for CI.
nativeBuildInputs = map lib.getBin [
jq
cmp-stats
];
maintainers = builtins.toJSON maintainers;
passAsFile = [ "maintainers" ];
}
''
mkdir $out
···
echo
} >> $out/step-summary.md
cmp-stats --explain ${combined}/before/stats ${combined}/after/stats >> $out/step-summary.md
else
# Package chunks are the same in both revisions