1import json
2import os
3from scipy.stats import ttest_rel
4import pandas as pd
5import numpy as np
6from pathlib import Path
7
8# Define metrics of interest (can be expanded as needed)
9METRIC_PREFIXES = ("nr", "gc")
10
def flatten_data(json_data: dict, _prefix: str = "") -> dict:
    """
    Extracts and flattens metrics from JSON data.
    This is needed because the JSON data can be nested.
    For example, the JSON data entry might look like this:

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    Args:
        json_data (dict): JSON data containing metrics.
        _prefix (str): Internal recursion accumulator holding the dotted
            path of the enclosing keys; callers should not pass this.
    Returns:
        dict: Flattened metrics with keys as dotted metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        name = f"{_prefix}{key}"
        if isinstance(value, dict):
            # Recurse so arbitrarily deep nesting is flattened; previously
            # only one level was handled and deeper dicts leaked through
            # as raw dict values.
            flat_metrics.update(flatten_data(value, _prefix=f"{name}."))
        elif _prefix or isinstance(value, (int, float)):
            # Top level keeps only numeric scalars; nested levels keep any
            # non-dict leaf (matches the original single-level behavior).
            flat_metrics[name] = value
    return flat_metrics
38
39
40
41
def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    The directory is expected to contain one sub-directory per benchmarked
    system, each holding per-chunk JSON stats files.

    Args:
        directory (Path): Directory containing per-system sub-directories.
    Returns:
        dict: Dictionary keyed by "<system>/<chunk-file>" with the flattened
            metrics of that chunk as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        # Skip stray files instead of asserting: asserts are stripped under
        # `python -O`, and a lone README shouldn't abort the comparison.
        if not system_dir.is_dir():
            continue

        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            # Fix: the key previously contained a stray "$"
            # ("sysA/$chunk.json") from a shell/JS-template habit.
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics
61
def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """
    Render *df* as a GitHub-flavored Markdown table string.

    Rows are sorted ascending by the first column. Float cells are printed
    with four decimals; NaN floats (e.g. an undefined p-value) and exact-zero
    floats (no change) are printed as "-".

    Args:
        df (pd.DataFrame): Table to render.
    Returns:
        str: Markdown table, starting with a leading newline.
    """
    ordered = df.sort_values(by=df.columns[0], ascending=True)

    def render_cell(val) -> str:
        # NaN (undefined stat) and zero floats (no change) get a dash.
        if isinstance(val, float) and (np.isnan(val) or val == 0):
            return "-"
        # TODO: define threshold for highlighting (bold) significant cells.
        return f"{val:.4f}" if isinstance(val, float) else str(val)

    lines = ['\n| ' + ' | '.join(ordered.columns) + ' |']
    lines.append("| - " * len(ordered.columns) + "|")  # Separator line
    for _, row in ordered.iterrows():
        lines.append('| ' + ' | '.join(render_cell(v) for v in row) + ' |')

    return '\n'.join(lines)
91
92
def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    """
    Runs a paired t-test per metric across files present in both runs.

    Args:
        before_metrics (dict): {filename: {metric: value}} for the baseline run.
        after_metrics (dict): {filename: {metric: value}} for the candidate run.
    Returns:
        pd.DataFrame: One row per metric (with at least two paired samples),
            sorted by p_value ascending. Empty (but with the expected
            columns) when no metric has enough pairs.
    """
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({metric for file_metrics in before_metrics.values() for metric in file_metrics})

    results = []

    for key in all_keys:
        before_vals, after_vals = [], []

        # Only pair up files that report this metric on both sides.
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        # ttest_rel needs at least two pairs to be meaningful.
        if len(before_vals) >= 2:
            before_arr = np.array(before_vals)
            after_arr = np.array(after_vals)

            diff = after_arr - before_arr
            # A zero baseline value makes the %-change inf/nan; keep the
            # numeric result but suppress the RuntimeWarning spam.
            with np.errstate(divide="ignore", invalid="ignore"):
                pct_change = 100 * diff / before_arr
            t_stat, p_val = ttest_rel(after_arr, before_arr)

            results.append({
                "metric": key,
                "mean_before": np.mean(before_arr),
                "mean_after": np.mean(after_arr),
                "mean_diff": np.mean(diff),
                "mean_%_change": np.mean(pct_change),
                "p_value": p_val,
                "t_stat": t_stat
            })

    columns = ["metric", "mean_before", "mean_after", "mean_diff",
               "mean_%_change", "p_value", "t_stat"]
    if not results:
        # Guard: sort_values("p_value") raises KeyError on a frame built
        # from an empty list (it has no columns at all).
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(results).sort_values("p_value")
127
128
if __name__ == "__main__":
    # Compare benchmark stats between a baseline ("before") checkout and the
    # current PR ("after") checkout, printing a Markdown summary table.
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
        # raise SystemExit instead of exit(): exit() is a `site`-module
        # convenience and is not available when Python runs with -S.
        raise SystemExit(1)

    before_stats = Path(before_dir) / "stats"
    after_stats = Path(after_dir) / "stats"

    # This may happen if the pull request target does not include PR#399720 yet.
    if not before_stats.exists():
        print("⚠️ Skipping comparison: stats directory is missing in the target commit.")
        raise SystemExit(0)

    # This should never happen, but we're exiting gracefully anyways
    if not after_stats.exists():
        print("⚠️ Skipping comparison: stats directory missing in current PR evaluation.")
        raise SystemExit(0)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    comparison_df = perform_pairwise_tests(before_metrics, after_metrics)
    print(dataframe_to_markdown(comparison_df))