My 1 billion row challenge solutions in various languages

More accurate file size estimate

Changed files
+9 -13
src
+9 -13
src/main/python/create_measurements.py
···
"""
Tries to estimate how large a file the test data will be
"""
-
max_string = float('-inf')
-
min_string = float('inf')
-
per_record_size = 0
-
record_size_unit = "bytes"
+
total_name_bytes = sum(len(s.encode("utf-8")) for s in weather_station_names)
+
avg_name_bytes = total_name_bytes / float(len(weather_station_names))
-
for station in weather_station_names:
-
if len(station) > max_string:
-
max_string = len(station)
-
if len(station) < min_string:
-
min_string = len(station)
-
per_record_size = ((max_string + min_string * 2) + len(",-123.4")) / 2
+
# avg_temp_bytes = sum(len(str(n / 10)) for n in range(-999, 1000)) / 1999
+
avg_temp_bytes = 4.400200100050025
+
+
# add 2 for separator and newline
+
avg_line_length = avg_name_bytes + avg_temp_bytes + 2
-
total_file_size = num_rows_to_create * per_record_size
-
human_file_size = convert_bytes(total_file_size)
+
human_file_size = convert_bytes(num_rows_to_create * avg_line_length)
-
return f"Estimated max file size is: {human_file_size}.\nTrue size is probably much smaller (around half)."
+
return f"Estimated max file size is: {human_file_size}."
def build_test_data(weather_station_names, num_rows_to_create):