413 lines
14 KiB
Diff
413 lines
14 KiB
Diff
|
diff -pruN a/benchtests/scripts/compare_bench.py b/benchtests/scripts/compare_bench.py
|
||
|
--- a/benchtests/scripts/compare_bench.py 1970-01-01 05:30:00.000000000 +0530
|
||
|
+++ b/benchtests/scripts/compare_bench.py 2015-05-07 15:32:41.843584024 +0530
|
||
|
@@ -0,0 +1,184 @@
|
||
|
+#!/usr/bin/python
|
||
|
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+# This file is part of the GNU C Library.
|
||
|
+#
|
||
|
+# The GNU C Library is free software; you can redistribute it and/or
|
||
|
+# modify it under the terms of the GNU Lesser General Public
|
||
|
+# License as published by the Free Software Foundation; either
|
||
|
+# version 2.1 of the License, or (at your option) any later version.
|
||
|
+#
|
||
|
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+# Lesser General Public License for more details.
|
||
|
+#
|
||
|
+# You should have received a copy of the GNU Lesser General Public
|
||
|
+# License along with the GNU C Library; if not, see
|
||
|
+# <http://www.gnu.org/licenses/>.
|
||
|
+"""Compare two benchmark results
|
||
|
+
|
||
|
+Given two benchmark result files and a threshold, this script compares the
|
||
|
+benchmark results and flags differences in performance beyond a given
|
||
|
+threshold.
|
||
|
+"""
|
||
|
+import sys
|
||
|
+import os
|
||
|
+import pylab
|
||
|
+import import_bench as bench
|
||
|
+
|
||
|
def do_compare(func, var, tl1, tl2, par, threshold):
    """Compare one of the aggregate measurements

    Helper function to compare one of the aggregate measurements of a function
    variant.  The change is expressed as a percentage of the value in the
    first run.  When it exceeds the threshold a line is printed, prefixed with
    '+++' if the value went down and with '---' otherwise.

    Args:
        func: Function name
        var: Function variant name
        tl1: Mapping of aggregate names to values for the first run
        tl2: Mapping of aggregate names to values for the second run
        par: The aggregate to measure (used as the key into tl1 and tl2)
        threshold: The threshold for differences, in percent, beyond which
                   the script should print a warning.
    """
    before = tl1[par]
    after = tl2[par]

    if before:
        # 100.0 keeps this a true division even when the aggregates are
        # integers.
        d = abs(after - before) * 100.0 / before
    elif after:
        # Any change away from a zero baseline is unbounded in relative
        # terms; report it instead of dividing by zero.
        d = float('inf')
    else:
        d = 0.0

    if d > threshold:
        if before > after:
            ind = '+++'
        else:
            ind = '---'
        print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' %
              (ind, func, var, par, d, before, after))
|
||
|
+
|
||
|
+
|
||
|
def compare_runs(pts1, pts2, threshold):
    """Compare two benchmark runs

    For every function variant in pts1 the 'min' and 'mean' aggregates are
    compared through do_compare.  When both runs also carry detailed
    'timings' of the same length, the timings are compared pairwise and each
    pair whose relative difference exceeds the threshold is reported,
    prefixed with '-' if the value went up and with '+' otherwise.

    Args:
        pts1: Timing data from first machine
        pts2: Timing data from second machine
        threshold: The threshold for differences, in percent, beyond which a
                   difference is reported.
    """

    # XXX We assume that the two benchmarks have identical functions and
    # variants.  We cannot compare two benchmarks that may have different
    # functions or variants.  Maybe that is something for the future.
    for func in pts1['functions']:
        for var in pts1['functions'][func]:
            tl1 = pts1['functions'][func][var]
            tl2 = pts2['functions'][func][var]

            # Compare the consolidated numbers
            # do_compare(func, var, tl1, tl2, 'max', threshold)
            do_compare(func, var, tl1, tl2, 'min', threshold)
            do_compare(func, var, tl1, tl2, 'mean', threshold)

            # Skip over to the next variant or function if there is no
            # detailed timing info for the function variant.
            if 'timings' not in tl1 or 'timings' not in tl2:
                continue

            # If two lists do not have the same length then it is likely that
            # the performance characteristics of the function have changed.
            # XXX: It is also likely that there was some measurement that
            # strayed outside the usual range.  Such outliers should not
            # happen on an idle machine with identical hardware and
            # configuration, but ideal environments are hard to come by.
            if len(tl1['timings']) != len(tl2['timings']):
                print('* %s(%s): Timing characteristics changed' %
                      (func, var))
                print('\tBefore: [%s]' %
                      ', '.join([str(x) for x in tl1['timings']]))
                print('\tAfter: [%s]' %
                      ', '.join([str(x) for x in tl2['timings']]))
                continue

            # Report the pairs whose difference crosses the threshold we
            # have set.  The percentage is computed once per pair.
            for t1, t2 in zip(tl1['timings'], tl2['timings']):
                if t1:
                    # 100.0 keeps this a true division for integer timings.
                    d = abs(t2 - t1) * 100.0 / t1
                elif t2:
                    # A change from a zero baseline is unbounded in relative
                    # terms; report it instead of dividing by zero.
                    d = float('inf')
                else:
                    d = 0.0

                if d > threshold:
                    if t2 > t1:
                        ind = '-'
                    else:
                        ind = '+'

                    print("%s %s(%s): (%.2lf%%) from %g to %g" %
                          (ind, func, var, d, t1, t2))
|
||
|
+
|
||
|
+
|
||
|
def plot_graphs(bench1, bench2):
    """Plot graphs for functions

    Make scatter plots for the functions and their variants.  The points of
    the first run are drawn in red and those of the second run in green, and
    the figure is saved as '<func>-<var>.png' (or '<func>.png' when the
    variant name is empty).  Variants without detailed timings in either
    run are skipped.

    Args:
        bench1: Set of points from the first machine
        bench2: Set of points from the second machine.
    """
    for func in bench1['functions']:
        for var in bench1['functions'][func]:
            timings1 = bench1['functions'][func][var].get('timings')
            timings2 = bench2['functions'][func][var].get('timings')

            # No point trying to print a graph if either run has no
            # detailed timings; an empty list would also make the marker
            # size computation below divide by zero.
            if not timings1 or not timings2:
                print('Skipping graph for %s(%s)' % (func, var))
                continue

            pylab.clf()
            pylab.ylabel('Time (cycles)')

            # First set of points in red, second set in green.
            for timings, colour in ((timings1, 'r'), (timings2, 'g')):
                length = len(timings)
                X = [float(x) for x in range(length)]
                # The marker size shrinks as the number of points grows.
                lines = pylab.scatter(X, timings, 1.5 + 100.0 / length)
                pylab.setp(lines, 'color', colour)

            if var:
                filename = "%s-%s.png" % (func, var)
            else:
                filename = "%s.png" % func
            print('Writing out %s' % filename)
            pylab.savefig(filename)
|
||
|
+
|
||
|
+
|
||
|
def main(args):
    """Program Entry Point

    Take two benchmark output files and compare their timings.

    Args:
        args: Command line arguments without the program name: the schema
              file, the two benchmark output files and, optionally, the
              threshold in percent (10.0 when omitted).

    Exits with os.EX_USAGE on a malformed command line and with
    os.EX_DATAERR when the two outputs use different timing types.
    """
    usage = 'Usage: %s <schema> <file1> <file2> [threshold in %%]' % sys.argv[0]

    if len(args) > 4 or len(args) < 3:
        print(usage)
        sys.exit(os.EX_USAGE)

    # Check the threshold before reading any file so that a typo on the
    # command line is reported as a usage error rather than as a traceback.
    threshold = 10.0
    if len(args) == 4:
        try:
            threshold = float(args[3])
        except ValueError:
            print('Invalid threshold: %s' % args[3])
            print(usage)
            sys.exit(os.EX_USAGE)

    bench1 = bench.parse_bench(args[1], args[0])
    bench2 = bench.parse_bench(args[2], args[0])

    if bench1['timing_type'] != bench2['timing_type']:
        print('Cannot compare benchmark outputs: timing types are different')
        # This is a failure; do not let the script exit with status 0.
        sys.exit(os.EX_DATAERR)

    # The graphs are drawn from the full timing lists, so plot before
    # compress_timings replaces them.
    plot_graphs(bench1, bench2)

    bench.compress_timings(bench1)
    bench.compress_timings(bench2)

    compare_runs(bench1, bench2, threshold)
|
||
|
+
|
||
|
+
|
||
|
# Run the comparison only when executed as a script; the program name is
# dropped from the argument list before it is handed to main.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
|
||
|
diff -pruN a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
|
||
|
--- a/benchtests/scripts/import_bench.py 1970-01-01 05:30:00.000000000 +0530
|
||
|
+++ b/benchtests/scripts/import_bench.py 2015-05-07 15:32:41.844584032 +0530
|
||
|
@@ -0,0 +1,141 @@
|
||
|
+#!/usr/bin/python
|
||
|
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+# This file is part of the GNU C Library.
|
||
|
+#
|
||
|
+# The GNU C Library is free software; you can redistribute it and/or
|
||
|
+# modify it under the terms of the GNU Lesser General Public
|
||
|
+# License as published by the Free Software Foundation; either
|
||
|
+# version 2.1 of the License, or (at your option) any later version.
|
||
|
+#
|
||
|
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+# Lesser General Public License for more details.
|
||
|
+#
|
||
|
+# You should have received a copy of the GNU Lesser General Public
|
||
|
+# License along with the GNU C Library; if not, see
|
||
|
+# <http://www.gnu.org/licenses/>.
|
||
|
+"""Functions to import benchmark data and process it"""
|
||
|
+
|
||
|
+import json
|
||
|
+try:
|
||
|
+ import jsonschema as validator
|
||
|
+except ImportError:
|
||
|
+ print('Could not find jsonschema module.')
|
||
|
+ raise
|
||
|
+
|
||
|
+
|
||
|
def mean(lst):
    """Return the arithmetic mean of a list of numbers

    This is implemented here instead of using the numpy average function,
    which has horrible performance.

    Args:
        lst: The list of numbers to average.
    Return:
        The sum of the members divided by the number of members.
    """
    total = sum(lst)
    count = len(lst)
    return total / count
|
||
|
+
|
||
|
+
|
||
|
def split_list(bench, func, var):
    """Split the list into a smaller set of more distinct points

    The timings of the given function variant are replaced, in place, by the
    means of groups of neighbouring points.  Groups are carved off the end of
    the list: the longest suffix whose mean is greater than 3/4 of the
    largest remaining element becomes one group, i.e. the largest element of
    a group is less than 4/3 times the mean of that group.  The last element
    is taken to be the largest, so the list must be in ascending order
    (parse_bench sorts it).

    The same kind of aggregation can be built around the smallest element
    instead.  Group together points such that the difference between the
    smallest point and the mean is less than 1/3rd of the mean.  This means
    that the mean is at most 1.5x the smallest member of that group.

    mean - xmin < mean / 3
    i.e. 2 * mean / 3 < xmin
    i.e. mean < 3 * xmin / 2

    For an evenly distributed group, the largest member will be less than
    twice the smallest member of the group.
    Derivation:

    An evenly distributed series would be xmin, xmin + d, xmin + 2d...

    mean = (2 * n * xmin + n * (n - 1) * d) / (2 * n)
    and max element is xmin + (n - 1) * d

    Now, mean < 3 * xmin / 2

    3 * xmin > 2 * mean
    3 * xmin > (2 * n * xmin + n * (n - 1) * d) / n
    3 * n * xmin > 2 * n * xmin + n * (n - 1) * d
    n * xmin > n * (n - 1) * d
    xmin > (n - 1) * d
    2 * xmin > xmin + (n-1) * d
    2 * xmin > xmax

    Hence, proved.

    Similarly, it is trivial to prove that for the aggregation by the
    maximum element, which is what is implemented here, the maximum element
    in the group must be at most 4/3 times the mean.

    Args:
        bench: The benchmark object
        func: The function name
        var: The function variant name
    """
    means = []
    lst = bench['functions'][func][var]['timings']
    while lst:
        biggest = lst[-1]
        for i in range(len(lst)):
            tail = lst[i:]
            # Same arithmetic as mean(), kept inline.
            avg = sum(tail) / len(tail)
            if avg > 0.75 * biggest:
                break
        else:
            # No suffix qualified.  This only happens when the largest
            # remaining value is not positive, in which case the strict
            # comparison above can never hold and the loop would otherwise
            # spin forever.  Fold whatever is left into a single group.
            i = 0
            avg = sum(lst) / len(lst)
        means.insert(0, avg)
        lst = lst[:i]
    bench['functions'][func][var]['timings'] = means
|
||
|
+
|
||
|
+
|
||
|
def do_for_all_timings(bench, callback):
    """Invoke a callback for every function variant that has timings

    Variants whose entry has no 'timings' key are passed over.

    Args:
        bench: The benchmark object
        callback: The callback function; it is called as
                  callback(bench, function name, variant name)
    """
    functions = bench['functions']
    for name in functions:
        for variant in functions[name]:
            if 'timings' in functions[name][variant]:
                callback(bench, name, variant)
|
||
|
+
|
||
|
+
|
||
|
def compress_timings(points):
    """Replace groups of close enough timings by their mean value

    split_list is applied to every function variant that has timings; see
    split_list for details on how the grouping is done.

    Args:
        points: The set of points.
    """
    do_for_all_timings(bench=points, callback=split_list)
|
||
|
+
|
||
|
+
|
||
|
def parse_bench(filename, schema_filename):
    """Parse the input file

    Load the json file containing the benchmark outputs, validate it against
    the json schema and sort the timings list of every function variant that
    has one.

    Args:
        filename: Name of the benchmark output file.
        schema_filename: Name of the json schema file to validate against.
    Return:
        The bench dictionary.
    """
    def sort_timings(data, func, var):
        # In-place sort of the timings of one function variant.
        data['functions'][func][var]['timings'].sort()

    # The schema is read first, then the benchmark output.
    with open(schema_filename, 'r') as schema_stream:
        schema = json.load(schema_stream)
    with open(filename, 'r') as bench_stream:
        results = json.load(bench_stream)

    validator.validate(results, schema)
    do_for_all_timings(results, sort_timings)
    return results
|
||
|
diff -pruN a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
|
||
|
--- a/benchtests/scripts/validate_benchout.py 2015-05-07 11:58:40.000000000 +0530
|
||
|
+++ b/benchtests/scripts/validate_benchout.py 2015-05-07 15:32:41.844584032 +0530
|
||
|
@@ -27,37 +27,26 @@ import sys
|
||
|
import os
|
||
|
|
||
|
try:
|
||
|
- import jsonschema
|
||
|
+ import import_bench as bench
|
||
|
except ImportError:
|
||
|
- print('Could not find jsonschema module. Output not validated.')
|
||
|
+ print('Import Error: Output will not be validated.')
|
||
|
# Return success because we don't want the bench target to fail just
|
||
|
# because the jsonschema module was not found.
|
||
|
sys.exit(os.EX_OK)
|
||
|
|
||
|
|
||
|
-def validate_bench(benchfile, schemafile):
|
||
|
- """Validate benchmark file
|
||
|
-
|
||
|
- Validate a benchmark output file against a JSON schema.
|
||
|
+def print_and_exit(message, exitcode):
|
||
|
+ """Prints message to stderr and returns the exit code.
|
||
|
|
||
|
Args:
|
||
|
- benchfile: The file name of the bench.out file.
|
||
|
- schemafile: The file name of the JSON schema file to validate
|
||
|
- bench.out against.
|
||
|
+ message: The message to print
|
||
|
+ exitcode: The exit code to return
|
||
|
|
||
|
- Exceptions:
|
||
|
- jsonschema.ValidationError: When bench.out is not valid
|
||
|
- jsonschema.SchemaError: When the JSON schema is not valid
|
||
|
- IOError: If any of the files are not found.
|
||
|
+ Returns:
|
||
|
+ The passed exit code
|
||
|
"""
|
||
|
- with open(benchfile, 'r') as bfile:
|
||
|
- with open(schemafile, 'r') as sfile:
|
||
|
- bench = json.load(bfile)
|
||
|
- schema = json.load(sfile)
|
||
|
- jsonschema.validate(bench, schema)
|
||
|
-
|
||
|
- # If we reach here, we're all good.
|
||
|
- print("Benchmark output in %s is valid." % benchfile)
|
||
|
+ print(message, file=sys.stderr)
|
||
|
+ return exitcode
|
||
|
|
||
|
|
||
|
def main(args):
|
||
|
@@ -73,11 +62,23 @@ def main(args):
|
||
|
Exceptions thrown by validate_bench
|
||
|
"""
|
||
|
if len(args) != 2:
|
||
|
- print("Usage: %s <bench.out file> <bench.out schema>" % sys.argv[0],
|
||
|
- file=sys.stderr)
|
||
|
- return os.EX_USAGE
|
||
|
+ return print_and_exit("Usage: %s <bench.out file> <bench.out schema>"
|
||
|
+ % sys.argv[0], os.EX_USAGE)
|
||
|
+
|
||
|
+ try:
|
||
|
+ bench.parse_bench(args[0], args[1])
|
||
|
+ except IOError as e:
|
||
|
+ return print_and_exit("IOError(%d): %s" % (e.errno, e.strerror),
|
||
|
+ os.EX_OSFILE)
|
||
|
+
|
||
|
+ except bench.validator.ValidationError as e:
|
||
|
+ return print_and_exit("Invalid benchmark output: %s" % e.message,
|
||
|
+ os.EX_DATAERR)
|
||
|
+
|
||
|
+ except bench.validator.SchemaError as e:
|
||
|
+ return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
|
||
|
|
||
|
- validate_bench(args[0], args[1])
|
||
|
+ print("Benchmark output in %s is valid." % args[0])
|
||
|
return os.EX_OK
|
||
|
|
||
|
|